From c69adc929a52eed6c4d428e1e84a1cd18caecdcf Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 11:06:01 -0400
Subject: [PATCH 1/8] CDL: arxiv citation for rust2024PrivacyAwareSign

---
 src/references.bib | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/references.bib b/src/references.bib
index f6b44d2..de00c2b 100644
--- a/src/references.bib
+++ b/src/references.bib
@@ -2191,3 +2191,12 @@ @misc{gongLLMsAreGood2024
   archiveprefix = {arxiv},
   langid        = {english}
 }
+
+@misc{rust2024PrivacyAwareSign,
+      title={Towards Privacy-Aware Sign Language Translation at Scale},
+      author={Phillip Rust and Bowen Shi and Skyler Wang and Necati Cihan Camgöz and Jean Maillard},
+      year={2024},
+      eprint={2402.09611},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
\ No newline at end of file

From 0303dc41213b1c13ff2f8877856befcde4cdd2cd Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 11:44:45 -0400
Subject: [PATCH 2/8] CDL: rust2024PrivacyAwareSign citations for Hiera,
 Youtube-ASL, T5

---
 src/references.bib | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/references.bib b/src/references.bib
index de00c2b..c6cc272 100644
--- a/src/references.bib
+++ b/src/references.bib
@@ -2199,4 +2199,43 @@ @misc{rust2024PrivacyAwareSign
       eprint={2402.09611},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
 }
+
+@inproceedings{dataset:uthus2023YoutubeASL,
+ author = {Uthus, Dave and Tanzer, Garrett and Georg, Manfred},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
+ pages = {29029--29047},
+ publisher = {Curran Associates, Inc.},
+ title = {YouTube-ASL: A Large-Scale, Open-Domain American Sign Language-English Parallel Corpus},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5c61452daca5f0c260e683b317d13a3f-Paper-Datasets_and_Benchmarks.pdf},
+ volume = {36},
+ year = {2023}
+}
+
+
+@InProceedings{ryali2023HieraVisionTransformer,
+  title = {Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles},
+  author = {Ryali, Chaitanya and Hu, Yuan-Ting and Bolya, Daniel and Wei, Chen and Fan, Haoqi and Huang, Po-Yao and Aggarwal, Vaibhav and Chowdhury, Arkabandhu and Poursaeed, Omid and Hoffman, Judy and Malik, Jitendra and Li, Yanghao and Feichtenhofer, Christoph},
+  booktitle = {Proceedings of the 40th International Conference on Machine Learning},
+  pages = {29441--29454},
+  year = {2023},
+  editor = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan},
+  volume = {202},
+  series = {Proceedings of Machine Learning Research},
+  month = {23--29 Jul},
+  publisher = {PMLR},
+  pdf = {https://proceedings.mlr.press/v202/ryali23a/ryali23a.pdf},
+  url = {https://proceedings.mlr.press/v202/ryali23a.html}
+}
+
+@article{raffel2020T5Transformer,
+  author  = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+  title   = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+  journal = {Journal of Machine Learning Research},
+  year    = {2020},
+  volume  = {21},
+  number  = {140},
+  pages   = {1--67},
+  url     = {http://jmlr.org/papers/v21/20-074.html}
+}
\ No newline at end of file

From 2d4c9ec4be0d512f63ab07adb1309cb5de4833af Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 11:51:50 -0400
Subject: [PATCH 3/8] CDL: adding citation for BLEURT

---
 src/references.bib | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/references.bib b/src/references.bib
index c6cc272..d4f5826 100644
--- a/src/references.bib
+++ b/src/references.bib
@@ -2238,4 +2238,23 @@ @article{raffel2020T5Transformer
   number  = {140},
   pages   = {1--67},
   url     = {http://jmlr.org/papers/v21/20-074.html}
-}
\ No newline at end of file
+}
+
+@inproceedings{sellam-etal-2020-bleurt,
+    title = "{BLEURT}: Learning Robust Metrics for Text Generation",
+    author = "Sellam, Thibault and
+      Das, Dipanjan and
+      Parikh, Ankur",
+    editor = "Jurafsky, Dan and
+      Chai, Joyce and
+      Schluter, Natalie and
+      Tetreault, Joel",
+    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
+    month = jul,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2020.acl-main.704",
+    doi = "10.18653/v1/2020.acl-main.704",
+    pages = "7881--7892"
+}

From 597146b34a93fa7920f874c643a2212f02116e65 Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 12:06:12 -0400
Subject: [PATCH 4/8] CDL: initial very rough summary for
 rust2024PrivacyAwareSign

---
 src/index.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/index.md b/src/index.md
index 3a4cbdd..61367f7 100644
--- a/src/index.md
+++ b/src/index.md
@@ -745,6 +745,14 @@ In training, the VQ-Sign "character-level" module is trained with a context pred
 
 The framework achieves state-of-the-art results on the RWTH-PHOENIX-Weather-2014T [@cihan2018neural] and CSL-Daily [@dataset:huang2018video] datasets without relying on gloss annotations.
 
+
+
+
+
+@rust2024PrivacyAwareSign introduce SSVP-SLT, a privacy-aware framework for training sign language translation at scale. They first pretrain a video transformer on a number of large-scale sign language datasets [@dataset:uthus2023YoutubeASL; @dataset:duarte2020how2sign] without parallel text, then finetune on a target parallel translation dataset. They release SignHiera, a Hiera vision transformer [@ryali2023HieraVisionTransformer] pretrained in this way. For the text translation task they leverage T5 [@raffel2020T5Transformer]. In addition they release a new dataset they call DailyMoth-70h.
+
+
 
 #### Text-to-Video
 

From ca9c3458c51fe2a31b336c84a0013704edce6e53 Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 13:08:06 -0400
Subject: [PATCH 5/8] CDL: rust2024PrivacyAwareSign summary v2

---
 src/index.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/index.md b/src/index.md
index 61367f7..e0d4104 100644
--- a/src/index.md
+++ b/src/index.md
@@ -747,11 +747,15 @@ The framework achieves state-of-the-art results on the RWTH-PHOENIX-Weather-2014
 
 
 
-
-
-
-@rust2024PrivacyAwareSign introduce SSVP-SLT, a privacy-aware framework for training sign language translation at scale. They first pretrain a video transformer on a number of large-scale sign language datasets [@dataset:uthus2023YoutubeASL; @dataset:duarte2020how2sign] without parallel text, then finetune on a target parallel translation dataset. They release SignHiera, a Hiera vision transformer [@ryali2023HieraVisionTransformer] pretrained in this way. For the text translation task they leverage T5 [@raffel2020T5Transformer]. In addition they release a new dataset they call DailyMoth-70h.
+@rust2024PrivacyAwareSign introduce a privacy-aware method for sign language translation at scale which they call Self-Supervised Video Pretraining for Sign Language Translation (SSVP-SLT).
+SSVP-SLT is a two-stage method: they first pretrain a vision transformer [@ryali2023HieraVisionTransformer] with a self-supervised task on large unannotated video datasets [@dataset:uthus2023YoutubeASL; @dataset:duarte2020how2sign].
+In the second stage they freeze their vision model and project its outputs into a multilingual LLM (T5; @raffel2020T5Transformer), which they finetune for translation on the How2Sign dataset [@dataset:duarte2020how2sign].
+They address privacy concerns by face-blurring during training.
+They release their pretrained vision model, SignHiera, based on a Hiera vision transformer [@ryali2023HieraVisionTransformer].
+In addition they release a new dataset they call DailyMoth-70h, containing video data from the Daily Moth, a Deaf news site.
+The model achieves state-of-the-art results on the How2Sign dataset [@dataset:duarte2020how2sign].
+
 
 
 
 #### Text-to-Video
@@ -982,6 +986,7 @@ sign language resources are scarce and, currently only support translation and p
 Unfortunately, most of the sign language corpora discussed in the literature are either not available for use or available under heavy restrictions and licensing terms.
 Furthermore, sign language data is especially challenging to anonymize due to the importance of facial and other physical features in signing videos, limiting its open distribution.
 Developing anonymization with minimal information loss or accurate anonymous representations is a promising research direction.
+
 
 ### Collect Real-World Data
 

From cd788625449ac7b11a3db3c07f089e5466318c17 Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 13:28:49 -0400
Subject: [PATCH 6/8] CDL: rust2024PrivacyAwareSign summary v3

---
 src/index.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/index.md b/src/index.md
index e0d4104..5d7fcdb 100644
--- a/src/index.md
+++ b/src/index.md
@@ -747,13 +747,12 @@ The framework achieves state-of-the-art results on the RWTH-PHOENIX-Weather-2014
 
 
 
-@rust2024PrivacyAwareSign introduce a privacy-aware method for sign language translation at scale which they call Self-Supervised Video Pretraining for Sign Language Translation (SSVP-SLT).
-SSVP-SLT is a two-stage method: they first pretrain a vision transformer [@ryali2023HieraVisionTransformer] with a self-supervised task on large unannotated video datasets [@dataset:uthus2023YoutubeASL; @dataset:duarte2020how2sign].
-In the second stage they freeze their vision model and project its outputs into a multilingual LLM (T5; @raffel2020T5Transformer), which they finetune for translation on the How2Sign dataset [@dataset:duarte2020how2sign].
-They address privacy concerns by face-blurring during training.
-They release their pretrained vision model, SignHiera, based on a Hiera vision transformer [@ryali2023HieraVisionTransformer].
-In addition they release a new dataset they call DailyMoth-70h, containing video data from the Daily Moth, a Deaf news site.
-The model achieves state-of-the-art results on the How2Sign dataset [@dataset:duarte2020how2sign].
+@rust2024PrivacyAwareSign introduce a two-stage privacy-aware method for sign language translation (SLT) at scale, termed Self-Supervised Video Pretraining for Sign Language Translation (SSVP-SLT).
+The first stage involves self-supervised pretraining of a Hiera vision transformer on large unannotated video datasets [@ryali2023HieraVisionTransformer; @dataset:uthus2023YoutubeASL].
+In the second stage, the vision model's outputs are fed into a multilingual language model (T5) for finetuning on the How2Sign dataset [@raffel2020T5Transformer; @dataset:duarte2020how2sign].
+To mitigate privacy risks, the framework employs facial obfuscation.
+Additionally, the authors release DailyMoth-70h, a new 70-hour ASL dataset from [The Daily Moth](https://www.dailymoth.com/).
+SSVP-SLT achieves state-of-the-art performance on How2Sign [@dataset:duarte2020how2sign].
 
 
 

From f8910f3bc3ad9b72130f8beb6bcd4c47288a9aeb Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Mon, 27 May 2024 17:40:48 -0400
Subject: [PATCH 7/8] CDL: Citation fixes and expanding on 'facial obfuscation'

---
 src/index.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/index.md b/src/index.md
index 70265dc..9c26062 100644
--- a/src/index.md
+++ b/src/index.md
@@ -749,10 +749,12 @@ The framework achieves state-of-the-art results on the RWTH-PHOENIX-Weather-2014
 
 @rust2024PrivacyAwareSign introduce a two-stage privacy-aware method for sign language translation (SLT) at scale, termed Self-Supervised Video Pretraining for Sign Language Translation (SSVP-SLT).
-The first stage involves self-supervised pretraining of a Hiera vision transformer on large unannotated video datasets [@ryali2023HieraVisionTransformer; @dataset:uthus2023YoutubeASL].
-In the second stage, the vision model's outputs are fed into a multilingual language model (T5) for finetuning on the How2Sign dataset [@raffel2020T5Transformer; @dataset:duarte2020how2sign].
-To mitigate privacy risks, the framework employs facial obfuscation.
-Additionally, the authors release DailyMoth-70h, a new 70-hour ASL dataset from [The Daily Moth](https://www.dailymoth.com/).
+The first stage involves self-supervised pretraining of a Hiera vision transformer [@ryali2023HieraVisionTransformer] on large unannotated video datasets [@dataset:duarte2020how2sign; @dataset:uthus2023YoutubeASL].
+In the second stage, the vision model's outputs are fed into a multilingual language model [@raffel2020T5Transformer] for finetuning on the How2Sign dataset [@dataset:duarte2020how2sign].
+To mitigate privacy risks, the framework employs facial blurring during pretraining.
+They find that while pretraining with blurring hurts performance, some of it can be recovered when finetuning on unblurred data.
+They conclude that SLT models can be pretrained in a privacy-aware manner without sacrificing too much performance.
+Additionally, the authors release DailyMoth-70h, a new 70-hour ASL dataset from [The Daily Moth](https://www.dailymoth.com/).
 SSVP-SLT achieves state-of-the-art performance on How2Sign [@dataset:duarte2020how2sign].
 
 
 
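The "facial blurring during pretraining" that patch 7 expands on amounts to a per-frame preprocessing step applied before the self-supervised stage. The sketch below illustrates the general idea only: the paper does not specify a face detector, so the stock OpenCV Haar cascade and the blur kernel used here are assumptions, not the authors' pipeline.

```python
# Illustrative per-frame face blurring, NOT the SSVP-SLT implementation.
# The Haar-cascade detector and the kernel size are assumed for illustration.
import cv2

cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

def blur_faces(frame):
    """Return a copy of a BGR frame with detected face regions Gaussian-blurred."""
    out = frame.copy()
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    for (x, y, w, h) in cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5):
        # Kernel dimensions must be odd; a large kernel removes identifying detail.
        out[y:y + h, x:x + w] = cv2.GaussianBlur(out[y:y + h, x:x + w], (51, 51), 0)
    return out
```

Blurring every frame this way removes identifying facial detail while leaving hands and body motion intact, which is consistent with the performance drop the summary reports: facial expressions carry grammatical information in sign languages, so obscuring them costs signal.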
From b6123db2047be1c8066571664d2d171a08893e7f Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Tue, 28 May 2024 10:18:51 -0400
Subject: [PATCH 8/8] CDL: rearranging sentences in rust2024PrivacyAwareSign

---
 src/index.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/index.md b/src/index.md
index 8229e26..8e27711 100644
--- a/src/index.md
+++ b/src/index.md
@@ -773,9 +773,10 @@ The first stage involves self-supervised pretraining of a Hiera vision transform
 In the second stage, the vision model's outputs are fed into a multilingual language model [@raffel2020T5Transformer] for finetuning on the How2Sign dataset [@dataset:duarte2020how2sign].
 To mitigate privacy risks, the framework employs facial blurring during pretraining.
 They find that while pretraining with blurring hurts performance, some of it can be recovered when finetuning on unblurred data.
+SSVP-SLT achieves state-of-the-art performance on How2Sign [@dataset:duarte2020how2sign].
 They conclude that SLT models can be pretrained in a privacy-aware manner without sacrificing too much performance.
 Additionally, the authors release DailyMoth-70h, a new 70-hour ASL dataset from [The Daily Moth](https://www.dailymoth.com/).
-SSVP-SLT achieves state-of-the-art performance on How2Sign [@dataset:duarte2020how2sign].
+
 
 
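The two-stage design in the final summary (frozen self-supervised video features projected into a multilingual language model) can be pictured with a short sketch. This is an illustration under stated assumptions, not the released SSVP-SLT code: the random tensor stands in for SignHiera features, and the single linear projection and the feature shapes are assumed.

```python
# Minimal sketch of stage 2: project frozen video features into T5 and finetune.
# NOT the authors' implementation; the projection layer and shapes are assumed.
import torch
import torch.nn as nn
from transformers import T5ForConditionalGeneration, T5TokenizerFast

t5 = T5ForConditionalGeneration.from_pretrained("t5-base")
tok = T5TokenizerFast.from_pretrained("t5-base")

vision_dim = 768                           # assumed width of the frozen encoder
proj = nn.Linear(vision_dim, t5.config.d_model)

# Stand-in for per-clip outputs of the frozen stage-1 video encoder (SignHiera).
feats = torch.randn(2, 64, vision_dim)     # (batch, clips, vision_dim)

targets = ["the weather is nice today", "hello everyone"]
labels = tok(targets, padding=True, return_tensors="pt").input_ids
labels[labels == tok.pad_token_id] = -100  # padding tokens are ignored by the loss

# T5 accepts pre-computed embeddings via `inputs_embeds`; gradients flow into
# `proj` and the T5 weights, while the vision encoder itself stays frozen.
loss = t5(inputs_embeds=proj(feats), labels=labels).loss
loss.backward()
```

Finetuning would iterate this loss over How2Sign-style video-text pairs; blurred or unblurred frames enter only through the features that the frozen encoder produces.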