From c69adc929a52eed6c4d428e1e84a1cd18caecdcf Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 11:06:01 -0400
Subject: [PATCH 1/8] CDL: arxiv citation for rust2024PrivacyAwareSign

---
 src/references.bib | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/references.bib b/src/references.bib
index f6b44d2..de00c2b 100644
--- a/src/references.bib
+++ b/src/references.bib
@@ -2191,3 +2191,12 @@ @misc{gongLLMsAreGood2024
   archiveprefix = {arxiv},
   langid        = {english}
 }
+
+@misc{rust2024PrivacyAwareSign,
+      title={Towards Privacy-Aware Sign Language Translation at Scale},
+      author={Phillip Rust and Bowen Shi and Skyler Wang and Necati Cihan Camgöz and Jean Maillard},
+      year={2024},
+      eprint={2402.09611},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
\ No newline at end of file

From 0303dc41213b1c13ff2f8877856befcde4cdd2cd Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 11:44:45 -0400
Subject: [PATCH 2/8] CDL: rust2024PrivacyAwareSign citations for Hiera,
 Youtube-ASL, T5

---
 src/references.bib | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/references.bib b/src/references.bib
index de00c2b..c6cc272 100644
--- a/src/references.bib
+++ b/src/references.bib
@@ -2199,4 +2199,43 @@ @misc{rust2024PrivacyAwareSign
       eprint={2402.09611},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
 }
+
+@inproceedings{dataset:uthus2023YoutubeASL,
+ author = {Uthus, Dave and Tanzer, Garrett and Georg, Manfred},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
+ pages = {29029--29047},
+ publisher = {Curran Associates, Inc.},
+ title = {YouTube-ASL: A Large-Scale, Open-Domain American Sign Language-English Parallel Corpus},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5c61452daca5f0c260e683b317d13a3f-Paper-Datasets_and_Benchmarks.pdf},
+ volume = {36},
+ year = {2023}
+}
+
+
+@InProceedings{ryali2023HieraVisionTransformer,
+  title = {Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles},
+  author = {Ryali, Chaitanya and Hu, Yuan-Ting and Bolya, Daniel and Wei, Chen and Fan, Haoqi and Huang, Po-Yao and Aggarwal, Vaibhav and Chowdhury, Arkabandhu and Poursaeed, Omid and Hoffman, Judy and Malik, Jitendra and Li, Yanghao and Feichtenhofer, Christoph},
+  booktitle = {Proceedings of the 40th International Conference on Machine Learning},
+  pages = {29441--29454},
+  year = {2023},
+  editor = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan},
+  volume = {202},
+  series = {Proceedings of Machine Learning Research},
+  month = {23--29 Jul},
+  publisher = {PMLR},
+  pdf = {https://proceedings.mlr.press/v202/ryali23a/ryali23a.pdf},
+  url = {https://proceedings.mlr.press/v202/ryali23a.html}
+}
+
+@article{raffel2020T5Transformer,
+  author  = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+  title   = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+  journal = {Journal of Machine Learning Research},
+  year    = {2020},
+  volume  = {21},
+  number  = {140},
+  pages   = {1--67},
+  url     = {http://jmlr.org/papers/v21/20-074.html}
+}
\ No newline at end of file

From 2d4c9ec4be0d512f63ab07adb1309cb5de4833af Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 11:51:50 -0400
Subject: [PATCH 3/8] CDL: adding citation for BLEURT

---
 src/references.bib | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/references.bib b/src/references.bib
index c6cc272..d4f5826 100644
--- a/src/references.bib
+++ b/src/references.bib
@@ -2238,4 +2238,23 @@ @article{raffel2020T5Transformer
   number  = {140},
   pages   = {1--67},
   url     = {http://jmlr.org/papers/v21/20-074.html}
-}
\ No newline at end of file
+}
+
+@inproceedings{sellam-etal-2020-bleurt,
+    title = "{BLEURT}: Learning Robust Metrics for Text Generation",
+    author = "Sellam, Thibault and
+      Das, Dipanjan and
+      Parikh, Ankur",
+    editor = "Jurafsky, Dan and
+      Chai, Joyce and
+      Schluter, Natalie and
+      Tetreault, Joel",
+    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
+    month = jul,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2020.acl-main.704",
+    doi = "10.18653/v1/2020.acl-main.704",
+    pages = "7881--7892"
+}

From 597146b34a93fa7920f874c643a2212f02116e65 Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 12:06:12 -0400
Subject: [PATCH 4/8] CDL: initial very rough summary for
 rust2024PrivacyAwareSign

---
 src/index.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/index.md b/src/index.md
index 3a4cbdd..61367f7 100644
--- a/src/index.md
+++ b/src/index.md
@@ -745,6 +745,14 @@ In training, the VQ-Sign "character-level" module is trained with a context pred
 
 The framework achieves state-of-the-art results on the RWTH-PHOENIX-Weather-2014T [@cihan2018neural] and CSL-Daily [@dataset:huang2018video] datasets without relying on gloss annotations.
 
+
+
+
+
+@rust2024PrivacyAwareSign introduce SSVP-SLT, a privacy-aware framework for training sign language translation at scale. They first pretrain a video transformer on a number of large-scale sign language datasets [@dataset:uthus2023YoutubeASL; @dataset:duarte2020how2sign] without parallel text, then finetune on a target parallel translation dataset. They release SignHiera, a Hiera vision transformer [@ryali2023HieraVisionTransformer] pretrained in this way. For the text translation task they leverage T5 [@raffel2020T5Transformer]. In addition they release a new dataset they call DailyMoth-70h.
+
+
 
 #### Text-to-Video
 

From ca9c3458c51fe2a31b336c84a0013704edce6e53 Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 13:08:06 -0400
Subject: [PATCH 5/8] CDL: rust2024PrivacyAwareSign summary v2

---
 src/index.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/index.md b/src/index.md
index 61367f7..e0d4104 100644
--- a/src/index.md
+++ b/src/index.md
@@ -747,11 +747,15 @@ The framework achieves state-of-the-art results on the RWTH-PHOENIX-Weather-2014
 
 
 
-
-
-
-@rust2024PrivacyAwareSign introduce SSVP-SLT, a privacy-aware framework for training sign language translation at scale. They first pretrain a video transformer on a number of large-scale sign language datasets [@dataset:uthus2023YoutubeASL; @dataset:duarte2020how2sign] without parallel text, then finetune on a target parallel translation dataset. They release SignHiera, a Hiera vision transformer [@ryali2023HieraVisionTransformer] pretrained in this way. For the text translation task they leverage T5 [@raffel2020T5Transformer]. In addition they release a new dataset they call DailyMoth-70h.
+@rust2024PrivacyAwareSign introduce a privacy-aware method for sign language translation at scale which they call Self-Supervised Video Pretraining for Sign Language Translation (SSVP-SLT).
+SSVP-SLT is a two-stage method: they first pretrain a vision transformer [@ryali2023HieraVisionTransformer] with a self-supervised task on large unannotated video datasets [@dataset:uthus2023YoutubeASL; @dataset:duarte2020how2sign].
+In the second stage they freeze their vision model and project its outputs into a multilingual LLM (T5; @raffel2020T5Transformer), which they finetune for translation on the How2Sign dataset [@dataset:duarte2020how2sign].
+They address privacy concerns by face-blurring during training.
+They release their pretrained vision model, SignHiera, based on a Hiera vision transformer [@ryali2023HieraVisionTransformer].
+In addition they release a new dataset they call DailyMoth-70h, containing video data from the Daily Moth, a Deaf news site.
+The model achieves state-of-the-art results on the How2Sign dataset [@dataset:duarte2020how2sign].
+
 
 
 
 #### Text-to-Video
@@ -982,6 +986,7 @@ sign language resources are scarce and, currently only support translation and p
 Unfortunately, most of the sign language corpora discussed in the literature are either not available for use or available under heavy restrictions and licensing terms.
 Furthermore, sign language data is especially challenging to anonymize due to the importance of facial and other physical features in signing videos, limiting its open distribution.
 Developing anonymization with minimal information loss or accurate anonymous representations is a promising research direction.
+
 
 ### Collect Real-World Data
 

From cd788625449ac7b11a3db3c07f089e5466318c17 Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Thu, 23 May 2024 13:28:49 -0400
Subject: [PATCH 6/8] CDL: rust2024PrivacyAwareSign summary v3

---
 src/index.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/index.md b/src/index.md
index e0d4104..5d7fcdb 100644
--- a/src/index.md
+++ b/src/index.md
@@ -747,13 +747,12 @@ The framework achieves state-of-the-art results on the RWTH-PHOENIX-Weather-2014
 
 
 
-@rust2024PrivacyAwareSign introduce a privacy-aware method for sign language translation at scale which they call Self-Supervised Video Pretraining for Sign Language Translation (SSVP-SLT).
-SSVP-SLT is a two-stage method: they first pretrain a vision transformer [@ryali2023HieraVisionTransformer] with a self-supervised task on large unannotated video datasets [@dataset:uthus2023YoutubeASL; @dataset:duarte2020how2sign].
-In the second stage they freeze their vision model and project its outputs into a multilingual LLM (T5; @raffel2020T5Transformer), which they finetune for translation on the How2Sign dataset [@dataset:duarte2020how2sign].
-They address privacy concerns by face-blurring during training.
-They release their pretrained vision model, SignHiera, based on a Hiera vision transformer [@ryali2023HieraVisionTransformer].
-In addition they release a new dataset they call DailyMoth-70h, containing video data from the Daily Moth, a Deaf news site.
-The model achieves state-of-the-art results on the How2Sign dataset [@dataset:duarte2020how2sign].
+@rust2024PrivacyAwareSign introduce a two-stage privacy-aware method for sign language translation (SLT) at scale, termed Self-Supervised Video Pretraining for Sign Language Translation (SSVP-SLT).
+The first stage involves self-supervised pretraining of a Hiera vision transformer on large unannotated video datasets [@ryali2023HieraVisionTransformer; @dataset:uthus2023YoutubeASL].
+In the second stage, the vision model's outputs are fed into a multilingual language model (T5) for finetuning on the How2Sign dataset [@raffel2020T5Transformer; @dataset:duarte2020how2sign].
+To mitigate privacy risks, the framework employs facial obfuscation.
+Additionally, the authors release DailyMoth-70h, a new 70-hour ASL dataset from [The Daily Moth](https://www.dailymoth.com/).
+SSVP-SLT achieves state-of-the-art performance on How2Sign [@dataset:duarte2020how2sign].
 
 
 

From f8910f3bc3ad9b72130f8beb6bcd4c47288a9aeb Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Mon, 27 May 2024 17:40:48 -0400
Subject: [PATCH 7/8] CDL: Citation fixes and expanding on 'facial obfuscation'

---
 src/index.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/index.md b/src/index.md
index 70265dc..9c26062 100644
--- a/src/index.md
+++ b/src/index.md
@@ -749,10 +749,12 @@ The framework achieves state-of-the-art results on the RWTH-PHOENIX-Weather-2014
 
 @rust2024PrivacyAwareSign introduce a two-stage privacy-aware method for sign language translation (SLT) at scale, termed Self-Supervised Video Pretraining for Sign Language Translation (SSVP-SLT).
-The first stage involves self-supervised pretraining of a Hiera vision transformer on large unannotated video datasets [@ryali2023HieraVisionTransformer; @dataset:uthus2023YoutubeASL].
-In the second stage, the vision model's outputs are fed into a multilingual language model (T5) for finetuning on the How2Sign dataset [@raffel2020T5Transformer; @dataset:duarte2020how2sign].
-To mitigate privacy risks, the framework employs facial obfuscation.
-Additionally, the authors release DailyMoth-70h, a new 70-hour ASL dataset from [The Daily Moth](https://www.dailymoth.com/).
+The first stage involves self-supervised pretraining of a Hiera vision transformer [@ryali2023HieraVisionTransformer] on large unannotated video datasets [@dataset:duarte2020how2sign; @dataset:uthus2023YoutubeASL].
+In the second stage, the vision model's outputs are fed into a multilingual language model [@raffel2020T5Transformer] for finetuning on the How2Sign dataset [@dataset:duarte2020how2sign].
+To mitigate privacy risks, the framework employs facial blurring during pretraining.
+They find that while pretraining with blurring hurts performance, some of it can be recovered when finetuning on unblurred data.
+They conclude that SLT models can be pretrained in a privacy-aware manner without sacrificing too much performance.
+Additionally, the authors release DailyMoth-70h, a new 70-hour ASL dataset from [The Daily Moth](https://www.dailymoth.com/).
 SSVP-SLT achieves state-of-the-art performance on How2Sign [@dataset:duarte2020how2sign].
 
 
 
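The "facial blurring during pretraining" that patch 7 expands on amounts to a per-frame preprocessing step applied before the self-supervised stage. The sketch below illustrates the general idea only: the paper does not specify a face detector, so the stock OpenCV Haar cascade and the blur kernel used here are assumptions, not the authors' pipeline.

```python
# Illustrative per-frame face blurring, NOT the SSVP-SLT implementation.
# The Haar-cascade detector and the kernel size are assumed for illustration.
import cv2

cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

def blur_faces(frame):
    """Return a copy of a BGR frame with detected face regions Gaussian-blurred."""
    out = frame.copy()
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    for (x, y, w, h) in cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5):
        # Kernel dimensions must be odd; a large kernel removes identifying detail.
        out[y:y + h, x:x + w] = cv2.GaussianBlur(out[y:y + h, x:x + w], (51, 51), 0)
    return out
```

Blurring every frame this way removes identifying facial detail while leaving hands and body motion intact, which is consistent with the performance drop the summary reports: facial expressions carry grammatical information in sign languages, so obscuring them costs signal.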
From b6123db2047be1c8066571664d2d171a08893e7f Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Tue, 28 May 2024 10:18:51 -0400
Subject: [PATCH 8/8] CDL: rearranging sentences in rust2024PrivacyAwareSign

---
 src/index.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/index.md b/src/index.md
index 8229e26..8e27711 100644
--- a/src/index.md
+++ b/src/index.md
@@ -773,9 +773,10 @@ The first stage involves self-supervised pretraining of a Hiera vision transform
 In the second stage, the vision model's outputs are fed into a multilingual language model [@raffel2020T5Transformer] for finetuning on the How2Sign dataset [@dataset:duarte2020how2sign].
 To mitigate privacy risks, the framework employs facial blurring during pretraining.
 They find that while pretraining with blurring hurts performance, some of it can be recovered when finetuning on unblurred data.
+SSVP-SLT achieves state-of-the-art performance on How2Sign [@dataset:duarte2020how2sign].
 They conclude that SLT models can be pretrained in a privacy-aware manner without sacrificing too much performance.
 Additionally, the authors release DailyMoth-70h, a new 70-hour ASL dataset from [The Daily Moth](https://www.dailymoth.com/).
-SSVP-SLT achieves state-of-the-art performance on How2Sign [@dataset:duarte2020how2sign].
+
 
 
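The two-stage design in the final summary (frozen self-supervised video features projected into a multilingual language model) can be pictured with a short sketch. This is an illustration under stated assumptions, not the released SSVP-SLT code: the random tensor stands in for SignHiera features, and the single linear projection and the feature shapes are assumed.

```python
# Minimal sketch of stage 2: project frozen video features into T5 and finetune.
# NOT the authors' implementation; the projection layer and shapes are assumed.
import torch
import torch.nn as nn
from transformers import T5ForConditionalGeneration, T5TokenizerFast

t5 = T5ForConditionalGeneration.from_pretrained("t5-base")
tok = T5TokenizerFast.from_pretrained("t5-base")

vision_dim = 768                           # assumed width of the frozen encoder
proj = nn.Linear(vision_dim, t5.config.d_model)

# Stand-in for per-clip outputs of the frozen stage-1 video encoder (SignHiera).
feats = torch.randn(2, 64, vision_dim)     # (batch, clips, vision_dim)

targets = ["the weather is nice today", "hello everyone"]
labels = tok(targets, padding=True, return_tensors="pt").input_ids
labels[labels == tok.pad_token_id] = -100  # padding tokens are ignored by the loss

# T5 accepts pre-computed embeddings via `inputs_embeds`; gradients flow into
# `proj` and the T5 weights, while the vision encoder itself stays frozen.
loss = t5(inputs_embeds=proj(feats), labels=labels).loss
loss.backward()
```

Finetuning would iterate this loss over How2Sign-style video-text pairs; blurred or unblurred frames enter only through the features that the frozen encoder produces.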