diff --git a/src/index.md b/src/index.md index 271e872..8e27711 100644 --- a/src/index.md +++ b/src/index.md @@ -766,6 +766,20 @@ In training, the VQ-Sign "character-level" module is trained with a context pred The framework achieves state-of-the-art results on the RWTH-PHOENIX-Weather-2014T [@cihan2018neural] and CSL-Daily [@dataset:huang2018video] datasets without relying on gloss annotations. + + +@rust2024PrivacyAwareSign introduce a two-stage privacy-aware method for sign language translation (SLT) at scale, termed Self-Supervised Video Pretraining for Sign Language Translation (SSVP-SLT). +The first stage involves self-supervised pretraining of a Hiera vision transformer [@ryali2023HieraVisionTransformer] on large unannotated video datasets [@dataset:duarte2020how2sign; @dataset:uthus2023YoutubeASL]. +In the second stage, the vision model's outputs are fed into a multilingual language model [@raffel2020T5Transformer] for finetuning on the How2Sign dataset [@dataset:duarte2020how2sign]. +To mitigate privacy risks, the framework employs facial blurring during pretraining. +They find that while pretraining with blurring hurts performance, some of the lost performance can be recovered when finetuning with unblurred data. +SSVP-SLT achieves state-of-the-art performance on How2Sign [@dataset:duarte2020how2sign]. +They conclude that SLT models can be pretrained in a privacy-aware manner without sacrificing too much performance. +Additionally, the authors release DailyMoth-70h, a new 70-hour ASL dataset from [The Daily Moth](https://www.dailymoth.com/). + + + + #### Text-to-Video @@ -1008,6 +1022,7 @@ sign language resources are scarce and, currently only support translation and p Unfortunately, most of the sign language corpora discussed in the literature are either not available for use or available under heavy restrictions and licensing terms. 
Furthermore, sign language data is especially challenging to anonymize due to the importance of facial and other physical features in signing videos, limiting its open distribution. Developing anonymization with minimal information loss or accurate anonymous representations is a promising research direction. + ### Collect Real-World Data diff --git a/src/references.bib b/src/references.bib index cb3ec7b..48cb3e0 100644 --- a/src/references.bib +++ b/src/references.bib @@ -3100,3 +3100,70 @@ @inproceedings{post-2018-call-sacrebleu doi = "10.18653/v1/W18-6319", pages = "186--191" } + +@misc{rust2024PrivacyAwareSign, + title={Towards Privacy-Aware Sign Language Translation at Scale}, + author={Phillip Rust and Bowen Shi and Skyler Wang and Necati Cihan Camgöz and Jean Maillard}, + year={2024}, + eprint={2402.09611}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} + +@inproceedings{dataset:uthus2023YoutubeASL, + author = {Uthus, Dave and Tanzer, Garrett and Georg, Manfred}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. 
Levine}, + pages = {29029--29047}, + publisher = {Curran Associates, Inc.}, + title = {YouTube-ASL: A Large-Scale, Open-Domain American Sign Language-English Parallel Corpus}, + url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5c61452daca5f0c260e683b317d13a3f-Paper-Datasets_and_Benchmarks.pdf}, + volume = {36}, + year = {2023} +} + + +@InProceedings{ryali2023HieraVisionTransformer, + title = {Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles}, + author = {Ryali, Chaitanya and Hu, Yuan-Ting and Bolya, Daniel and Wei, Chen and Fan, Haoqi and Huang, Po-Yao and Aggarwal, Vaibhav and Chowdhury, Arkabandhu and Poursaeed, Omid and Hoffman, Judy and Malik, Jitendra and Li, Yanghao and Feichtenhofer, Christoph}, + booktitle = {Proceedings of the 40th International Conference on Machine Learning}, + pages = {29441--29454}, + year = {2023}, + editor = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan}, + volume = {202}, + series = {Proceedings of Machine Learning Research}, + month = {23--29 Jul}, + publisher = {PMLR}, + pdf = {https://proceedings.mlr.press/v202/ryali23a/ryali23a.pdf}, + url = {https://proceedings.mlr.press/v202/ryali23a.html} +} + +@article{raffel2020T5Transformer, + author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. 
Liu}, + title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}, + journal = {Journal of Machine Learning Research}, + year = {2020}, + volume = {21}, + number = {140}, + pages = {1--67}, + url = {http://jmlr.org/papers/v21/20-074.html} +} + +@inproceedings{sellam-etal-2020-bleurt, + title = "{BLEURT}: Learning Robust Metrics for Text Generation", + author = "Sellam, Thibault and + Das, Dipanjan and + Parikh, Ankur", + editor = "Jurafsky, Dan and + Chai, Joyce and + Schluter, Natalie and + Tetreault, Joel", + booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", + month = jul, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.acl-main.704", + doi = "10.18653/v1/2020.acl-main.704", + pages = "7881--7892" +}