From 39f6a5a8bf3d6ba715d9ea22332b21e572906520 Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Wed, 5 Jun 2024 11:20:51 -0400
Subject: [PATCH 1/5] CDL: add citation for
 Zhao2023BESTPretrainingSignLanguageRecognition

---
 src/references.bib | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/references.bib b/src/references.bib
index 3fabbfd..85ae451 100644
--- a/src/references.bib
+++ b/src/references.bib
@@ -3150,3 +3150,16 @@ @inproceedings{sellam-etal-2020-bleurt
  url = {https://aclanthology.org/2020.acl-main.704},
  year = {2020}
 }
+
+@article{Zhao2023BESTPretrainingSignLanguageRecognition,
+ title = {BEST: BERT Pre-training for Sign Language Recognition with Coupling Tokenization},
+ volume = {37},
+ url = {https://ojs.aaai.org/index.php/AAAI/article/view/25470},
+ doi = {10.1609/aaai.v37i3.25470},
+ number = {3},
+ journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
+ author = {Zhao, Weichao and Hu, Hezhen and Zhou, Wengang and Shi, Jiaxin and Li, Houqiang},
+ year = {2023},
+ month = {Jun.},
+ pages = {3597-3605}
+}
\ No newline at end of file

From 86da6f0f29d8da6aa48530c4bb1453cd2f742a38 Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Wed, 5 Jun 2024 11:21:25 -0400
Subject: [PATCH 2/5] CDL: citation for VQ-VAE

---
 src/references.bib | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/references.bib b/src/references.bib
index 85ae451..018f36e 100644
--- a/src/references.bib
+++ b/src/references.bib
@@ -3162,4 +3162,18 @@ @article{Zhao2023BESTPretrainingSignLanguageRecognition
  year = {2023},
  month = {Jun.},
  pages = {3597-3605}
+}
+
+@inproceedings{van_den_Oord_2017NeuralDiscreteRepresentationLearning,
+author = {van den Oord, Aaron and Vinyals, Oriol and Kavukcuoglu, Koray},
+title = {Neural discrete representation learning},
+year = {2017},
+isbn = {9781510860964},
+publisher = {Curran Associates Inc.},
+address = {Red Hook, NY, USA},
+booktitle = {Proceedings of the 31st International Conference on Neural Information Processing Systems},
+pages = {6309–6318},
+numpages = {10},
+location = {Long Beach, California, USA},
+series = {NIPS'17}
 }
\ No newline at end of file

From 210c73e04be84661291016841af625c95a8fe4bd Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Wed, 5 Jun 2024 14:31:57 -0400
Subject: [PATCH 3/5] CDL: first draft of BEST summary

---
 src/index.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/index.md b/src/index.md
index fe7d494..b819f3b 100644
--- a/src/index.md
+++ b/src/index.md
@@ -961,6 +961,19 @@ They found that for both forms of fingerspelling, on average, the longer the wor
 Furthermore, they found that less time is spent on middle letters on average, and the last letter is held on average for longer than the other letters in the word.
 Finally, they used this information to construct an animation system using letter pose interpolation and controlled the timing using a data-driven statistical model.
 
+### Pretraining and Representation-Learning
+
+
+@Zhao2023BESTPretrainingSignLanguageRecognition introduce BEST, a pretraining method based on masked modeling of pose sequences using a coupled tokenization scheme.
+The method takes in pose triplet units (left hand, right hand, and upper-body with arms) as inputs.
+The pose for each part of the triplet is tokenized into discrete codes [@van_den_Oord_2017NeuralDiscreteRepresentationLearning].
+Then masked modeling is employed: any or all of the three parts may be masked, e.g. left hand, or right hand, or body+hand, or all of them...
+Unlike @hu2023SignBertPlus, they do not mask multi-frame sequnces ("clips") or sub-frame portions of a pose unit (joints).
+They validate their pretraining method isolated ISR (MS-ASL [@dataset:joze2018ms], WLASL [@dataset:li2020word], SLR500 [@huang2019attention3DCNNsSLR] and NMFs-CSL [@hu2021NMFAwareSLR]).
+They experiment with both pose-to-gloss and video-to-gloss via fusion with I3D [@carreira2017quo].
+Results on these datasets are SOTA compared to previous methods, and quite similar to those of SignBERT+ [@hu2023SignBertPlus]
+
+
 ## Annotation Tools
 
 ##### ELAN - EUDICO Linguistic Annotator

From d00bd1e5241ca1248460a1b9268f072f2eb5a3ca Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Tue, 11 Jun 2024 13:14:22 -0400
Subject: [PATCH 4/5] CDL: {} in BEST citation

---
 src/references.bib | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/references.bib b/src/references.bib
index bdf3ac7..3ae4cd7 100644
--- a/src/references.bib
+++ b/src/references.bib
@@ -3355,7 +3355,7 @@ @INPROCEEDINGS{dataset:Zhou2021_SignBackTranslation_CSLDaily
 }
 
 @article{Zhao2023BESTPretrainingSignLanguageRecognition,
- title = {BEST: BERT Pre-training for Sign Language Recognition with Coupling Tokenization},
+ title = {{BEST}: {BERT} Pre-training for {S}ign Language Recognition with Coupling {T}okenization},
  volume = {37},
  url = {https://ojs.aaai.org/index.php/AAAI/article/view/25470},
  doi = {10.1609/aaai.v37i3.25470},

From 73358ac081362abca4bae3252b9025c2d607587d Mon Sep 17 00:00:00 2001
From: Colin Leong <122366389+cleong110@users.noreply.github.com>
Date: Tue, 11 Jun 2024 15:58:03 -0400
Subject: [PATCH 5/5] CDL: another rewrite with some synthesis of advice from
 various sources

---
 src/index.md | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/index.md b/src/index.md
index e44aeb4..4b0fa91 100644
--- a/src/index.md
+++ b/src/index.md
@@ -960,15 +960,13 @@ They then perform multi-level masked modeling (joints, frames, clips) on these s
 Validation on isolated SLR (MS-ASL [@dataset:joze2018ms], WLASL [@dataset:li2020word], SLR500 [@huang2019attention3DCNNsSLR]), continuous SLR (RWTH-PHOENIX-Weather [@koller2015ContinuousSLR]), and SLT (RWTH-PHOENIX-Weather 2014T [@dataset:forster2014extensions;@cihan2018neural]) demonstrates state-of-the-art performance.
 
-@Zhao2023BESTPretrainingSignLanguageRecognition introduce BEST, a pretraining method based on masked modeling of pose sequences using a coupled tokenization scheme.
-The method takes in pose triplet units (left hand, right hand, and upper-body with arms) as inputs.
-The pose for each part of the triplet is tokenized into discrete codes [@van_den_Oord_2017NeuralDiscreteRepresentationLearning].
-Then masked modeling is employed: any or all of the three parts may be masked, e.g. left hand, or right hand, or body+hand, or all of them...
-Unlike @hu2023SignBertPlus, they do not mask multi-frame sequnces ("clips") or sub-frame portions of a pose unit (joints).
-They validate their pretraining method isolated ISR (MS-ASL [@dataset:joze2018ms], WLASL [@dataset:li2020word], SLR500 [@huang2019attention3DCNNsSLR] and NMFs-CSL [@hu2021NMFAwareSLR]).
-They experiment with both pose-to-gloss and video-to-gloss via fusion with I3D [@carreira2017quo].
-Results on these datasets are SOTA compared to previous methods, and quite similar to those of SignBERT+ [@hu2023SignBertPlus]
-
+@Zhao2023BESTPretrainingSignLanguageRecognition introduce BEST (BERT Pre-training for Sign Language Recognition with Coupling Tokenization), a pre-training method based on masked modeling of pose sequences using a coupled tokenization scheme.
+This method takes pose triplet units (left hand, right hand, and upper-body with arms) as inputs, each tokenized into discrete codes [@van_den_Oord_2017NeuralDiscreteRepresentationLearning] that are then coupled together.
+Masked modeling is then applied, where any or all components of the triplet (left hand, right hand, or upper-body) may be masked, to learn hierarchical correlations among them.
+Unlike @hu2023SignBertPlus, BEST does not mask multi-frame pose sequences or individual joints.
+The authors validate their pre-training method on isolated sign recognition (ISR) tasks using MS-ASL [@dataset:joze2018ms], WLASL [@dataset:li2020word], SLR500 [@huang2019attention3DCNNsSLR], and NMFs-CSL [@hu2021NMFAwareSLR].
+Besides pose-to-gloss, they also experiment with video-to-gloss tasks via fusion with I3D [@carreira2017quo].
+Results on these datasets demonstrate state-of-the-art performance compared to previous methods and are comparable to those of SignBERT+ [@hu2023SignBertPlus].
 
 ## Annotation Tools