From e2ce2ad62d09f9893897bad99aa83bf9cbfdd68f Mon Sep 17 00:00:00 2001 From: Colin Leong <--unset> Date: Thu, 20 Jun 2024 17:39:49 -0400 Subject: [PATCH 1/4] CDL: updating NCSLGR (take 2) --- src/datasets/NCSLGR.json | 4 ++-- src/index.md | 2 +- src/references.bib | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/datasets/NCSLGR.json b/src/datasets/NCSLGR.json index aad9aad..a6cb675 100644 --- a/src/datasets/NCSLGR.json +++ b/src/datasets/NCSLGR.json @@ -15,7 +15,7 @@ "#items": null, "#samples": "1,875 sentences", "#signers": 4, - "license": "TODO", - "licenseUrl": null, + "license": "Research Attribution", + "licenseUrl": "https://www.bu.edu/asllrp/data-credits.html", "contact": "carol@bu.edu" } diff --git a/src/index.md b/src/index.md index 51a543e..5ddb333 100644 --- a/src/index.md +++ b/src/index.md @@ -1046,7 +1046,7 @@ are collections of annotated single signs. They are synthesized [@dataset:ebling contain parallel sequences of signs and spoken language. Available continuous sign corpora are extremely limited, containing 4-6 orders of magnitude fewer sentence pairs than similar corpora for spoken language machine translation [@arivazhagan2019massively]. Moreover, while automatic speech recognition (ASR) datasets contain up to 50,000 hours of recordings [@pratap2020mls], the most extensive continuous sign language corpus contains only 1,150 hours, and only 50 of them are publicly available [@dataset:hanke-etal-2020-extending]. -These datasets are usually synthesized [@dataset:databases2007volumes;@dataset:Crasborn2008TheCN;@dataset:ko2019neural;@dataset:hanke-etal-2020-extending] or recorded in studio conditions [@dataset:forster2014extensions;@cihan2018neural], which does not account for noise in real-life conditions. 
Moreover, some contain signed interpretations of spoken language rather than naturally-produced signs, which may not accurately represent native signing since translation is now a part of the discourse event. +These datasets are usually synthesized [@dataset:Neidle_2020_NCSLGR_ISLRN;@dataset:Crasborn2008TheCN;@dataset:ko2019neural;@dataset:hanke-etal-2020-extending] or recorded in studio conditions [@dataset:forster2014extensions;@cihan2018neural], which does not account for noise in real-life conditions. Moreover, some contain signed interpretations of spoken language rather than naturally-produced signs, which may not accurately represent native signing since translation is now a part of the discourse event. ###### Availability {-} diff --git a/src/references.bib b/src/references.bib index b5c3c6a..3f6c3a1 100644 --- a/src/references.bib +++ b/src/references.bib @@ -3457,3 +3457,19 @@ @inproceedings{dataset:dal2022lsa url = {https://doi.org/10.1007/978-3-031-22419-5_25}, year = {2023} } + +@inproceedings{Vogler2012ANW, + title={A new web interface to facilitate access to corpora: development of the ASLLRP data access interface}, + author={Christian Vogler and C. Neidle}, + year={2012}, + url={https://api.semanticscholar.org/CorpusID:58305327} +} + +@misc{dataset:Neidle_2020_NCSLGR_ISLRN, + type = {Languageresource}, + title = {National Center for Sign Language and Gesture Resources (NCSLGR) corpus. 
ISLRN 833-505-711-564-4}, + author = {Carol Neidle and Stan Sclaroff}, + year = {2012}, + publisher = {Boston University}, + url = {https://www.islrn.org/resources/833-505-711-564-4/} +} \ No newline at end of file From 51415a5c13db92efbd3753cec7a4cdad848809b9 Mon Sep 17 00:00:00 2001 From: Colin Leong <--unset> Date: Thu, 20 Jun 2024 17:43:29 -0400 Subject: [PATCH 2/4] CDL: change ref in JSON --- src/datasets/NCSLGR.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/NCSLGR.json b/src/datasets/NCSLGR.json index a6cb675..f9ef6c0 100644 --- a/src/datasets/NCSLGR.json +++ b/src/datasets/NCSLGR.json @@ -2,7 +2,7 @@ "pub": { "name": "NCSLGR", "year": 2007, - "publication": "dataset:databases2007volumes", + "publication": "dataset:Neidle_2020_NCSLGR_ISLRN", "url": "https://www.bu.edu/asllrp/ncslgr.html" }, "loader": "ncslgr", From 7395b4af3cabfd8b21da0979602d0b0a30db305f Mon Sep 17 00:00:00 2001 From: Colin Leong <122366389+cleong110@users.noreply.github.com> Date: Fri, 21 Jun 2024 09:48:59 -0400 Subject: [PATCH 3/4] Adding a few notes on code/comment style --- README.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cbea270..6dbabd3 100644 --- a/README.md +++ b/README.md @@ -72,4 +72,18 @@ Respond in a Markdown code block, conserving \n characters. Respond in a Markdown code block. Conserve \n characters (new-line characters). -i.e., break line before "They parse the English text..." \ No newline at end of file +i.e., break line before "They parse the English text..." + + +### Code Style + +Some guidelines for good code and commenting style. + +#### Commenting + +- Comments should not be redundant. That is, if someone with a basic knowledge of the programming language can tell at a glance what the code does, there's no need to explain it. For example, the JavaScript `const fs = require('fs');` does not need to be explained. 
+- Don't use personal sign-offs or openings; the code should stand on its own regardless of who wrote it. For example, write `// maps to emoji`, not `//Colin: maps to emoji`. + +#### Further Reading + +Google has a [JavaScript Style Guide](https://google.github.io/styleguide/jsguide.html). From 540b49c4c3cfc7021c70c58c813be4216d449822 Mon Sep 17 00:00:00 2001 From: Colin Leong <122366389+cleong110@users.noreply.github.com> Date: Fri, 21 Jun 2024 09:50:15 -0400 Subject: [PATCH 4/4] CDL: updating citation key for NCSLGR --- src/index.md | 2 +- src/references.bib | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/index.md b/src/index.md index 5ddb333..e6ddbb0 100644 --- a/src/index.md +++ b/src/index.md @@ -1046,7 +1046,7 @@ are collections of annotated single signs. They are synthesized [@dataset:ebling contain parallel sequences of signs and spoken language. Available continuous sign corpora are extremely limited, containing 4-6 orders of magnitude fewer sentence pairs than similar corpora for spoken language machine translation [@arivazhagan2019massively]. Moreover, while automatic speech recognition (ASR) datasets contain up to 50,000 hours of recordings [@pratap2020mls], the most extensive continuous sign language corpus contains only 1,150 hours, and only 50 of them are publicly available [@dataset:hanke-etal-2020-extending]. -These datasets are usually synthesized [@dataset:Neidle_2020_NCSLGR_ISLRN;@dataset:Crasborn2008TheCN;@dataset:ko2019neural;@dataset:hanke-etal-2020-extending] or recorded in studio conditions [@dataset:forster2014extensions;@cihan2018neural], which does not account for noise in real-life conditions. Moreover, some contain signed interpretations of spoken language rather than naturally-produced signs, which may not accurately represent native signing since translation is now a part of the discourse event. 
+These datasets are usually synthesized [@dataset:Neidle_2012_NCSLGR_ISLRN;@dataset:Crasborn2008TheCN;@dataset:ko2019neural;@dataset:hanke-etal-2020-extending] or recorded in studio conditions [@dataset:forster2014extensions;@cihan2018neural], which does not account for noise in real-life conditions. Moreover, some contain signed interpretations of spoken language rather than naturally-produced signs, which may not accurately represent native signing since translation is now a part of the discourse event. ###### Availability {-} diff --git a/src/references.bib b/src/references.bib index 3f6c3a1..865b6df 100644 --- a/src/references.bib +++ b/src/references.bib @@ -3465,7 +3465,7 @@ @inproceedings{Vogler2012ANW url={https://api.semanticscholar.org/CorpusID:58305327} } -@misc{dataset:Neidle_2020_NCSLGR_ISLRN, +@misc{dataset:Neidle_2012_NCSLGR_ISLRN, type = {Languageresource}, title = {National Center for Sign Language and Gesture Resources (NCSLGR) corpus. ISLRN 833-505-711-564-4}, author = {Carol Neidle and Stan Sclaroff},