Updated split arg. Added docs for SST2 and CC100 (#1604)

Nayef211 · web-flow · commit 72094214b370 · 2022-02-11T21:09:05.000-05:00
diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst
@@ -32,84 +32,74 @@ AG_NEWS
 
 .. autofunction:: AG_NEWS
 
+AmazonReviewFull
+~~~~~~~~~~~~~~~~
 
-SogouNews
-~~~~~~~~~
+.. autofunction:: AmazonReviewFull
 
-.. autofunction:: SogouNews
+AmazonReviewPolarity
+~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: AmazonReviewPolarity
 
 DBpedia
 ~~~~~~~
 
 .. autofunction:: DBpedia
 
-YelpReviewPolarity
-~~~~~~~~~~~~~~~~~~
+IMDb
+~~~~
 
-.. autofunction:: YelpReviewPolarity
+.. autofunction:: IMDB
 
-YelpReviewFull
-~~~~~~~~~~~~~~
+SogouNews
+~~~~~~~~~
 
-.. autofunction:: YelpReviewFull
+.. autofunction:: SogouNews
+
+SST2
+~~~~
+
+.. autofunction:: SST2
 
 YahooAnswers
 ~~~~~~~~~~~~
 
 .. autofunction:: YahooAnswers
 
-AmazonReviewPolarity
-~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: AmazonReviewPolarity
-
-AmazonReviewFull
-~~~~~~~~~~~~~~~~
-
-.. autofunction:: AmazonReviewFull
-
-IMDb
-~~~~
+YelpReviewFull
+~~~~~~~~~~~~~~
 
-.. autofunction:: IMDB
+.. autofunction:: YelpReviewFull
 
-SST2
-~~~~
+YelpReviewPolarity
+~~~~~~~~~~~~~~~~~~
 
-.. autofunction:: SST2
+.. autofunction:: YelpReviewPolarity
 
 
 Language Modeling
 ^^^^^^^^^^^^^^^^^
 
+PennTreebank
+~~~~~~~~~~~~
+
+.. autofunction:: PennTreebank
+
 WikiText-2
 ~~~~~~~~~~
 
 .. autofunction:: WikiText2
 
-
 WikiText103
 ~~~~~~~~~~~
 
 .. autofunction:: WikiText103
 
 
-PennTreebank
-~~~~~~~~~~~~
-
-.. autofunction:: PennTreebank
-
-
 Machine Translation
 ^^^^^^^^^^^^^^^^^^^
 
-Multi30k
-~~~~~~~~
-
-.. autofunction:: Multi30k
-
-
-
 IWSLT2016
 ~~~~~~~~~
 
@@ -120,20 +110,25 @@ IWSLT2017
 
 .. autofunction:: IWSLT2017
 
+Multi30k
+~~~~~~~~
 
-Sequence Tagging
-^^^^^^^^^^^^^^^^
+.. autofunction:: Multi30k
 
-UDPOS
-~~~~~
 
-.. autofunction:: UDPOS
+Sequence Tagging
+^^^^^^^^^^^^^^^^
 
 CoNLL2000Chunking
 ~~~~~~~~~~~~~~~~~
 
 .. autofunction:: CoNLL2000Chunking
 
+UDPOS
+~~~~~
+
+.. autofunction:: UDPOS
+
 
 Question Answer
 ^^^^^^^^^^^^^^^
@@ -153,6 +148,11 @@ SQuAD 2.0
 Unsupervised Learning
 ^^^^^^^^^^^^^^^^^^^^^
 
+CC100
+~~~~~
+
+.. autofunction:: CC100
+
 EnWik9
 ~~~~~~
 
diff --git a/torchtext/datasets/cc100.py b/torchtext/datasets/cc100.py
@@ -30,6 +30,17 @@
 
 @_create_dataset_directory(dataset_name=DATASET_NAME)
 def CC100(root: str, language_code: str = "en"):
+    """CC100 Dataset
+
+    For additional details refer to https://data.statmt.org/cc-100/
+
+    Args:
+        root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')
+        language_code: the language of the dataset; must be one of VALID_CODES, otherwise a ValueError is raised. Default: 'en'
+
+    :returns: DataPipe that yields tuple of language code and text
+    :rtype: (str, str)
+    """
     if language_code not in VALID_CODES:
         raise ValueError(f"Invalid language code {language_code}")
 
diff --git a/torchtext/datasets/conll2000chunking.py b/torchtext/datasets/conll2000chunking.py
@@ -39,9 +39,8 @@ def CoNLL2000Chunking(root: str, split: Union[Tuple[str], str]):
     For additional details refer to https://www.clips.uantwerpen.be/conll2000/chunking/
 
     Number of lines per split:
-        train: 8936
-
-        test: 2012
+        - train: 8936
+        - test: 2012
 
     Args:
         root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')
diff --git a/torchtext/datasets/multi30k.py b/torchtext/datasets/multi30k.py
@@ -47,6 +47,11 @@ def Multi30k(
 
     For additional details refer to https://www.statmt.org/wmt16/multimodal-task.html#task1
 
+    Number of lines per split:
+        - train: 29000
+        - valid: 1014
+        - test: 1000
+
     Args:
         root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')
         split: split or splits to be returned. Can be a string or tuple of strings. Default: ('train', 'valid', 'test')
diff --git a/torchtext/datasets/squad1.py b/torchtext/datasets/squad1.py
@@ -38,10 +38,8 @@ def SQuAD1(root: str, split: Union[Tuple[str], str]):
     For additional details refer to https://rajpurkar.github.io/SQuAD-explorer/
 
     Number of lines per split:
-        train: 87599
-
-        Dev: 10570
-
+        - train: 87599
+        - dev: 10570
 
     Args:
         root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')
diff --git a/torchtext/datasets/squad2.py b/torchtext/datasets/squad2.py
@@ -38,9 +38,8 @@ def SQuAD2(root: str, split: Union[Tuple[str], str]):
     For additional details refer to https://rajpurkar.github.io/SQuAD-explorer/
 
     Number of lines per split:
-        train: 130319
-
-        Dev: 11873
+        - train: 130319
+        - dev: 11873
 
 
     Args:
diff --git a/torchtext/datasets/sst2.py b/torchtext/datasets/sst2.py
@@ -3,7 +3,6 @@
 
 from torchtext._internal.module_utils import is_module_available
 from torchtext.data.datasets_utils import (
-    _add_docstring_header,
     _create_dataset_directory,
     _wrap_split_argument,
 )
@@ -37,10 +36,25 @@
 }
 
 
-@_add_docstring_header(num_lines=NUM_LINES, num_classes=2)
 @_create_dataset_directory(dataset_name=DATASET_NAME)
 @_wrap_split_argument(("train", "dev", "test"))
 def SST2(root, split):
+    """SST2 Dataset
+
+    For additional details refer to https://nlp.stanford.edu/sentiment/
+
+    Number of lines per split:
+        - train: 67349
+        - dev: 872
+        - test: 1821
+
+    Args:
+        root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')
+        split: split or splits to be returned. Can be a string or tuple of strings. Default: ('train', 'dev', 'test')
+
+    :returns: DataPipe that yields tuple of text and/or label (one of 2 sentiment classes). The `test` split only returns text.
+    :rtype: Union[(int, str), (str,)]
+    """
     # TODO Remove this after removing conditional dependency
     if not is_module_available("torchdata"):
         raise ModuleNotFoundError(
diff --git a/torchtext/datasets/udpos.py b/torchtext/datasets/udpos.py
@@ -33,11 +33,9 @@ def UDPOS(root: str, split: Union[Tuple[str], str]):
     """UDPOS Dataset
 
     Number of lines per split:
-        train: 12543
-
-        valid: 2002
-
-        test: 2077
+        - train: 12543
+        - valid: 2002
+        - test: 2077
 
     Args:
         root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')