Skip to content

Commit 7209421

Browse files
authored
Updated split arg. Added docs for SST2 and CC100 (#1604)
1 parent 2e93d94 commit 7209421

File tree

8 files changed

+85
-61
lines changed

8 files changed

+85
-61
lines changed

docs/source/datasets.rst

Lines changed: 44 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -32,84 +32,74 @@ AG_NEWS
3232

3333
.. autofunction:: AG_NEWS
3434

35+
AmazonReviewFull
36+
~~~~~~~~~~~~~~~~
3537

36-
SogouNews
37-
~~~~~~~~~
38+
.. autofunction:: AmazonReviewFull
3839

39-
.. autofunction:: SogouNews
40+
AmazonReviewPolarity
41+
~~~~~~~~~~~~~~~~~~~~
42+
43+
.. autofunction:: AmazonReviewPolarity
4044

4145
DBpedia
4246
~~~~~~~
4347

4448
.. autofunction:: DBpedia
4549

46-
YelpReviewPolarity
47-
~~~~~~~~~~~~~~~~~~
50+
IMDb
51+
~~~~
4852

49-
.. autofunction:: YelpReviewPolarity
53+
.. autofunction:: IMDB
5054

51-
YelpReviewFull
52-
~~~~~~~~~~~~~~
55+
SogouNews
56+
~~~~~~~~~
5357

54-
.. autofunction:: YelpReviewFull
58+
.. autofunction:: SogouNews
59+
60+
SST2
61+
~~~~
62+
63+
.. autofunction:: SST2
5564

5665
YahooAnswers
5766
~~~~~~~~~~~~
5867

5968
.. autofunction:: YahooAnswers
6069

61-
AmazonReviewPolarity
62-
~~~~~~~~~~~~~~~~~~~~
63-
64-
.. autofunction:: AmazonReviewPolarity
65-
66-
AmazonReviewFull
67-
~~~~~~~~~~~~~~~~
68-
69-
.. autofunction:: AmazonReviewFull
70-
71-
IMDb
72-
~~~~
70+
YelpReviewFull
71+
~~~~~~~~~~~~~~
7372

74-
.. autofunction:: IMDB
73+
.. autofunction:: YelpReviewFull
7574

76-
SST2
77-
~~~~
75+
YelpReviewPolarity
76+
~~~~~~~~~~~~~~~~~~
7877

79-
.. autofunction:: SST2
78+
.. autofunction:: YelpReviewPolarity
8079

8180

8281
Language Modeling
8382
^^^^^^^^^^^^^^^^^
8483

84+
PennTreebank
85+
~~~~~~~~~~~~
86+
87+
.. autofunction:: PennTreebank
88+
8589
WikiText-2
8690
~~~~~~~~~~
8791

8892
.. autofunction:: WikiText2
8993

90-
9194
WikiText103
9295
~~~~~~~~~~~
9396

9497
.. autofunction:: WikiText103
9598

9699

97-
PennTreebank
98-
~~~~~~~~~~~~
99-
100-
.. autofunction:: PennTreebank
101-
102-
103100
Machine Translation
104101
^^^^^^^^^^^^^^^^^^^
105102

106-
Multi30k
107-
~~~~~~~~
108-
109-
.. autofunction:: Multi30k
110-
111-
112-
113103
IWSLT2016
114104
~~~~~~~~~
115105

@@ -120,20 +110,25 @@ IWSLT2017
120110

121111
.. autofunction:: IWSLT2017
122112

113+
Multi30k
114+
~~~~~~~~
123115

124-
Sequence Tagging
125-
^^^^^^^^^^^^^^^^
116+
.. autofunction:: Multi30k
126117

127-
UDPOS
128-
~~~~~
129118

130-
.. autofunction:: UDPOS
119+
Sequence Tagging
120+
^^^^^^^^^^^^^^^^
131121

132122
CoNLL2000Chunking
133123
~~~~~~~~~~~~~~~~~
134124

135125
.. autofunction:: CoNLL2000Chunking
136126

127+
UDPOS
128+
~~~~~
129+
130+
.. autofunction:: UDPOS
131+
137132

138133
Question Answer
139134
^^^^^^^^^^^^^^^
@@ -153,6 +148,11 @@ SQuAD 2.0
153148
Unsupervised Learning
154149
^^^^^^^^^^^^^^^^^^^^^
155150

151+
CC100
152+
~~~~~
153+
154+
.. autofunction:: CC100
155+
156156
EnWik9
157157
~~~~~~
158158

torchtext/datasets/cc100.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,17 @@
3030

3131
@_create_dataset_directory(dataset_name=DATASET_NAME)
3232
def CC100(root: str, language_code: str = "en"):
33+
"""CC100 Dataset
34+
35+
For additional details refer to https://data.statmt.org/cc-100/
36+
37+
Args:
38+
root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')
39+
language_code: the language code of the dataset to load; must be one of VALID_CODES, otherwise a ValueError is raised. Default: 'en'
40+
41+
:returns: DataPipe that yields tuple of language code and text
42+
:rtype: (str, str)
43+
"""
3344
if language_code not in VALID_CODES:
3445
raise ValueError(f"Invalid language code {language_code}")
3546

torchtext/datasets/conll2000chunking.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,8 @@ def CoNLL2000Chunking(root: str, split: Union[Tuple[str], str]):
3939
For additional details refer to https://www.clips.uantwerpen.be/conll2000/chunking/
4040
4141
Number of lines per split:
42-
train: 8936
43-
44-
test: 2012
42+
- train: 8936
43+
- test: 2012
4544
4645
Args:
4746
root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')

torchtext/datasets/multi30k.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ def Multi30k(
4747
4848
For additional details refer to https://www.statmt.org/wmt16/multimodal-task.html#task1
4949
50+
Number of lines per split:
51+
- train: 29000
52+
- valid: 1014
53+
- test: 1000
54+
5055
Args:
5156
root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')
5257
split: split or splits to be returned. Can be a string or tuple of strings. Default: ('train', 'valid', 'test')

torchtext/datasets/squad1.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,8 @@ def SQuAD1(root: str, split: Union[Tuple[str], str]):
3838
For additional details refer to https://rajpurkar.github.io/SQuAD-explorer/
3939
4040
Number of lines per split:
41-
train: 87599
42-
43-
Dev: 10570
44-
41+
- train: 87599
42+
- dev: 10570
4543
4644
Args:
4745
root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')

torchtext/datasets/squad2.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,8 @@ def SQuAD2(root: str, split: Union[Tuple[str], str]):
3838
For additional details refer to https://rajpurkar.github.io/SQuAD-explorer/
3939
4040
Number of lines per split:
41-
train: 130319
42-
43-
Dev: 11873
41+
- train: 130319
42+
- dev: 11873
4443
4544
4645
Args:

torchtext/datasets/sst2.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
from torchtext._internal.module_utils import is_module_available
55
from torchtext.data.datasets_utils import (
6-
_add_docstring_header,
76
_create_dataset_directory,
87
_wrap_split_argument,
98
)
@@ -37,10 +36,25 @@
3736
}
3837

3938

40-
@_add_docstring_header(num_lines=NUM_LINES, num_classes=2)
4139
@_create_dataset_directory(dataset_name=DATASET_NAME)
4240
@_wrap_split_argument(("train", "dev", "test"))
4341
def SST2(root, split):
42+
"""SST2 Dataset
43+
44+
For additional details refer to https://nlp.stanford.edu/sentiment/
45+
46+
Number of lines per split:
47+
- train: 67349
48+
- dev: 872
49+
- test: 1821
50+
51+
Args:
52+
root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')
53+
split: split or splits to be returned. Can be a string or tuple of strings. Default: ('train', 'dev', 'test')
54+
55+
:returns: DataPipe that yields tuple of text and/or label (0 to 1). The `test` split only returns text.
56+
:rtype: Union[(str, int), (str,)]
57+
"""
4458
# TODO Remove this after removing conditional dependency
4559
if not is_module_available("torchdata"):
4660
raise ModuleNotFoundError(

torchtext/datasets/udpos.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,9 @@ def UDPOS(root: str, split: Union[Tuple[str], str]):
3333
"""UDPOS Dataset
3434
3535
Number of lines per split:
36-
train: 12543
37-
38-
valid: 2002
39-
40-
test: 2077
36+
- train: 12543
37+
- valid: 2002
38+
- test: 2077
4139
4240
Args:
4341
root: Directory where the datasets are saved. Default: os.path.expanduser('~/.torchtext/cache')

0 commit comments

Comments
 (0)