Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 90fd332

Browse files
authored
Modified experimental vocab factory functions API (#1286)
1 parent fab63ed commit 90fd332

File tree

3 files changed

+36
-44
lines changed

3 files changed

+36
-44
lines changed

test/experimental/test_vocab.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def test_vocab_load_and_save(self):
219219

220220
def test_build_vocab_iterator(self):
221221
iterator = [['hello', 'hello', 'hello', 'freq_low', 'hello', 'world', 'world', 'world', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T',
222-
'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
222+
'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'freq_low', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T']]
223223
v = build_vocab_from_iterator(iterator)
224224
expected_itos = ['<unk>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', 'freq_low']
225225
expected_stoi = {x: index for index, x in enumerate(expected_itos)}

test/experimental/test_with_asset.py

Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,12 @@ class TestTransformsWithAsset(TorchtextTestCase):
7878
def test_vocab_transform(self):
7979
asset_name = 'vocab_test2.txt'
8080
asset_path = get_asset_path(asset_name)
81-
with open(asset_path, 'r') as f:
82-
vocab_transform = VocabTransform(load_vocab_from_file(f))
83-
self.assertEqual(vocab_transform(['of', 'that', 'new']),
84-
[7, 18, 24])
85-
jit_vocab_transform = torch.jit.script(vocab_transform)
86-
self.assertEqual(jit_vocab_transform(['of', 'that', 'new', 'that']),
87-
[7, 18, 24, 18])
81+
vocab_transform = VocabTransform(load_vocab_from_file(asset_path))
82+
self.assertEqual(vocab_transform(['of', 'that', 'new']),
83+
[7, 18, 24])
84+
jit_vocab_transform = torch.jit.script(vocab_transform)
85+
self.assertEqual(jit_vocab_transform(['of', 'that', 'new', 'that']),
86+
[7, 18, 24, 18])
8887

8988
def test_errors_vectors_python(self):
9089
tokens = []
@@ -179,27 +178,25 @@ def test_glove_different_dims(self):
179178
def test_vocab_from_file(self):
180179
asset_name = 'vocab_test.txt'
181180
asset_path = get_asset_path(asset_name)
182-
with open(asset_path, 'r') as f:
183-
v = load_vocab_from_file(f, unk_token='<new_unk>')
184-
expected_itos = ['<new_unk>', 'b', 'a', 'c']
185-
expected_stoi = {x: index for index, x in enumerate(expected_itos)}
186-
self.assertEqual(v.get_itos(), expected_itos)
187-
self.assertEqual(dict(v.get_stoi()), expected_stoi)
181+
v = load_vocab_from_file(asset_path, unk_token='<new_unk>')
182+
expected_itos = ['<new_unk>', 'b', 'a', 'c']
183+
expected_stoi = {x: index for index, x in enumerate(expected_itos)}
184+
self.assertEqual(v.get_itos(), expected_itos)
185+
self.assertEqual(dict(v.get_stoi()), expected_stoi)
188186

189187
def test_vocab_from_raw_text_file(self):
190188
asset_name = 'vocab_raw_text_test.txt'
191189
asset_path = get_asset_path(asset_name)
192-
with open(asset_path, 'r') as f:
193-
tokenizer = basic_english_normalize()
194-
jit_tokenizer = torch.jit.script(tokenizer)
195-
v = build_vocab_from_text_file(f, jit_tokenizer, unk_token='<new_unk>')
196-
expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
197-
'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
198-
'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
199-
'unions', 'with', 'workers']
200-
expected_stoi = {x: index for index, x in enumerate(expected_itos)}
201-
self.assertEqual(v.get_itos(), expected_itos)
202-
self.assertEqual(dict(v.get_stoi()), expected_stoi)
190+
tokenizer = basic_english_normalize()
191+
jit_tokenizer = torch.jit.script(tokenizer)
192+
v = build_vocab_from_text_file(asset_path, jit_tokenizer, unk_token='<new_unk>')
193+
expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
194+
'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
195+
'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
196+
'unions', 'with', 'workers']
197+
expected_stoi = {x: index for index, x in enumerate(expected_itos)}
198+
self.assertEqual(v.get_itos(), expected_itos)
199+
self.assertEqual(dict(v.get_stoi()), expected_stoi)
203200

204201
def test_builtin_pretrained_sentencepiece_processor(self):
205202
sp_model_path = download_from_url(PRETRAINED_SP_MODEL['text_unigram_25000'])
@@ -241,11 +238,10 @@ def batch_func(data):
241238
def test_text_sequential_transform(self):
242239
asset_name = 'vocab_test2.txt'
243240
asset_path = get_asset_path(asset_name)
244-
with open(asset_path, 'r') as f:
245-
pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(f))
246-
jit_pipeline = torch.jit.script(pipeline)
247-
self.assertEqual(pipeline('of that new'), [7, 18, 24])
248-
self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])
241+
pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(asset_path))
242+
jit_pipeline = torch.jit.script(pipeline)
243+
self.assertEqual(pipeline('of that new'), [7, 18, 24])
244+
self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])
249245

250246
def test_vectors_from_file(self):
251247
asset_name = 'vectors_test.csv'

torchtext/experimental/vocab.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,11 @@
1919
logger = logging.getLogger(__name__)
2020

2121

22-
def build_vocab_from_text_file(file_object, jited_tokenizer, min_freq=1, unk_token='<unk>', num_cpus=4):
22+
def build_vocab_from_text_file(file_path, jited_tokenizer, min_freq=1, unk_token='<unk>', num_cpus=4):
2323
r"""Create a `Vocab` object from a raw text file.
2424
25-
The `file_object` can contain any raw text. This function applies a generic JITed tokenizer in
26-
parallel to the text. Note that the vocab will be created in the order that the tokens first appear
27-
in the file (and not by the frequency of tokens).
25+
The file at `file_path` can contain any raw text. This function applies a generic JITed tokenizer in
26+
parallel to the text.
2827
2928
Args:
3029
file_path (str): path to the raw text file to read data from.
@@ -40,20 +39,18 @@ def build_vocab_from_text_file(file_object, jited_tokenizer, min_freq=1, unk_tok
4039
Examples:
4140
>>> from torchtext.experimental.vocab import build_vocab_from_text_file
4241
>>> from torchtext.experimental.transforms import basic_english_normalize
43-
>>> f = open('vocab.txt', 'r')
44-
>>> tokenizer = basic_english_normalize()
42+
>>> tokenizer = basic_english_normalize()
4543
>>> tokenizer = basic_english_normalize()
4644
>>> jit_tokenizer = torch.jit.script(tokenizer)
47-
>>> v = build_vocab_from_text_file(f, jit_tokenizer)
45+
>>> v = build_vocab_from_text_file('vocab.txt', jit_tokenizer)
4846
"""
49-
vocab_obj = _build_vocab_from_text_file(file_object.name, unk_token, min_freq, num_cpus, jited_tokenizer)
47+
vocab_obj = _build_vocab_from_text_file(file_path, unk_token, min_freq, num_cpus, jited_tokenizer)
5048
return Vocab(vocab_obj)
5149

5250

53-
def load_vocab_from_file(file_object, min_freq=1, unk_token='<unk>', num_cpus=4):
51+
def load_vocab_from_file(file_path, min_freq=1, unk_token='<unk>', num_cpus=4):
5452
r"""Create a `Vocab` object from a text file.
55-
The `file_object` should contain tokens separated by new lines. Note that the vocab
56-
will be created in the order that the tokens first appear in the file (and not by the frequency of tokens).
53+
The file at `file_path` should contain tokens separated by new lines.
5754
Format for txt file:
5855
5956
token1
@@ -73,11 +70,10 @@ def load_vocab_from_file(file_object, min_freq=1, unk_token='<unk>', num_cpus=4)
7370
7471
Examples:
7572
>>> from torchtext.experimental.vocab import load_vocab_from_file
76-
>>> f = open('vocab.txt', 'r')
77-
>>> v = load_vocab_from_file(f)
73+
>>> v = load_vocab_from_file('vocab.txt')
7874
"""
7975

80-
vocab_obj = _load_vocab_from_file(file_object.name, unk_token, min_freq, num_cpus)
76+
vocab_obj = _load_vocab_from_file(file_path, unk_token, min_freq, num_cpus)
8177
return Vocab(vocab_obj)
8278

8379

0 commit comments

Comments
 (0)