pytorch · mthrok · May 15, 2020 · May 15, 2020 · cpuhrsch · May 15, 2020
diff --git a/test/common/assets.py b/test/common/assets.py
@@ -0,0 +1,32 @@
+import os
+import shutil
+import atexit
+import tempfile
+from pathlib import Path
+
+_ASSET_DIR = (Path(__file__).parent.parent / "asset").resolve()
+
+_TEMP_DIR = None
+
+
+def _init_temp_dir():
+    """Create the shared temporary directory and schedule its removal at interpreter exit."""
+    global _TEMP_DIR
+    temp_dir = _TEMP_DIR = tempfile.TemporaryDirectory()  # noqa
+    atexit.register(temp_dir.cleanup)
+
+
+def get_asset_path(*path_components, use_temp_dir=False):
+    """Return the path to a file under the `test/asset` directory.
+
+    When `use_temp_dir` is True, the asset is copied into a shared temporary directory
+    (cleaned up at interpreter exit) and the path to that copy is returned instead.
+    """
+    path = str(_ASSET_DIR.joinpath(*path_components))
+    if not use_temp_dir:
+        return path
+    if _TEMP_DIR is None:
+        _init_temp_dir()
+    tgt = os.path.join(_TEMP_DIR.name, os.path.basename(path))  # flatten: components may contain subdirectories
+    shutil.copy(path, tgt)
+    return tgt
diff --git a/test/data/test_functional.py b/test/data/test_functional.py
@@ -15,13 +15,19 @@
 )
 
 from ..common.torchtext_test_case import TorchtextTestCase
+from ..common.assets import get_asset_path
 
 
 class TestFunctional(TorchtextTestCase):
     def test_generate_sp_model(self):
         # Test the function to train a sentencepiece tokenizer
 
-        data_path = 'test/asset/text_normalization_ag_news_test.csv'
+        # buck (fb internal) generates a test environment whose path contains ','.
+        # SentencePieceTrainer treats such a path as a comma-delimited file list.
+        # As a workaround, we copy the asset data to a temporary directory and load it from there.
+        data_path = get_asset_path(
+            'text_normalization_ag_news_test.csv',
+            use_temp_dir=True)
         generate_sp_model(data_path,
                           vocab_size=23456,
                           model_prefix='spm_user')
@@ -38,7 +44,7 @@ def test_generate_sp_model(self):
 
     def test_sentencepiece_numericalizer(self):
         test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
-        model_path = 'test/asset/spm_example.model'
+        model_path = get_asset_path('spm_example.model')
         sp_model = load_sp_model(model_path)
         self.assertEqual(sp_model.GetPieceSize(), 20000)
         spm_generator = sentencepiece_numericalizer(sp_model)
@@ -52,7 +58,7 @@ def test_sentencepiece_numericalizer(self):
     def test_sentencepiece_tokenizer(self):
 
         test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
-        model_path = 'test/asset/spm_example.model'
+        model_path = get_asset_path('spm_example.model')
         sp_model = load_sp_model(model_path)
         self.assertEqual(sp_model.GetPieceSize(), 20000)
         spm_generator = sentencepiece_tokenizer(sp_model)
@@ -99,7 +105,7 @@ def encode_as_pieces(self, input: str):
 
 class TestScriptableSP(unittest.TestCase):
     def setUp(self):
-        model_path = 'test/asset/spm_example.model'
+        model_path = get_asset_path('spm_example.model')
         with tempfile.NamedTemporaryFile() as file:
             torch.jit.script(ScriptableSP(model_path)).save(file.name)
             self.model = torch.jit.load(file.name)

diff --git a/test/data/test_utils.py b/test/data/test_utils.py
@@ -1,8 +1,11 @@
+import io
+import unittest
+
 import torchtext.data as data
-import pytest
-from ..common.torchtext_test_case import TorchtextTestCase
 from torchtext.utils import unicode_csv_reader
-import io
+
+from ..common.torchtext_test_case import TorchtextTestCase
+from ..common.assets import get_asset_path
 
 
 class TestUtils(TorchtextTestCase):
@@ -21,8 +24,7 @@ def test_get_tokenizer_spacy(self):
 
     # TODO: Remove this once issue was been resolved.
     # TODO# Add nltk data back in build_tools/travis/install.sh.
-    @pytest.mark.skip(reason=("Impractically slow! "
-                              "https://github.com/alvations/sacremoses/issues/61"))
+    @unittest.skip("Impractically slow! https://github.com/alvations/sacremoses/issues/61")
     def test_get_tokenizer_moses(self):
         # Test Moses option.
         # Note that internally, MosesTokenizer converts to unicode if applicable
@@ -54,13 +56,13 @@ def test_text_nomalize_function(self):
         test_lines = []
 
         tokenizer = data.get_tokenizer("basic_english")
-        data_path = 'test/asset/text_normalization_ag_news_test.csv'
+        data_path = get_asset_path('text_normalization_ag_news_test.csv')
         with io.open(data_path, encoding="utf8") as f:
             reader = unicode_csv_reader(f)
             for row in reader:
                 test_lines.append(tokenizer(' , '.join(row)))
 
-        data_path = 'test/asset/text_normalization_ag_news_ref_results.test'
+        data_path = get_asset_path('text_normalization_ag_news_ref_results.test')
         with io.open(data_path, encoding="utf8") as ref_data:
             for line in ref_data:
                 line = line.split()