
Commit 8b58a22

Add CCI cache for test data (#748)
* Add .vector_cache to cache
* Bust weekly
* Remove slow from `test_vocab`
* Remove slow from dataset test
* Add .data to cache
* Remove slow utils
* Make test data cache key common to all CI jobs
1 parent 53f2108 commit 8b58a22

8 files changed: +60 additions, −63 deletions


.circleci/config.yml

Lines changed: 20 additions & 2 deletions
@@ -240,27 +240,45 @@ jobs:
     resource_class: 2xlarge+
     steps:
       - checkout
+      - run:
+          name: Generate cache key
+          # This will refresh cache on Sundays, nightly build should generate new cache.
+          command: echo "$(date +"%Y-%U")" > .circleci-weekly
       - restore_cache:
 
           keys:
-            - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}
+            - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
 
       - run:
          name: Setup
          command: .circleci/unittest/scripts/setup_env.sh
       - save_cache:
 
-          key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}
+          key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
 
          paths:
            - conda
            - env
       - run:
          name: Install torchtext
          command: .circleci/unittest/scripts/install.sh
+      - restore_cache:
+          keys:
+
+            - data-v1-{{ checksum ".circleci-weekly" }}
+
       - run:
          name: Run tests
+          # Downloading embedding vector takes long time.
+          no_output_timeout: 30m
          command: .circleci/unittest/scripts/run_test.sh
+      - save_cache:
+
+          key: data-v1-{{ checksum ".circleci-weekly" }}
+
+          paths:
+            - .vector_cache
+            - .data
       - run:
          name: Post process
          command: .circleci/unittest/scripts/post_process.sh
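The moving part here is the `.circleci-weekly` file: it holds the current year and week number, so every cache key that embeds `{{ checksum ".circleci-weekly" }}` changes once a week and forces a fresh cache. A minimal Python sketch of the same `%Y-%U` format (the config itself uses the shell `date` command; this snippet is only an illustration):

    from datetime import date

    # "%Y-%U" is the year plus the zero-padded week number, counting
    # Sunday as the first day of the week, so the string rolls over
    # every Sunday and with it every cache key built from it.
    print(date(2020, 3, 7).strftime("%Y-%U"))  # 2020-09 (a Saturday)
    print(date(2020, 3, 8).strftime("%Y-%U"))  # 2020-10 (Sunday starts a new week)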

.circleci/config.yml.in

Lines changed: 20 additions & 2 deletions
@@ -240,27 +240,45 @@ jobs:
     resource_class: 2xlarge+
     steps:
       - checkout
+      - run:
+          name: Generate cache key
+          # This will refresh cache on Sundays, nightly build should generate new cache.
+          command: echo "$(date +"%Y-%U")" > .circleci-weekly
       - restore_cache:
           {% raw %}
           keys:
-            - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}
+            - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
           {% endraw %}
       - run:
          name: Setup
          command: .circleci/unittest/scripts/setup_env.sh
       - save_cache:
           {% raw %}
-          key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}
+          key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
           {% endraw %}
          paths:
            - conda
            - env
       - run:
          name: Install torchtext
          command: .circleci/unittest/scripts/install.sh
+      - restore_cache:
+          keys:
+          {% raw %}
+            - data-v1-{{ checksum ".circleci-weekly" }}
+          {% endraw %}
       - run:
          name: Run tests
+          # Downloading embedding vector takes long time.
+          no_output_timeout: 30m
          command: .circleci/unittest/scripts/run_test.sh
+      - save_cache:
+          {% raw %}
+          key: data-v1-{{ checksum ".circleci-weekly" }}
+          {% endraw %}
+          paths:
+            - .vector_cache
+            - .data
       - run:
          name: Post process
          command: .circleci/unittest/scripts/post_process.sh
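The only difference from config.yml is the `{% raw %}`/`{% endraw %}` guards: config.yml.in is a template that gets rendered into config.yml, and both the template engine and CircleCI use `{{ ... }}` syntax, so CircleCI's `{{ checksum ... }}` expressions must be protected from the templater. A small sketch of the effect, assuming a Jinja-compatible engine (the repo's actual generator script is not part of this diff):

    from jinja2 import Template

    # Inside {% raw %}, the templater leaves {{ ... }} untouched, so the
    # rendered config still contains the expression for CircleCI to evaluate.
    tmpl = Template('{% raw %}- data-v1-{{ checksum ".circleci-weekly" }}{% endraw %}')
    print(tmpl.render())  # - data-v1-{{ checksum ".circleci-weekly" }}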

.circleci/unittest/scripts/run_test.sh

Lines changed: 1 addition & 1 deletion
@@ -6,5 +6,5 @@ eval "$(./conda/bin/conda shell.bash hook)"
 conda activate ./env
 
 python -m torch.utils.collect_env
-pytest --cov=torchtext --junitxml=test-results/junit.xml -v test
+pytest --cov=torchtext --junitxml=test-results/junit.xml -v --durations 20 test
 flake8 torchtext test
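`--durations 20` makes pytest print the 20 slowest test phases at the end of a run, which makes it easy to verify that the new data cache is actually cutting download time. A toy, hypothetical test file just to show what the report looks like:

    import time

    def test_fast():
        assert True

    def test_slow():
        time.sleep(2)  # will top the durations report
        assert True

    # Running `pytest --durations 20 test_example.py` appends a section like:
    #   slowest 20 durations
    #   2.00s call     test_example.py::test_slow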

test/common/test_markers.py

Lines changed: 0 additions & 7 deletions
This file was deleted.

test/conftest.py

Lines changed: 0 additions & 3 deletions
This file was deleted.
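These two files implemented the `@slow` marker that the test diffs below stop importing; with the data now cached, the affected tests are fast enough to run on every CI job. The deleted contents aren't reproduced on this page, but a typical pytest setup for such a marker looks roughly like this (a sketch, not the verbatim deleted code; the RUN_SLOW variable is illustrative):

    # test/common/test_markers.py (hypothetical reconstruction)
    import os

    import pytest

    # Skip a test unless the run explicitly opts in to slow tests.
    slow = pytest.mark.skipif(
        not os.environ.get("RUN_SLOW"),
        reason="slow test; set RUN_SLOW=1 to run",
    )

test/conftest.py would then have held whatever shared configuration the marker needed; with no remaining users of `slow`, both files could go.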

test/data/test_builtin_datasets.py

Lines changed: 19 additions & 38 deletions
@@ -4,7 +4,6 @@
 from torchtext.datasets import AG_NEWS
 import torch
 from torch.testing import assert_allclose
-from ..common.test_markers import slow
 from ..common.torchtext_test_case import TorchtextTestCase
 
 
@@ -16,10 +15,16 @@ def conditional_remove(f):
 
 
 class TestDataset(TorchtextTestCase):
-    @slow
     def test_wikitext2_legacy(self):
         from torchtext.datasets import WikiText2
         # smoke test to ensure wikitext2 works properly
+
+        # NOTE
+        # test_wikitext2 and test_wikitext2_legacy have some cache incompatibility.
+        # Keeping one's cache make the other fail. So we need to clean up the cache dir
+        cachedir = os.path.join(self.project_root, ".data", "wikitext-2")
+        conditional_remove(cachedir)
+
         ds = WikiText2
         TEXT = data.Field(lower=True, batch_first=True)
         train, valid, test = ds.splits(TEXT)
@@ -30,13 +35,20 @@ def test_wikitext2_legacy(self):
         train_iter, valid_iter, test_iter = ds.iters(batch_size=4,
                                                      bptt_len=30)
 
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", "wikitext-2")
-        conditional_remove(datafile)
+        conditional_remove(cachedir)
 
     def test_wikitext2(self):
         from torchtext.experimental.datasets import WikiText2
         # smoke test to ensure wikitext2 works properly
+
+        # NOTE
+        # test_wikitext2 and test_wikitext2_legacy have some cache incompatibility.
+        # Keeping one's cache make the other fail. So we need to clean up the cache dir
+        cachedir = os.path.join(self.project_root, ".data", "wikitext-2")
+        conditional_remove(cachedir)
+        cachefile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip")
+        conditional_remove(cachefile)
+
         train_dataset, test_dataset, valid_dataset = WikiText2()
         self.assertEqual(len(train_dataset), 2049990)
         self.assertEqual(len(test_dataset), 241859)
@@ -46,13 +58,9 @@ def test_wikitext2(self):
         tokens_ids = [vocab[token] for token in 'the player characters rest'.split()]
         self.assertEqual(tokens_ids, [2, 286, 503, 700])
 
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", "wikitext-2")
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip")
-        conditional_remove(datafile)
+        conditional_remove(cachedir)
+        conditional_remove(cachefile)
 
-    @slow
     def test_penntreebank_legacy(self):
         from torchtext.datasets import PennTreebank
         # smoke test to ensure penn treebank works properly
@@ -66,10 +74,6 @@ def test_penntreebank_legacy(self):
         train_iter, valid_iter, test_iter = ds.iters(batch_size=4,
                                                      bptt_len=30)
 
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", "penn-treebank")
-        conditional_remove(datafile)
-
     def test_penntreebank(self):
         from torchtext.experimental.datasets import PennTreebank
         # smoke test to ensure wikitext2 works properly
@@ -82,14 +86,6 @@ def test_penntreebank(self):
         tokens_ids = [vocab[token] for token in 'the player characters rest'.split()]
         self.assertEqual(tokens_ids, [2, 2550, 3344, 1125])
 
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", 'ptb.train.txt')
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", 'ptb.test.txt')
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", 'ptb.valid.txt')
-        conditional_remove(datafile)
-
     def test_text_classification(self):
         # smoke test to ensure ag_news dataset works properly
 
@@ -104,13 +100,6 @@ def test_text_classification(self):
         assert_allclose(ag_news_test[-1][1][:10],
                         torch.tensor([2351, 758, 96, 38581, 2351, 220, 5, 396, 3, 14786]).long())
 
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", "ag_news_csv")
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz")
-        conditional_remove(datafile)
-
-    @slow
     def test_imdb(self):
         from torchtext.experimental.datasets import IMDB
         from torchtext.vocab import Vocab
@@ -131,11 +120,3 @@ def test_imdb(self):
         old_vocab = train_dataset.get_vocab()
         new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
         new_train_data, new_test_data = IMDB(vocab=new_vocab)
-
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", "imdb")
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", "aclImdb")
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", "aclImdb_v1.tar.gz")
-        conditional_remove(datafile)
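The tests above lean on a `conditional_remove` helper defined earlier in this file (its body sits outside the diff context). Judging from how it is called on both plain files and directories, a plausible sketch is:

    import os
    import shutil

    def conditional_remove(f):
        # Delete a file or a directory tree; quietly do nothing if absent.
        if os.path.isfile(f):
            os.remove(f)
        elif os.path.isdir(f):
            shutil.rmtree(f)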

test/data/test_field.py

Lines changed: 0 additions & 2 deletions
@@ -9,7 +9,6 @@
 from torch.nn import init
 
 from ..common.torchtext_test_case import TorchtextTestCase, verify_numericalized_example
-from ..common.test_markers import slow
 
 
 class TestField(TorchtextTestCase):
@@ -866,7 +865,6 @@ def test_serialization(self):
 
         assert torch.all(torch.eq(original_numericalization, pickled_numericalization))
 
-    @slow
     def test_build_vocab(self):
         nesting_field = data.Field(tokenize=list, init_token="<w>", eos_token="</w>")
 
test/test_vocab.py

Lines changed: 0 additions & 8 deletions
@@ -10,7 +10,6 @@
 from torchtext import vocab
 from torchtext.vocab import Vectors, FastText, GloVe, CharNGram
 
-from .common.test_markers import slow
 from .common.torchtext_test_case import TorchtextTestCase
 
 
@@ -93,7 +92,6 @@ def test_vocab_set_vectors(self):
                                      [0.3, 0.4]])
         assert_allclose(v.vectors.numpy(), expected_vectors)
 
-    @slow
     def test_vocab_download_fasttext_vectors(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
         # Build a vocab and get vectors twice to test caching, then once more
@@ -131,7 +129,6 @@ def test_vocab_download_fasttext_vectors(self):
         vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
         conditional_remove(vec_file)
 
-    @slow
     def test_vocab_extend(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
         # Build a vocab and get vectors twice to test caching.
@@ -163,7 +160,6 @@ def test_vocab_extend(self):
         vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
         conditional_remove(vec_file)
 
-    @slow
     def test_vocab_download_custom_vectors(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
         # Build a vocab and get vectors twice to test caching.
@@ -192,7 +188,6 @@ def test_vocab_download_custom_vectors(self):
         vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
         conditional_remove(vec_file)
 
-    @slow
     def test_vocab_vectors_custom_cache(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
         vector_cache = os.path.join('/tmp', 'vector_cache')
@@ -225,7 +220,6 @@ def test_vocab_vectors_custom_cache(self):
         vec_file = os.path.join(vector_cache, "wiki.simple.vec")
         conditional_remove(vec_file)
 
-    @slow
     def test_vocab_download_glove_vectors(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
 
@@ -268,7 +262,6 @@ def test_vocab_download_glove_vectors(self):
         conditional_remove(os.path.join(self.project_root, ".vector_cache",
                                         "glove.twitter.27B.{}d.txt".format(dim)))
 
-    @slow
     def test_vocab_download_charngram_vectors(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
         # Build a vocab and get vectors twice to test caching, then once more
@@ -343,7 +336,6 @@ def test_serialization_backcompat(self):
         v_loaded = pickle.load(open(pickle_path, "rb"))
         assert v == v_loaded
 
-    @slow
     def test_vectors_get_vecs(self):
         vec = GloVe(name='twitter.27B', dim='25')
         self.assertEqual(vec.vectors.shape[0], len(vec))
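These vector tests are the reason `.vector_cache` is worth caching: pretrained vector classes download their archives into that directory on first use and reload them from disk afterwards. A minimal illustration mirroring the `test_vectors_get_vecs` call above (note the twitter.27B archive is large, on the order of a gigabyte):

    from torchtext.vocab import GloVe

    # The first call downloads glove.twitter.27B into .vector_cache;
    # subsequent runs (and cache-restored CI jobs) skip the download.
    vec = GloVe(name='twitter.27B', dim='25')
    print(vec.vectors.shape)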
