From 8076ed142b03871315cc3f54ee885364e2141f41 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 18:01:10 +0000 Subject: [PATCH 01/19] Add CCI cache for .vector_cache directory --- .circleci/config.yml | 7 +++++++ .circleci/config.yml.in | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index c44a1bcdd2..623573b955 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -258,9 +258,16 @@ jobs: - run: name: Install torchtext command: .circleci/unittest/scripts/install.sh + - restore_cache: + keys: + - vector-cache-v1 - run: name: Run tests command: .circleci/unittest/scripts/run_test.sh + - save_cache: + key: vector-cache-v1 + paths: + - .vector_cache - run: name: Post process command: .circleci/unittest/scripts/post_process.sh diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index c86dbbeca1..832f3b452c 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -258,9 +258,16 @@ jobs: - run: name: Install torchtext command: .circleci/unittest/scripts/install.sh + - restore_cache: + keys: + - vector-cache-v1 - run: name: Run tests command: .circleci/unittest/scripts/run_test.sh + - save_cache: + key: vector-cache-v1 + paths: + - .vector_cache - run: name: Post process command: .circleci/unittest/scripts/post_process.sh From 7ef6eba5c9d21a11dd5e98a34f128b29f5266913 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 18:18:59 +0000 Subject: [PATCH 02/19] Bust weekly --- .circleci/config.yml | 14 ++++++++++++-- .circleci/config.yml.in | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 623573b955..0efba949c7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -258,14 +258,24 @@ jobs: - run: name: Install torchtext command: .circleci/unittest/scripts/install.sh + - run: + name: Generate cache key + 
# This will refresh cache on Sundays, nightly build would refresh the cache. + command: echo "$(date +"%Y-%U")" > .circle-week - restore_cache: keys: - - vector-cache-v1 + # NOTE: remove .Branch once it's ready + + - vector-cache-v1-{{ .Branch }}-{{ checksum ".circle-week" }} + - run: name: Run tests command: .circleci/unittest/scripts/run_test.sh - save_cache: - key: vector-cache-v1 + # NOTE: remove .Branch once it's ready + + key: vector-cache-v1-{{ .Branch }}-{{ checksum ".circle-week" }} + paths: - .vector_cache - run: diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 832f3b452c..1ee06ddbea 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -258,14 +258,24 @@ jobs: - run: name: Install torchtext command: .circleci/unittest/scripts/install.sh + - run: + name: Generate cache key + # This will refresh cache on Sundays, nightly build would refresh the cache. + command: echo "$(date +"%Y-%U")" > .circle-week - restore_cache: keys: - - vector-cache-v1 + # NOTE: remove .Branch once it's ready + {% raw %} + - vector-cache-v1-{{ .Branch }}-{{ checksum ".circle-week" }} + {% endraw %} - run: name: Run tests command: .circleci/unittest/scripts/run_test.sh - save_cache: - key: vector-cache-v1 + # NOTE: remove .Branch once it's ready + {% raw %} + key: vector-cache-v1-{{ .Branch }}-{{ checksum ".circle-week" }} + {% endraw %} paths: - .vector_cache - run: From 36972cd704368b8c85df76abdb60ec9cd27ed4c3 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 18:24:46 +0000 Subject: [PATCH 03/19] Remove slow from test_vocab --- test/test_vocab.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/test/test_vocab.py b/test/test_vocab.py index aad82de57c..a083025bd7 100644 --- a/test/test_vocab.py +++ b/test/test_vocab.py @@ -10,7 +10,6 @@ from torchtext import vocab from torchtext.vocab import Vectors, FastText, GloVe, CharNGram -from .common.test_markers import slow from 
.common.torchtext_test_case import TorchtextTestCase @@ -93,7 +92,6 @@ def test_vocab_set_vectors(self): [0.3, 0.4]]) assert_allclose(v.vectors.numpy(), expected_vectors) - @slow def test_vocab_download_fasttext_vectors(self): c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) # Build a vocab and get vectors twice to test caching, then once more @@ -131,7 +129,6 @@ def test_vocab_download_fasttext_vectors(self): vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec") conditional_remove(vec_file) - @slow def test_vocab_extend(self): c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) # Build a vocab and get vectors twice to test caching. @@ -163,7 +160,6 @@ def test_vocab_extend(self): vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec") conditional_remove(vec_file) - @slow def test_vocab_download_custom_vectors(self): c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) # Build a vocab and get vectors twice to test caching. 
@@ -192,7 +188,6 @@ def test_vocab_download_custom_vectors(self): vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec") conditional_remove(vec_file) - @slow def test_vocab_vectors_custom_cache(self): c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) vector_cache = os.path.join('/tmp', 'vector_cache') @@ -225,7 +220,6 @@ def test_vocab_vectors_custom_cache(self): vec_file = os.path.join(vector_cache, "wiki.simple.vec") conditional_remove(vec_file) - @slow def test_vocab_download_glove_vectors(self): c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) @@ -268,7 +262,6 @@ def test_vocab_download_glove_vectors(self): conditional_remove(os.path.join(self.project_root, ".vector_cache", "glove.twitter.27B.{}d.txt".format(dim))) - @slow def test_vocab_download_charngram_vectors(self): c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2}) # Build a vocab and get vectors twice to test caching, then once more @@ -343,7 +336,6 @@ def test_serialization_backcompat(self): v_loaded = pickle.load(open(pickle_path, "rb")) assert v == v_loaded - @slow def test_vectors_get_vecs(self): vec = GloVe(name='twitter.27B', dim='25') self.assertEqual(vec.vectors.shape[0], len(vec)) From e7125afb41ff25df5fe10d883f3573813ad2bff0 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 18:25:32 +0000 Subject: [PATCH 04/19] Change cache key --- .circleci/config.yml | 4 ++-- .circleci/config.yml.in | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0efba949c7..cab1c13824 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -266,7 +266,7 @@ jobs: keys: # NOTE: remove .Branch once it's ready - - vector-cache-v1-{{ .Branch }}-{{ checksum ".circle-week" }} + - vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} - run: name: Run tests @@ -274,7 +274,7 @@ jobs: - save_cache: # 
NOTE: remove .Branch once it's ready - key: vector-cache-v1-{{ .Branch }}-{{ checksum ".circle-week" }} + key: vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} paths: - .vector_cache diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 1ee06ddbea..6d42d650a6 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -266,7 +266,7 @@ jobs: keys: # NOTE: remove .Branch once it's ready {% raw %} - - vector-cache-v1-{{ .Branch }}-{{ checksum ".circle-week" }} + - vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} {% endraw %} - run: name: Run tests @@ -274,7 +274,7 @@ jobs: - save_cache: # NOTE: remove .Branch once it's ready {% raw %} - key: vector-cache-v1-{{ .Branch }}-{{ checksum ".circle-week" }} + key: vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} {% endraw %} paths: - .vector_cache From 143eabb2ed447bcd770912d2899efd0f743ee2ea Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 18:40:34 +0000 Subject: [PATCH 05/19] Set timeout 30m --- .circleci/config.yml | 1 + .circleci/config.yml.in | 1 + 2 files changed, 2 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index cab1c13824..29c6890c56 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -270,6 +270,7 @@ jobs: - run: name: Run tests + no_output_timeout: 30m command: .circleci/unittest/scripts/run_test.sh - save_cache: # NOTE: remove .Branch once it's ready diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 6d42d650a6..df69e31cb8 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -270,6 +270,7 @@ jobs: {% endraw %} - run: name: Run tests + no_output_timeout: 30m command: .circleci/unittest/scripts/run_test.sh - save_cache: # NOTE: remove .Branch once it's ready From 8820a85c8edb273c6794f98e90a2e253ee79da5e Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 19:16:04 +0000 Subject: [PATCH 
06/19] Rerun test --- .circleci/config.yml | 1 + .circleci/config.yml.in | 1 + .circleci/unittest/scripts/run_test.sh | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 29c6890c56..ad423cc089 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -270,6 +270,7 @@ jobs: - run: name: Run tests + # Downloading embedding vector takes long time. no_output_timeout: 30m command: .circleci/unittest/scripts/run_test.sh - save_cache: diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index df69e31cb8..187d8cfc4f 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -270,6 +270,7 @@ jobs: {% endraw %} - run: name: Run tests + # Downloading embedding vector takes long time. no_output_timeout: 30m command: .circleci/unittest/scripts/run_test.sh - save_cache: diff --git a/.circleci/unittest/scripts/run_test.sh b/.circleci/unittest/scripts/run_test.sh index 19649530b5..1eea12cfb1 100755 --- a/.circleci/unittest/scripts/run_test.sh +++ b/.circleci/unittest/scripts/run_test.sh @@ -6,5 +6,5 @@ eval "$(./conda/bin/conda shell.bash hook)" conda activate ./env python -m torch.utils.collect_env -pytest --cov=torchtext --junitxml=test-results/junit.xml -v test +pytest --cov=torchtext --junitxml=test-results/junit.xml -v --durations 20 test flake8 torchtext test From 605a17f70ebea2964ea9cbbce7acf865cf3834af Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 19:33:47 +0000 Subject: [PATCH 07/19] Remove slow from dataset test --- .circleci/config.yml | 4 ++-- .circleci/config.yml.in | 4 ++-- test/data/test_builtin_datasets.py | 12 +++++------- test/data/test_field.py | 1 - 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ad423cc089..228084f46c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -266,7 +266,7 @@ jobs: keys: # NOTE: remove .Branch once it's ready - - 
vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} + - vector-cache-v3-{{ .Branch }}-{{ checksum ".circle-week" }} - run: name: Run tests @@ -276,7 +276,7 @@ jobs: - save_cache: # NOTE: remove .Branch once it's ready - key: vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} + key: vector-cache-v3-{{ .Branch }}-{{ checksum ".circle-week" }} paths: - .vector_cache diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 187d8cfc4f..2b801dd7f0 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -266,7 +266,7 @@ jobs: keys: # NOTE: remove .Branch once it's ready {% raw %} - - vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} + - vector-cache-v3-{{ .Branch }}-{{ checksum ".circle-week" }} {% endraw %} - run: name: Run tests @@ -276,7 +276,7 @@ jobs: - save_cache: # NOTE: remove .Branch once it's ready {% raw %} - key: vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} + key: vector-cache-v3-{{ .Branch }}-{{ checksum ".circle-week" }} {% endraw %} paths: - .vector_cache diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index b2723949b2..cdefa707a5 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -9,14 +9,14 @@ def conditional_remove(f): - if os.path.isfile(f): - os.remove(f) - elif os.path.isdir(f): - shutil.rmtree(f) + if os.environ.get("TRAVIS") == "true": + if os.path.isfile(f): + os.remove(f) + elif os.path.isdir(f): + shutil.rmtree(f) class TestDataset(TorchtextTestCase): - @slow def test_wikitext2_legacy(self): from torchtext.datasets import WikiText2 # smoke test to ensure wikitext2 works properly @@ -52,7 +52,6 @@ def test_wikitext2(self): datafile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip") conditional_remove(datafile) - @slow def test_penntreebank_legacy(self): from torchtext.datasets import PennTreebank # smoke test to ensure penn treebank works properly @@ -110,7 +109,6 @@ def 
test_text_classification(self): datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz") conditional_remove(datafile) - @slow def test_imdb(self): from torchtext.experimental.datasets import IMDB from torchtext.vocab import Vocab diff --git a/test/data/test_field.py b/test/data/test_field.py index 14fe043776..c20e1033ce 100644 --- a/test/data/test_field.py +++ b/test/data/test_field.py @@ -866,7 +866,6 @@ def test_serialization(self): assert torch.all(torch.eq(original_numericalization, pickled_numericalization)) - @slow def test_build_vocab(self): nesting_field = data.Field(tokenize=list, init_token="", eos_token="") From 775d260da778719554fe447b823b24546c9c7e20 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 19:35:10 +0000 Subject: [PATCH 08/19] Add .data to cache --- .circleci/config.yml.in | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 2b801dd7f0..30cae3bec5 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -280,6 +280,7 @@ jobs: {% endraw %} paths: - .vector_cache + - .data - run: name: Post process command: .circleci/unittest/scripts/post_process.sh From 0e56dae6c854ad6766148ac0fed19d5e106fd361 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 20:04:40 +0000 Subject: [PATCH 09/19] Fix flake8 --- test/data/test_builtin_datasets.py | 1 - test/data/test_field.py | 1 - 2 files changed, 2 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index cdefa707a5..c4b87419e1 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -4,7 +4,6 @@ from torchtext.datasets import AG_NEWS import torch from torch.testing import assert_allclose -from ..common.test_markers import slow from ..common.torchtext_test_case import TorchtextTestCase diff --git a/test/data/test_field.py b/test/data/test_field.py 
index c20e1033ce..385412b26f 100644 --- a/test/data/test_field.py +++ b/test/data/test_field.py @@ -9,7 +9,6 @@ from torch.nn import init from ..common.torchtext_test_case import TorchtextTestCase, verify_numericalized_example -from ..common.test_markers import slow class TestField(TorchtextTestCase): From 036f7cb0e41062f981a56c3fa35c713e0ffa85ea Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 20:05:37 +0000 Subject: [PATCH 10/19] Using the previous cache, add a new cache on top of it --- .circleci/config.yml | 8 ++++++-- .circleci/config.yml.in | 7 +++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 228084f46c..db034f47f1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -266,13 +266,16 @@ jobs: keys: # NOTE: remove .Branch once it's ready - - vector-cache-v3-{{ .Branch }}-{{ checksum ".circle-week" }} + - vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} - run: name: Run tests # Downloading embedding vector takes long time. no_output_timeout: 30m - command: .circleci/unittest/scripts/run_test.sh + command: | + rm -rf .data + ls -alh . + .circleci/unittest/scripts/run_test.sh - save_cache: # NOTE: remove .Branch once it's ready @@ -280,6 +283,7 @@ jobs: paths: - .vector_cache + - .data - run: name: Post process command: .circleci/unittest/scripts/post_process.sh diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 30cae3bec5..0ffd1374d3 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -266,13 +266,16 @@ jobs: keys: # NOTE: remove .Branch once it's ready {% raw %} - - vector-cache-v3-{{ .Branch }}-{{ checksum ".circle-week" }} + - vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} {% endraw %} - run: name: Run tests # Downloading embedding vector takes long time. no_output_timeout: 30m - command: .circleci/unittest/scripts/run_test.sh + command: | + rm -rf .data + ls -alh . 
+ .circleci/unittest/scripts/run_test.sh - save_cache: # NOTE: remove .Branch once it's ready {% raw %} From b880d531f62f22b5b25b72766a15aeed5adfcad0 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 20:17:25 +0000 Subject: [PATCH 11/19] Fix --- .circleci/config.yml | 21 ++++---- .circleci/config.yml.in | 21 ++++---- test/data/test_builtin_datasets.py | 82 +++++++++++++++++------------- 3 files changed, 65 insertions(+), 59 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index db034f47f1..64ce4fd4cb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -240,17 +240,21 @@ jobs: resource_class: 2xlarge+ steps: - checkout + - run: + name: Generate cache key + # This will refresh cache on Sundays, nightly build should refresh the cache. + command: echo "$(date +"%Y-%U")" > .circleci-weekly - restore_cache: keys: - - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }} + - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - run: name: Setup command: .circleci/unittest/scripts/setup_env.sh - save_cache: - key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }} + key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} paths: - conda @@ -258,28 +262,21 @@ jobs: - run: name: Install torchtext command: .circleci/unittest/scripts/install.sh - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build would refresh the cache. 
- command: echo "$(date +"%Y-%U")" > .circle-week - restore_cache: keys: # NOTE: remove .Branch once it's ready - - vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} + - vector-cache-v2-{{ .Branch }}-{{ checksum ".circleci-weekly" }} - run: name: Run tests # Downloading embedding vector takes long time. no_output_timeout: 30m - command: | - rm -rf .data - ls -alh . - .circleci/unittest/scripts/run_test.sh + command: .circleci/unittest/scripts/run_test.sh - save_cache: # NOTE: remove .Branch once it's ready - key: vector-cache-v3-{{ .Branch }}-{{ checksum ".circle-week" }} + key: vector-cache-v3-{{ .Branch }}-{{ checksum ".circleci-weekly" }} paths: - .vector_cache diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 0ffd1374d3..d5aa1132b9 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -240,17 +240,21 @@ jobs: resource_class: 2xlarge+ steps: - checkout + - run: + name: Generate cache key + # This will refresh cache on Sundays, nightly build should refresh the cache. 
+ command: echo "$(date +"%Y-%U")" > .circleci-weekly - restore_cache: {% raw %} keys: - - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }} + - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} {% endraw %} - run: name: Setup command: .circleci/unittest/scripts/setup_env.sh - save_cache: {% raw %} - key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }} + key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} {% endraw %} paths: - conda @@ -258,28 +262,21 @@ jobs: - run: name: Install torchtext command: .circleci/unittest/scripts/install.sh - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build would refresh the cache. - command: echo "$(date +"%Y-%U")" > .circle-week - restore_cache: keys: # NOTE: remove .Branch once it's ready {% raw %} - - vector-cache-v2-{{ .Branch }}-{{ checksum ".circle-week" }} + - vector-cache-v2-{{ .Branch }}-{{ checksum ".circleci-weekly" }} {% endraw %} - run: name: Run tests # Downloading embedding vector takes long time. no_output_timeout: 30m - command: | - rm -rf .data - ls -alh . 
- .circleci/unittest/scripts/run_test.sh + command: .circleci/unittest/scripts/run_test.sh - save_cache: # NOTE: remove .Branch once it's ready {% raw %} - key: vector-cache-v3-{{ .Branch }}-{{ checksum ".circle-week" }} + key: vector-cache-v3-{{ .Branch }}-{{ checksum ".circleci-weekly" }} {% endraw %} paths: - .vector_cache diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index c4b87419e1..eee741e947 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -8,17 +8,22 @@ def conditional_remove(f): - if os.environ.get("TRAVIS") == "true": - if os.path.isfile(f): - os.remove(f) - elif os.path.isdir(f): - shutil.rmtree(f) + if os.path.isfile(f): + os.remove(f) + elif os.path.isdir(f): + shutil.rmtree(f) class TestDataset(TorchtextTestCase): def test_wikitext2_legacy(self): from torchtext.datasets import WikiText2 # smoke test to ensure wikitext2 works properly + + # NOTE test_wikitext2 and test_wikitext2_legacy have some cache incompatibility, + # and keeping one's data make the other fail. So we need to clean up the cache dir + cachedir = os.path.join(self.project_root, ".data", "wikitext-2") + conditional_remove(cachedir) + ds = WikiText2 TEXT = data.Field(lower=True, batch_first=True) train, valid, test = ds.splits(TEXT) @@ -29,13 +34,19 @@ def test_wikitext2_legacy(self): train_iter, valid_iter, test_iter = ds.iters(batch_size=4, bptt_len=30) - # Delete the dataset after we're done to save disk space on CI - datafile = os.path.join(self.project_root, ".data", "wikitext-2") - conditional_remove(datafile) + conditional_remove(cachedir) def test_wikitext2(self): from torchtext.experimental.datasets import WikiText2 # smoke test to ensure wikitext2 works properly + + # NOTE test_wikitext2 and test_wikitext2_legacy have some cache incompatibility, + # and keeping one's data make the other fail. 
So we need to clean up the cache dir + cachedir = os.path.join(self.project_root, ".data", "wikitext-2") + conditional_remove(cachedir) + cachefile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip") + conditional_remove(cachefile) + train_dataset, test_dataset, valid_dataset = WikiText2() self.assertEqual(len(train_dataset), 2049990) self.assertEqual(len(test_dataset), 241859) @@ -45,11 +56,8 @@ def test_wikitext2(self): tokens_ids = [vocab[token] for token in 'the player characters rest'.split()] self.assertEqual(tokens_ids, [2, 286, 503, 700]) - # Delete the dataset after we're done to save disk space on CI - datafile = os.path.join(self.project_root, ".data", "wikitext-2") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip") - conditional_remove(datafile) + conditional_remove(cachedir) + conditional_remove(cachefile) def test_penntreebank_legacy(self): from torchtext.datasets import PennTreebank @@ -64,9 +72,10 @@ def test_penntreebank_legacy(self): train_iter, valid_iter, test_iter = ds.iters(batch_size=4, bptt_len=30) - # Delete the dataset after we're done to save disk space on CI - datafile = os.path.join(self.project_root, ".data", "penn-treebank") - conditional_remove(datafile) + if os.environ.get("TRAVIS") == "true": + # Delete the dataset after we're done to save disk space on CI + datafile = os.path.join(self.project_root, ".data", "penn-treebank") + conditional_remove(datafile) def test_penntreebank(self): from torchtext.experimental.datasets import PennTreebank @@ -80,13 +89,14 @@ def test_penntreebank(self): tokens_ids = [vocab[token] for token in 'the player characters rest'.split()] self.assertEqual(tokens_ids, [2, 2550, 3344, 1125]) - # Delete the dataset after we're done to save disk space on CI - datafile = os.path.join(self.project_root, ".data", 'ptb.train.txt') - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", 'ptb.test.txt') - 
conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", 'ptb.valid.txt') - conditional_remove(datafile) + if os.environ.get("TRAVIS") == "true": + # Delete the dataset after we're done to save disk space on CI + datafile = os.path.join(self.project_root, ".data", 'ptb.train.txt') + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", 'ptb.test.txt') + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", 'ptb.valid.txt') + conditional_remove(datafile) def test_text_classification(self): # smoke test to ensure ag_news dataset works properly @@ -102,11 +112,12 @@ def test_text_classification(self): assert_allclose(ag_news_test[-1][1][:10], torch.tensor([2351, 758, 96, 38581, 2351, 220, 5, 396, 3, 14786]).long()) - # Delete the dataset after we're done to save disk space on CI - datafile = os.path.join(self.project_root, ".data", "ag_news_csv") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz") - conditional_remove(datafile) + if os.environ.get("TRAVIS") == "true": + # Delete the dataset after we're done to save disk space on CI + datafile = os.path.join(self.project_root, ".data", "ag_news_csv") + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz") + conditional_remove(datafile) def test_imdb(self): from torchtext.experimental.datasets import IMDB @@ -129,10 +140,11 @@ def test_imdb(self): new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500) new_train_data, new_test_data = IMDB(vocab=new_vocab) - # Delete the dataset after we're done to save disk space on CI - datafile = os.path.join(self.project_root, ".data", "imdb") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "aclImdb") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "aclImdb_v1.tar.gz") - conditional_remove(datafile) + if 
os.environ.get("TRAVIS") == "true": + # Delete the dataset after we're done to save disk space on CI + datafile = os.path.join(self.project_root, ".data", "imdb") + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "aclImdb") + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "aclImdb_v1.tar.gz") + conditional_remove(datafile) From eccdc4438e0076708eb389bf56e6a2887d3857f1 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 21:48:02 +0000 Subject: [PATCH 12/19] Use cache --- .circleci/config.yml | 2 +- .circleci/config.yml.in | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 64ce4fd4cb..83b71f6d87 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -266,7 +266,7 @@ jobs: keys: # NOTE: remove .Branch once it's ready - - vector-cache-v2-{{ .Branch }}-{{ checksum ".circleci-weekly" }} + - vector-cache-v3-{{ .Branch }}-{{ checksum ".circleci-weekly" }} - run: name: Run tests diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index d5aa1132b9..f14faf444e 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -266,7 +266,7 @@ jobs: keys: # NOTE: remove .Branch once it's ready {% raw %} - - vector-cache-v2-{{ .Branch }}-{{ checksum ".circleci-weekly" }} + - vector-cache-v3-{{ .Branch }}-{{ checksum ".circleci-weekly" }} {% endraw %} - run: name: Run tests From 463063bfea72f1a1fb61f9cad8fc1c0c7a66c283 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 22:18:43 +0000 Subject: [PATCH 13/19] Refresh cache --- .circleci/config.yml | 4 ++-- .circleci/config.yml.in | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 83b71f6d87..0de16e2050 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -266,7 +266,7 @@ jobs: keys: # NOTE: remove 
.Branch once it's ready - - vector-cache-v3-{{ .Branch }}-{{ checksum ".circleci-weekly" }} + - data-v1-{{ .Branch }}-{{ checksum ".circleci-weekly" }} - run: name: Run tests @@ -276,7 +276,7 @@ jobs: - save_cache: # NOTE: remove .Branch once it's ready - key: vector-cache-v3-{{ .Branch }}-{{ checksum ".circleci-weekly" }} + key: data-v1-{{ .Branch }}-{{ checksum ".circleci-weekly" }} paths: - .vector_cache diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index f14faf444e..ea85bd9a94 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -266,7 +266,7 @@ jobs: keys: # NOTE: remove .Branch once it's ready {% raw %} - - vector-cache-v3-{{ .Branch }}-{{ checksum ".circleci-weekly" }} + - data-v1-{{ .Branch }}-{{ checksum ".circleci-weekly" }} {% endraw %} - run: name: Run tests @@ -276,7 +276,7 @@ jobs: - save_cache: # NOTE: remove .Branch once it's ready {% raw %} - key: vector-cache-v3-{{ .Branch }}-{{ checksum ".circleci-weekly" }} + key: data-v1-{{ .Branch }}-{{ checksum ".circleci-weekly" }} {% endraw %} paths: - .vector_cache From 46a56d80a07c8a3a84681630aa0d99deb101c95d Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 23:06:59 +0000 Subject: [PATCH 14/19] Rerun test From a2a73a81d3a7d44ea7632db6d7f379891e7b360e Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 19:19:25 -0700 Subject: [PATCH 15/19] Get rid of Travis conditional --- test/data/test_builtin_datasets.py | 40 +++++------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index eee741e947..b388dc61de 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -19,8 +19,9 @@ def test_wikitext2_legacy(self): from torchtext.datasets import WikiText2 # smoke test to ensure wikitext2 works properly - # NOTE test_wikitext2 and test_wikitext2_legacy 
have some cache incompatibility, - # and keeping one's data make the other fail. So we need to clean up the cache dir + # NOTE + # test_wikitext2 and test_wikitext2_legacy have some cache incompatibility. + # Keeping one's cache make the other fail. So we need to clean up the cache dir cachedir = os.path.join(self.project_root, ".data", "wikitext-2") conditional_remove(cachedir) @@ -40,8 +41,9 @@ def test_wikitext2(self): from torchtext.experimental.datasets import WikiText2 # smoke test to ensure wikitext2 works properly - # NOTE test_wikitext2 and test_wikitext2_legacy have some cache incompatibility, - # and keeping one's data make the other fail. So we need to clean up the cache dir + # NOTE + # test_wikitext2 and test_wikitext2_legacy have some cache incompatibility. + # Keeping one's cache make the other fail. So we need to clean up the cache dir cachedir = os.path.join(self.project_root, ".data", "wikitext-2") conditional_remove(cachedir) cachefile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip") @@ -72,11 +74,6 @@ def test_penntreebank_legacy(self): train_iter, valid_iter, test_iter = ds.iters(batch_size=4, bptt_len=30) - if os.environ.get("TRAVIS") == "true": - # Delete the dataset after we're done to save disk space on CI - datafile = os.path.join(self.project_root, ".data", "penn-treebank") - conditional_remove(datafile) - def test_penntreebank(self): from torchtext.experimental.datasets import PennTreebank # smoke test to ensure wikitext2 works properly @@ -89,15 +86,6 @@ def test_penntreebank(self): tokens_ids = [vocab[token] for token in 'the player characters rest'.split()] self.assertEqual(tokens_ids, [2, 2550, 3344, 1125]) - if os.environ.get("TRAVIS") == "true": - # Delete the dataset after we're done to save disk space on CI - datafile = os.path.join(self.project_root, ".data", 'ptb.train.txt') - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", 'ptb.test.txt') - conditional_remove(datafile) - 
datafile = os.path.join(self.project_root, ".data", 'ptb.valid.txt') - conditional_remove(datafile) - def test_text_classification(self): # smoke test to ensure ag_news dataset works properly @@ -112,13 +100,6 @@ def test_text_classification(self): assert_allclose(ag_news_test[-1][1][:10], torch.tensor([2351, 758, 96, 38581, 2351, 220, 5, 396, 3, 14786]).long()) - if os.environ.get("TRAVIS") == "true": - # Delete the dataset after we're done to save disk space on CI - datafile = os.path.join(self.project_root, ".data", "ag_news_csv") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz") - conditional_remove(datafile) - def test_imdb(self): from torchtext.experimental.datasets import IMDB from torchtext.vocab import Vocab @@ -139,12 +120,3 @@ def test_imdb(self): old_vocab = train_dataset.get_vocab() new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500) new_train_data, new_test_data = IMDB(vocab=new_vocab) - - if os.environ.get("TRAVIS") == "true": - # Delete the dataset after we're done to save disk space on CI - datafile = os.path.join(self.project_root, ".data", "imdb") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "aclImdb") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "aclImdb_v1.tar.gz") - conditional_remove(datafile) From 2618ad0203be6b82b2a7f591b5e8744e072998f9 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 19:20:35 -0700 Subject: [PATCH 16/19] Remove slow utils --- test/common/test_markers.py | 7 ------- test/conftest.py | 3 --- 2 files changed, 10 deletions(-) delete mode 100644 test/common/test_markers.py delete mode 100644 test/conftest.py diff --git a/test/common/test_markers.py b/test/common/test_markers.py deleted file mode 100644 index 903f5c5450..0000000000 --- a/test/common/test_markers.py +++ /dev/null @@ -1,7 +0,0 @@ -import pytest -import os - -slow = 
pytest.mark.skipif( - os.getenv('RUN_SLOW', 'False') == 'False', - reason="This test is slow." -) diff --git a/test/conftest.py b/test/conftest.py deleted file mode 100644 index 3860ed6e34..0000000000 --- a/test/conftest.py +++ /dev/null @@ -1,3 +0,0 @@ -def pytest_addoption(parser): - parser.addoption("--runslow", action="store_true", - help="Run slow tests") From be75ab8a92e36a453d4e51acfd0cdf04266b11e9 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Thu, 30 Apr 2020 19:22:12 -0700 Subject: [PATCH 17/19] Make test data cache key common to all CI jobs --- .circleci/config.yml.in | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index ea85bd9a94..19af2e4590 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -264,9 +264,8 @@ jobs: command: .circleci/unittest/scripts/install.sh - restore_cache: keys: - # NOTE: remove .Branch once it's ready {% raw %} - - data-v1-{{ .Branch }}-{{ checksum ".circleci-weekly" }} + - data-v1-{{ checksum ".circleci-weekly" }} {% endraw %} - run: name: Run tests @@ -274,9 +273,8 @@ jobs: no_output_timeout: 30m command: .circleci/unittest/scripts/run_test.sh - save_cache: - # NOTE: remove .Branch once it's ready {% raw %} - key: data-v1-{{ .Branch }}-{{ checksum ".circleci-weekly" }} + key: data-v1-{{ checksum ".circleci-weekly" }} {% endraw %} paths: - .vector_cache From c0345e98ab5e41f95868fd3ed0c3f8b466522e60 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Fri, 1 May 2020 02:33:31 +0000 Subject: [PATCH 18/19] Resync config --- .circleci/config.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0de16e2050..51dc57b156 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -264,9 +264,8 @@ jobs: command: .circleci/unittest/scripts/install.sh - restore_cache: keys: - # NOTE: remove .Branch once it's 
ready - - data-v1-{{ .Branch }}-{{ checksum ".circleci-weekly" }} + - data-v1-{{ checksum ".circleci-weekly" }} - run: name: Run tests @@ -274,9 +273,8 @@ jobs: no_output_timeout: 30m command: .circleci/unittest/scripts/run_test.sh - save_cache: - # NOTE: remove .Branch once it's ready - key: data-v1-{{ .Branch }}-{{ checksum ".circleci-weekly" }} + key: data-v1-{{ checksum ".circleci-weekly" }} paths: - .vector_cache From 1b880c5e659a3c2449c1fdec2a808840a6a1d45b Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Fri, 1 May 2020 14:38:05 +0000 Subject: [PATCH 19/19] One more run to test if cache works --- .circleci/config.yml | 2 +- .circleci/config.yml.in | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 51dc57b156..5ce617105e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -242,7 +242,7 @@ jobs: - checkout - run: name: Generate cache key - # This will refresh cache on Sundays, nightly build should refresh the cache. + # This will refresh the cache on Sundays; the nightly build should generate the new cache. command: echo "$(date +"%Y-%U")" > .circleci-weekly - restore_cache: diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 19af2e4590..097a6d7b19 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -242,7 +242,7 @@ jobs: - checkout - run: name: Generate cache key - # This will refresh cache on Sundays, nightly build should refresh the cache. + # This will refresh the cache on Sundays; the nightly build should generate the new cache. command: echo "$(date +"%Y-%U")" > .circleci-weekly - restore_cache: {% raw %}