
Commit 8b58a22

Add CCI cache for test data (#748)
* Add .vector_cache to cache
* Bust weekly
* Remove slow from `test_vocab`
* Remove slow from dataset test
* Add .data to cache
* Remove slow utils
* Make test data cache key common to all CI jobs
1 parent 53f2108 commit 8b58a22

8 files changed: +60 additions, −63 deletions


.circleci/config.yml

Lines changed: 20 additions & 2 deletions
@@ -240,27 +240,45 @@ jobs:
     resource_class: 2xlarge+
     steps:
       - checkout
+      - run:
+          name: Generate cache key
+          # This will refresh cache on Sundays, nightly build should generate new cache.
+          command: echo "$(date +"%Y-%U")" > .circleci-weekly
       - restore_cache:
 
           keys:
-            - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}
+            - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
 
       - run:
          name: Setup
          command: .circleci/unittest/scripts/setup_env.sh
       - save_cache:
 
-          key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}
+          key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
 
          paths:
            - conda
            - env
       - run:
          name: Install torchtext
          command: .circleci/unittest/scripts/install.sh
+      - restore_cache:
+          keys:
+
+            - data-v1-{{ checksum ".circleci-weekly" }}
+
       - run:
          name: Run tests
+          # Downloading embedding vector takes long time.
+          no_output_timeout: 30m
          command: .circleci/unittest/scripts/run_test.sh
+      - save_cache:
+
+          key: data-v1-{{ checksum ".circleci-weekly" }}
+
+          paths:
+            - .vector_cache
+            - .data
       - run:
          name: Post process
          command: .circleci/unittest/scripts/post_process.sh
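The moving part here is the `.circleci-weekly` file: it holds the current year and week number, so every cache key that embeds `{{ checksum ".circleci-weekly" }}` changes once a week and forces a fresh cache. A minimal Python sketch of the same `%Y-%U` format (the config itself uses the shell `date` command; this snippet is only an illustration):

    from datetime import date

    # "%Y-%U" is the year plus the zero-padded week number, counting
    # Sunday as the first day of the week, so the string rolls over
    # every Sunday and with it every cache key built from it.
    print(date(2020, 3, 7).strftime("%Y-%U"))  # 2020-09 (a Saturday)
    print(date(2020, 3, 8).strftime("%Y-%U"))  # 2020-10 (Sunday starts a new week)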

.circleci/config.yml.in

Lines changed: 20 additions & 2 deletions
@@ -240,27 +240,45 @@ jobs:
     resource_class: 2xlarge+
     steps:
       - checkout
+      - run:
+          name: Generate cache key
+          # This will refresh cache on Sundays, nightly build should generate new cache.
+          command: echo "$(date +"%Y-%U")" > .circleci-weekly
       - restore_cache:
           {% raw %}
           keys:
-            - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}
+            - env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
           {% endraw %}
       - run:
          name: Setup
          command: .circleci/unittest/scripts/setup_env.sh
       - save_cache:
           {% raw %}
-          key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}
+          key: env-v1-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }}
           {% endraw %}
          paths:
            - conda
            - env
       - run:
          name: Install torchtext
          command: .circleci/unittest/scripts/install.sh
+      - restore_cache:
+          keys:
+          {% raw %}
+            - data-v1-{{ checksum ".circleci-weekly" }}
+          {% endraw %}
       - run:
          name: Run tests
+          # Downloading embedding vector takes long time.
+          no_output_timeout: 30m
          command: .circleci/unittest/scripts/run_test.sh
+      - save_cache:
+          {% raw %}
+          key: data-v1-{{ checksum ".circleci-weekly" }}
+          {% endraw %}
+          paths:
+            - .vector_cache
+            - .data
       - run:
          name: Post process
          command: .circleci/unittest/scripts/post_process.sh
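The only difference from config.yml is the `{% raw %}`/`{% endraw %}` guards: config.yml.in is a template that gets rendered into config.yml, and both the template engine and CircleCI use `{{ ... }}` syntax, so CircleCI's `{{ checksum ... }}` expressions must be protected from the templater. A small sketch of the effect, assuming a Jinja-compatible engine (the repo's actual generator script is not part of this diff):

    from jinja2 import Template

    # Inside {% raw %}, the templater leaves {{ ... }} untouched, so the
    # rendered config still contains the expression for CircleCI to evaluate.
    tmpl = Template('{% raw %}- data-v1-{{ checksum ".circleci-weekly" }}{% endraw %}')
    print(tmpl.render())  # - data-v1-{{ checksum ".circleci-weekly" }}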

.circleci/unittest/scripts/run_test.sh

Lines changed: 1 addition & 1 deletion
@@ -6,5 +6,5 @@ eval "$(./conda/bin/conda shell.bash hook)"
 conda activate ./env
 
 python -m torch.utils.collect_env
-pytest --cov=torchtext --junitxml=test-results/junit.xml -v test
+pytest --cov=torchtext --junitxml=test-results/junit.xml -v --durations 20 test
 flake8 torchtext test
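`--durations 20` makes pytest print the 20 slowest test phases at the end of a run, which makes it easy to verify that the new data cache is actually cutting download time. A toy, hypothetical test file just to show what the report looks like:

    import time

    def test_fast():
        assert True

    def test_slow():
        time.sleep(2)  # will top the durations report
        assert True

    # Running `pytest --durations 20 test_example.py` appends a section like:
    #   slowest 20 durations
    #   2.00s call     test_example.py::test_slow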

test/common/test_markers.py

Lines changed: 0 additions & 7 deletions
This file was deleted.

test/conftest.py

Lines changed: 0 additions & 3 deletions
This file was deleted.
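These two files implemented the `@slow` marker that the test diffs below stop importing; with the data now cached, the affected tests are fast enough to run on every CI job. The deleted contents aren't reproduced on this page, but a typical pytest setup for such a marker looks roughly like this (a sketch, not the verbatim deleted code; the RUN_SLOW variable is illustrative):

    # test/common/test_markers.py (hypothetical reconstruction)
    import os

    import pytest

    # Skip a test unless the run explicitly opts in to slow tests.
    slow = pytest.mark.skipif(
        not os.environ.get("RUN_SLOW"),
        reason="slow test; set RUN_SLOW=1 to run",
    )

test/conftest.py would then have held whatever shared configuration the marker needed; with no remaining users of `slow`, both files could go.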

test/data/test_builtin_datasets.py

Lines changed: 19 additions & 38 deletions
@@ -4,7 +4,6 @@
 from torchtext.datasets import AG_NEWS
 import torch
 from torch.testing import assert_allclose
-from ..common.test_markers import slow
 from ..common.torchtext_test_case import TorchtextTestCase
 
 
@@ -16,10 +15,16 @@ def conditional_remove(f):
 
 
 class TestDataset(TorchtextTestCase):
-    @slow
     def test_wikitext2_legacy(self):
         from torchtext.datasets import WikiText2
         # smoke test to ensure wikitext2 works properly
+
+        # NOTE
+        # test_wikitext2 and test_wikitext2_legacy have some cache incompatibility.
+        # Keeping one's cache make the other fail. So we need to clean up the cache dir
+        cachedir = os.path.join(self.project_root, ".data", "wikitext-2")
+        conditional_remove(cachedir)
+
         ds = WikiText2
         TEXT = data.Field(lower=True, batch_first=True)
         train, valid, test = ds.splits(TEXT)
@@ -30,13 +35,20 @@ def test_wikitext2_legacy(self):
         train_iter, valid_iter, test_iter = ds.iters(batch_size=4,
                                                      bptt_len=30)
 
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", "wikitext-2")
-        conditional_remove(datafile)
+        conditional_remove(cachedir)
 
     def test_wikitext2(self):
         from torchtext.experimental.datasets import WikiText2
         # smoke test to ensure wikitext2 works properly
+
+        # NOTE
+        # test_wikitext2 and test_wikitext2_legacy have some cache incompatibility.
+        # Keeping one's cache make the other fail. So we need to clean up the cache dir
+        cachedir = os.path.join(self.project_root, ".data", "wikitext-2")
+        conditional_remove(cachedir)
+        cachefile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip")
+        conditional_remove(cachefile)
+
         train_dataset, test_dataset, valid_dataset = WikiText2()
         self.assertEqual(len(train_dataset), 2049990)
         self.assertEqual(len(test_dataset), 241859)
@@ -46,13 +58,9 @@ def test_wikitext2(self):
         tokens_ids = [vocab[token] for token in 'the player characters rest'.split()]
         self.assertEqual(tokens_ids, [2, 286, 503, 700])
 
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", "wikitext-2")
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip")
-        conditional_remove(datafile)
+        conditional_remove(cachedir)
+        conditional_remove(cachefile)
 
-    @slow
     def test_penntreebank_legacy(self):
         from torchtext.datasets import PennTreebank
         # smoke test to ensure penn treebank works properly
@@ -66,10 +74,6 @@ def test_penntreebank_legacy(self):
         train_iter, valid_iter, test_iter = ds.iters(batch_size=4,
                                                      bptt_len=30)
 
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", "penn-treebank")
-        conditional_remove(datafile)
-
     def test_penntreebank(self):
         from torchtext.experimental.datasets import PennTreebank
         # smoke test to ensure wikitext2 works properly
@@ -82,14 +86,6 @@ def test_penntreebank(self):
         tokens_ids = [vocab[token] for token in 'the player characters rest'.split()]
         self.assertEqual(tokens_ids, [2, 2550, 3344, 1125])
 
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", 'ptb.train.txt')
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", 'ptb.test.txt')
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", 'ptb.valid.txt')
-        conditional_remove(datafile)
-
     def test_text_classification(self):
         # smoke test to ensure ag_news dataset works properly
 
@@ -104,13 +100,6 @@ def test_text_classification(self):
         assert_allclose(ag_news_test[-1][1][:10],
                         torch.tensor([2351, 758, 96, 38581, 2351, 220, 5, 396, 3, 14786]).long())
 
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", "ag_news_csv")
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz")
-        conditional_remove(datafile)
-
-    @slow
     def test_imdb(self):
         from torchtext.experimental.datasets import IMDB
         from torchtext.vocab import Vocab
@@ -131,11 +120,3 @@ def test_imdb(self):
         old_vocab = train_dataset.get_vocab()
         new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
         new_train_data, new_test_data = IMDB(vocab=new_vocab)
-
-        # Delete the dataset after we're done to save disk space on CI
-        datafile = os.path.join(self.project_root, ".data", "imdb")
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", "aclImdb")
-        conditional_remove(datafile)
-        datafile = os.path.join(self.project_root, ".data", "aclImdb_v1.tar.gz")
-        conditional_remove(datafile)
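The tests above lean on a `conditional_remove` helper defined earlier in this file (its body sits outside the diff context). Judging from how it is called on both plain files and directories, a plausible sketch is:

    import os
    import shutil

    def conditional_remove(f):
        # Delete a file or a directory tree; quietly do nothing if absent.
        if os.path.isfile(f):
            os.remove(f)
        elif os.path.isdir(f):
            shutil.rmtree(f)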

test/data/test_field.py

Lines changed: 0 additions & 2 deletions
@@ -9,7 +9,6 @@
 from torch.nn import init
 
 from ..common.torchtext_test_case import TorchtextTestCase, verify_numericalized_example
-from ..common.test_markers import slow
 
 
 class TestField(TorchtextTestCase):
@@ -866,7 +865,6 @@ def test_serialization(self):
 
         assert torch.all(torch.eq(original_numericalization, pickled_numericalization))
 
-    @slow
     def test_build_vocab(self):
         nesting_field = data.Field(tokenize=list, init_token="<w>", eos_token="</w>")
 
test/test_vocab.py

Lines changed: 0 additions & 8 deletions
@@ -10,7 +10,6 @@
 from torchtext import vocab
 from torchtext.vocab import Vectors, FastText, GloVe, CharNGram
 
-from .common.test_markers import slow
 from .common.torchtext_test_case import TorchtextTestCase
 
 
@@ -93,7 +92,6 @@ def test_vocab_set_vectors(self):
                                      [0.3, 0.4]])
         assert_allclose(v.vectors.numpy(), expected_vectors)
 
-    @slow
     def test_vocab_download_fasttext_vectors(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
         # Build a vocab and get vectors twice to test caching, then once more
@@ -131,7 +129,6 @@ def test_vocab_download_fasttext_vectors(self):
         vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
         conditional_remove(vec_file)
 
-    @slow
     def test_vocab_extend(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
         # Build a vocab and get vectors twice to test caching.
@@ -163,7 +160,6 @@ def test_vocab_extend(self):
         vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
         conditional_remove(vec_file)
 
-    @slow
     def test_vocab_download_custom_vectors(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
         # Build a vocab and get vectors twice to test caching.
@@ -192,7 +188,6 @@ def test_vocab_download_custom_vectors(self):
         vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
         conditional_remove(vec_file)
 
-    @slow
     def test_vocab_vectors_custom_cache(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
         vector_cache = os.path.join('/tmp', 'vector_cache')
@@ -225,7 +220,6 @@ def test_vocab_vectors_custom_cache(self):
         vec_file = os.path.join(vector_cache, "wiki.simple.vec")
         conditional_remove(vec_file)
 
-    @slow
     def test_vocab_download_glove_vectors(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
 
@@ -268,7 +262,6 @@ def test_vocab_download_glove_vectors(self):
         conditional_remove(os.path.join(self.project_root, ".vector_cache",
                                         "glove.twitter.27B.{}d.txt".format(dim)))
 
-    @slow
     def test_vocab_download_charngram_vectors(self):
         c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
         # Build a vocab and get vectors twice to test caching, then once more
@@ -343,7 +336,6 @@ def test_serialization_backcompat(self):
         v_loaded = pickle.load(open(pickle_path, "rb"))
         assert v == v_loaded
 
-    @slow
     def test_vectors_get_vecs(self):
         vec = GloVe(name='twitter.27B', dim='25')
         self.assertEqual(vec.vectors.shape[0], len(vec))
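These vector tests are the reason `.vector_cache` is worth caching: pretrained vector classes download their archives into that directory on first use and reload them from disk afterwards. A minimal illustration mirroring the `test_vectors_get_vecs` call above (note the twitter.27B archive is large, on the order of a gigabyte):

    from torchtext.vocab import GloVe

    # The first call downloads glove.twitter.27B into .vector_cache;
    # subsequent runs (and cache-restored CI jobs) skip the download.
    vec = GloVe(name='twitter.27B', dim='25')
    print(vec.vectors.shape)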
