From 363a065b32b2af88843742a25e4890fef5e13eec Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Tue, 11 Jun 2019 14:34:12 +0300
Subject: [PATCH 1/4] Simplify directory creation

---
 download_dataset.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/download_dataset.py b/download_dataset.py
index fec082b..6c69064 100644
--- a/download_dataset.py
+++ b/download_dataset.py
@@ -4,9 +4,7 @@
 from tqdm import tqdm
 
 subdir = 'data'
-if not os.path.exists(subdir):
-    os.makedirs(subdir)
-subdir = subdir.replace('\\','/') # needed for Windows
+os.makedirs(subdir, exist_ok=True)
 
 for ds in [
     'webtext',

From 1e78accdc638d7f57d023e829a78ac3cf67d844c Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Tue, 11 Jun 2019 14:34:41 +0300
Subject: [PATCH 2/4] Update progress bar less often

---
 download_dataset.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/download_dataset.py b/download_dataset.py
index 6c69064..99d63ed 100644
--- a/download_dataset.py
+++ b/download_dataset.py
@@ -19,9 +19,7 @@
 
         with open(os.path.join(subdir, filename), 'wb') as f:
             file_size = int(r.headers["content-length"])
-            chunk_size = 1000
             with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
-                # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
-                for chunk in r.iter_content(chunk_size=chunk_size):
+                for chunk in r.iter_content(chunk_size=4194304):
                     f.write(chunk)
-                    pbar.update(chunk_size)
+                    pbar.update(len(chunk))

From 445236575f83987146d4880d203812d89d1dcc04 Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Tue, 11 Jun 2019 14:35:54 +0300
Subject: [PATCH 3/4] Die on HTTP error

---
 download_dataset.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/download_dataset.py b/download_dataset.py
index 99d63ed..2fdd25f 100644
--- a/download_dataset.py
+++ b/download_dataset.py
@@ -16,6 +16,7 @@
     for split in ['train', 'valid', 'test']:
         filename = ds + "." + split + '.jsonl'
         r = requests.get("https://openaipublic.azureedge.net/gpt-2/output-dataset/v1/" + filename, stream=True)
+        r.raise_for_status()
 
         with open(os.path.join(subdir, filename), 'wb') as f:
             file_size = int(r.headers["content-length"])

From b4285b9e0ad352819bf00f5b40cce26562e632fc Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Tue, 11 Jun 2019 14:39:36 +0300
Subject: [PATCH 4/4] Don't clobber existing files

---
 download_dataset.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/download_dataset.py b/download_dataset.py
index 2fdd25f..a744d2c 100644
--- a/download_dataset.py
+++ b/download_dataset.py
@@ -17,9 +17,17 @@
         filename = ds + "." + split + '.jsonl'
         r = requests.get("https://openaipublic.azureedge.net/gpt-2/output-dataset/v1/" + filename, stream=True)
         r.raise_for_status()
+        file_size = int(r.headers["content-length"])
+        filepath = os.path.join(subdir, filename)
+        try:
+            if os.stat(filepath).st_size == file_size:
+                print('%s already exists and is the expected %d bytes, not redownloading' % (filepath, file_size))
+                r.close()
+                continue
+        except OSError: # likely "file not found" or similar
+            pass
 
-        with open(os.path.join(subdir, filename), 'wb') as f:
-            file_size = int(r.headers["content-length"])
+        with open(filepath, 'wb') as f:
             with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
                 for chunk in r.iter_content(chunk_size=4194304):
                     f.write(chunk)
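
For reference, below is a sketch of how download_dataset.py should read once all four patches apply, assembled from the hunks above. The import lines at the top of the file and the dataset names between 'webtext' and the closing bracket never appear in any hunk, so those parts are inferred or elided rather than taken from the patches themselves:

import os

import requests
from tqdm import tqdm

subdir = 'data'
os.makedirs(subdir, exist_ok=True)

for ds in [
    'webtext',
    # ... remaining dataset names not shown in any hunk above ...
]:
    for split in ['train', 'valid', 'test']:
        filename = ds + "." + split + '.jsonl'
        r = requests.get("https://openaipublic.azureedge.net/gpt-2/output-dataset/v1/" + filename, stream=True)
        r.raise_for_status()  # die on HTTP error (patch 3)
        file_size = int(r.headers["content-length"])
        filepath = os.path.join(subdir, filename)
        try:
            # Skip files that were already downloaded in full (patch 4).
            if os.stat(filepath).st_size == file_size:
                print('%s already exists and is the expected %d bytes, not redownloading' % (filepath, file_size))
                r.close()
                continue
        except OSError: # likely "file not found" or similar
            pass

        with open(filepath, 'wb') as f:
            with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
                # 4 MiB chunks; advance the bar by the bytes actually received (patch 2).
                for chunk in r.iter_content(chunk_size=4194304):
                    f.write(chunk)
                    pbar.update(len(chunk))

Note that the skip logic in patch 4 only avoids re-downloading files whose on-disk size matches the Content-Length exactly; a truncated partial download fails the comparison and is fetched again from scratch.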