Skip to content

Commit 5c0b1c2

Browse files
committed
Don't clobber existing files
1 parent 63919c8 commit 5c0b1c2

File tree

1 file changed

+10
-2
lines changed

1 file changed

+10
-2
lines changed

download_dataset.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,9 +17,17 @@
17 17
filename = ds + "." + split + '.jsonl'
18 18
r = requests.get("https://storage.googleapis.com/gpt-2/output-dataset/v1/" + filename, stream=True)
19 19
r.raise_for_status()
20+
file_size = int(r.headers["content-length"])
21+
filepath = os.path.join(subdir, filename)
22+
try:
23+
if os.stat(filepath).st_size == file_size:
24+
print('%s already exists and is the expected %d bytes, not redownloading' % (filepath, file_size))
25+
r.close()
26+
continue
27+
except OSError: # likely "file not found" or similar
28+
pass
20 29

21-
with open(os.path.join(subdir, filename), 'wb') as f:
22-
file_size = int(r.headers["content-length"])
30+
with open(filepath, 'wb') as f:
23 31
with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
24 32
for chunk in r.iter_content(chunk_size=4194304):
25 33
f.write(chunk)

0 commit comments

Comments
 (0)