#!/usr/bin/env python3

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

"""Cross-platform downloader for Llama 2 model weights.

Python replacement for download.sh: prompts for the presigned URL from the
approval email, downloads the license files, tokenizer, and the requested
model shards, and verifies every file against its MD5 checklist.

Requires the third-party packages ``requests`` and ``tqdm``
(note: requirements.txt must list both).
"""

import hashlib
import os

# Meta's CDN only serves these files to a wget user agent.
HEADERS = {"User-Agent": "wget"}

# Highest consolidated.XX.pth shard index and target folder name per model.
MODELS = {
    "7B": (0, "llama-2-7b"),
    "7B-chat": (0, "llama-2-7b-chat"),
    "13B": (1, "llama-2-13b"),
    "13B-chat": (1, "llama-2-13b-chat"),
    "70B": (7, "llama-2-70b"),
    "70B-chat": (7, "llama-2-70b-chat"),
}

# 1 MiB read size for both hashing and streaming downloads; the original
# 1 KiB chunks meant millions of tiny writes for a multi-GB shard.
CHUNK_SIZE = 1024 * 1024


def check_md5(file_path, checksum):
    """Return True iff *file_path*'s MD5 hex digest equals *checksum*.

    Hashes incrementally so multi-gigabyte shard files are never read
    into memory at once (the original read the whole file with f.read()).
    """
    md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
            md5.update(chunk)
    return md5.hexdigest() == checksum


def check_checksums(folder_path, checklist_path):
    """Verify each '<md5> <name>' entry of *checklist_path* against *folder_path*.

    Prints "<name>: OK" or "<name>: FAILED" per file, exactly like md5sum -c.
    Returns True only if every listed file exists and matches (callers that
    ignore the return value behave as before).
    """
    all_ok = True
    with open(checklist_path) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate trailing blank line in the checklist
            checksum, file_name = line.strip().split()
            file_path = os.path.join(folder_path, file_name)
            if os.path.exists(file_path) and check_md5(file_path, checksum):
                print(f"{file_name}: OK")
            else:
                print(f"{file_name}: FAILED")
                all_ok = False
    return all_ok


def _download_small(presigned_url, remote_name, dest_path):
    """Fetch a small file (license, tokenizer, checklist, json) in one request."""
    import requests  # third-party; imported lazily so helpers stay importable

    response = requests.get(presigned_url.replace("*", remote_name), headers=HEADERS)
    response.raise_for_status()  # expired links surface as 403 Forbidden
    with open(dest_path, "wb") as out:
        out.write(response.content)


def _download_large(presigned_url, remote_name, dest_path):
    """Stream a multi-gigabyte shard to *dest_path* with a tqdm progress bar."""
    import requests
    from tqdm import tqdm

    response = requests.get(
        presigned_url.replace("*", remote_name), headers=HEADERS, stream=True
    )
    response.raise_for_status()
    total = int(response.headers.get("content-length", 0))
    with open(dest_path, "wb") as out, tqdm(
        total=total,
        unit="iB",
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:  # real-time download progress
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            bar.update(out.write(chunk))


def main():
    presigned_url = input("Enter the URL from email: ")
    print("")

    model_size = input("Enter the list of models to download without spaces (7B,13B,70B,7B-chat,13B-chat,70B-chat), or press Enter for all: ")
    # os.path.join instead of the original ".\models": a backslash separator
    # only works on Windows, defeating the script's cross-platform purpose.
    target_folder = os.path.join(".", "models")
    os.makedirs(target_folder, exist_ok=True)

    if model_size == "":
        model_size = ",".join(MODELS)

    print("Downloading LICENSE and Acceptable Usage Policy")
    for name in ("LICENSE", "USE_POLICY.md"):
        _download_small(presigned_url, name, os.path.join(target_folder, name))

    print("Downloading tokenizer")
    for name in ("tokenizer.model", "tokenizer_checklist.chk"):
        _download_small(presigned_url, name, os.path.join(target_folder, name))
    check_checksums(target_folder, os.path.join(target_folder, "tokenizer_checklist.chk"))

    for model in model_size.split(","):
        model = model.strip()
        if model not in MODELS:
            # The original if/elif chain had no else: an unknown name raised
            # NameError or silently reused the previous model's settings.
            print(f"Unknown model '{model}', skipping. Valid choices: {', '.join(MODELS)}")
            continue
        shard, model_path = MODELS[model]

        print(f"Downloading {model_path}")
        model_folder = os.path.join(target_folder, model_path)
        os.makedirs(model_folder, exist_ok=True)

        for s in range(shard + 1):
            print("Downloading shard " + str(s + 1) + " of shards " + str(shard + 1))  # display shard progress
            shard_name = f"consolidated.{s:02d}.pth"
            _download_large(
                presigned_url,
                f"{model_path}/{shard_name}",
                os.path.join(model_folder, shard_name),
            )

        for name in ("params.json", "checklist.chk"):
            _download_small(
                presigned_url,
                f"{model_path}/{name}",
                os.path.join(model_folder, name),
            )

        print("Checking checksums")
        check_checksums(model_folder, os.path.join(model_folder, "checklist.chk"))


if __name__ == "__main__":
    main()
Then run the script: `./download.sh` if you're on a Unix system, or `python download.py` on any other platform. Keep in mind that the links expire after 24 hours and a certain amount of downloads. If you start seeing errors such as `403: Forbidden`, you can always re-request a link. @@ -38,7 +38,7 @@ You can follow the steps below to quickly get up and running with Llama 2 models 4. Once registered, you will get an email with a URL to download the models. You will need this URL when you run the download.sh script. -5. Once you get the email, navigate to your downloaded llama repository and run the download.sh script. +5. Once you get the email, navigate to your downloaded llama repository and run the download.sh script on Unix platforms, or the download.py script on other platforms. - Make sure to grant execution permissions to the download.sh script - During this process, you will be prompted to enter the URL from the email. - Do not use the “Copy Link” option but rather make sure to manually copy the link from the email.