#!/usr/bin/env python3

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

"""Cross-platform downloader for Llama 2 model weights.

Python replacement for download.sh: prompts for the presigned URL from the
approval email, downloads the license files, tokenizer, and the requested
model shards, and verifies every file against its MD5 checklist.

Requires the third-party packages ``requests`` and ``tqdm``
(note: requirements.txt must list both).
"""

import hashlib
import os

# Meta's CDN only serves these files to a wget user agent.
HEADERS = {"User-Agent": "wget"}

# Highest consolidated.XX.pth shard index and target folder name per model.
MODELS = {
    "7B": (0, "llama-2-7b"),
    "7B-chat": (0, "llama-2-7b-chat"),
    "13B": (1, "llama-2-13b"),
    "13B-chat": (1, "llama-2-13b-chat"),
    "70B": (7, "llama-2-70b"),
    "70B-chat": (7, "llama-2-70b-chat"),
}

# 1 MiB read size for both hashing and streaming downloads; the original
# 1 KiB chunks meant millions of tiny writes for a multi-GB shard.
CHUNK_SIZE = 1024 * 1024


def check_md5(file_path, checksum):
    """Return True iff *file_path*'s MD5 hex digest equals *checksum*.

    Hashes incrementally so multi-gigabyte shard files are never read
    into memory at once (the original read the whole file with f.read()).
    """
    md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
            md5.update(chunk)
    return md5.hexdigest() == checksum


def check_checksums(folder_path, checklist_path):
    """Verify each '<md5> <name>' entry of *checklist_path* against *folder_path*.

    Prints "<name>: OK" or "<name>: FAILED" per file, exactly like md5sum -c.
    Returns True only if every listed file exists and matches (callers that
    ignore the return value behave as before).
    """
    all_ok = True
    with open(checklist_path) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate trailing blank line in the checklist
            checksum, file_name = line.strip().split()
            file_path = os.path.join(folder_path, file_name)
            if os.path.exists(file_path) and check_md5(file_path, checksum):
                print(f"{file_name}: OK")
            else:
                print(f"{file_name}: FAILED")
                all_ok = False
    return all_ok


def _download_small(presigned_url, remote_name, dest_path):
    """Fetch a small file (license, tokenizer, checklist, json) in one request."""
    import requests  # third-party; imported lazily so helpers stay importable

    response = requests.get(presigned_url.replace("*", remote_name), headers=HEADERS)
    response.raise_for_status()  # expired links surface as 403 Forbidden
    with open(dest_path, "wb") as out:
        out.write(response.content)


def _download_large(presigned_url, remote_name, dest_path):
    """Stream a multi-gigabyte shard to *dest_path* with a tqdm progress bar."""
    import requests
    from tqdm import tqdm

    response = requests.get(
        presigned_url.replace("*", remote_name), headers=HEADERS, stream=True
    )
    response.raise_for_status()
    total = int(response.headers.get("content-length", 0))
    with open(dest_path, "wb") as out, tqdm(
        total=total,
        unit="iB",
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:  # real-time download progress
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            bar.update(out.write(chunk))


def main():
    presigned_url = input("Enter the URL from email: ")
    print("")

    model_size = input("Enter the list of models to download without spaces (7B,13B,70B,7B-chat,13B-chat,70B-chat), or press Enter for all: ")
    # os.path.join instead of the original ".\models": a backslash separator
    # only works on Windows, defeating the script's cross-platform purpose.
    target_folder = os.path.join(".", "models")
    os.makedirs(target_folder, exist_ok=True)

    if model_size == "":
        model_size = ",".join(MODELS)

    print("Downloading LICENSE and Acceptable Usage Policy")
    for name in ("LICENSE", "USE_POLICY.md"):
        _download_small(presigned_url, name, os.path.join(target_folder, name))

    print("Downloading tokenizer")
    for name in ("tokenizer.model", "tokenizer_checklist.chk"):
        _download_small(presigned_url, name, os.path.join(target_folder, name))
    check_checksums(target_folder, os.path.join(target_folder, "tokenizer_checklist.chk"))

    for model in model_size.split(","):
        model = model.strip()
        if model not in MODELS:
            # The original if/elif chain had no else: an unknown name raised
            # NameError or silently reused the previous model's settings.
            print(f"Unknown model '{model}', skipping. Valid choices: {', '.join(MODELS)}")
            continue
        shard, model_path = MODELS[model]

        print(f"Downloading {model_path}")
        model_folder = os.path.join(target_folder, model_path)
        os.makedirs(model_folder, exist_ok=True)

        for s in range(shard + 1):
            print("Downloading shard " + str(s + 1) + " of shards " + str(shard + 1))  # display shard progress
            shard_name = f"consolidated.{s:02d}.pth"
            _download_large(
                presigned_url,
                f"{model_path}/{shard_name}",
                os.path.join(model_folder, shard_name),
            )

        for name in ("params.json", "checklist.chk"):
            _download_small(
                presigned_url,
                f"{model_path}/{name}",
                os.path.join(model_folder, name),
            )

        print("Checking checksums")
        check_checksums(model_folder, os.path.join(model_folder, "checklist.chk"))


if __name__ == "__main__":
    main()
Then run the script: `./download.sh` if you're on a Unix system, or `python download.py` on any other platform. Keep in mind that the links expire after 24 hours and a certain amount of downloads. If you start seeing errors such as `403: Forbidden`, you can always re-request a link. @@ -38,7 +38,7 @@ You can follow the steps below to quickly get up and running with Llama 2 models 4. Once registered, you will get an email with a URL to download the models. You will need this URL when you run the download.sh script. -5. Once you get the email, navigate to your downloaded llama repository and run the download.sh script. +5. Once you get the email, navigate to your downloaded llama repository and run the download.sh script on Unix platforms, or the download.py script on other platforms. - Make sure to grant execution permissions to the download.sh script - During this process, you will be prompted to enter the URL from the email. - Do not use the “Copy Link” option but rather make sure to manually copy the link from the email.