From c7c0a30d686b0743352692e0f7736ea1701591f4 Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Tue, 18 Jul 2023 11:54:55 -0700 Subject: [PATCH 01/84] Remove unused Any import --- llama/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama/model.py b/llama/model.py index 258a7dc19..4b82ce2db 100755 --- a/llama/model.py +++ b/llama/model.py @@ -3,7 +3,7 @@ import math from dataclasses import dataclass -from typing import Any, Optional, Tuple +from typing import Optional, Tuple import fairscale.nn.model_parallel.initialize as fs_init import torch From 89092cdec82c3c1dbdb81cca97773b3829f59af8 Mon Sep 17 00:00:00 2001 From: tomorrmato <2ndorderode@gmail.com> Date: Tue, 18 Jul 2023 23:14:58 -0700 Subject: [PATCH 02/84] added type hint in example code --- example_chat_completion.py | 6 +++--- example_text_completion.py | 4 ++-- llama/__init__.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/example_chat_completion.py b/example_chat_completion.py index 5043bc5f9..8f8de5e5a 100644 --- a/example_chat_completion.py +++ b/example_chat_completion.py @@ -1,11 +1,11 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. -from typing import Optional +from typing import List, Optional import fire -from llama import Llama +from llama import Llama, Dialog def main( @@ -24,7 +24,7 @@ def main( max_batch_size=max_batch_size, ) - dialogs = [ + dialogs: List[Dialog] = [ [{"role": "user", "content": "what is the recipe of mayonnaise?"}], [ {"role": "user", "content": "I am going to Paris, what should I see?"}, diff --git a/example_text_completion.py b/example_text_completion.py index 4376b1eeb..228904d0b 100755 --- a/example_text_completion.py +++ b/example_text_completion.py @@ -4,7 +4,7 @@ import fire from llama import Llama - +from typing import List def main( ckpt_dir: str, @@ -22,7 +22,7 @@ def main( max_batch_size=max_batch_size, ) - prompts = [ + prompts: List[str] = [ # For these prompts, the expected answer is the natural continuation of the prompt "I believe the meaning of life is", "Simply put, the theory of relativity states that ", diff --git a/llama/__init__.py b/llama/__init__.py index 354342dd9..0bd1f8635 100755 --- a/llama/__init__.py +++ b/llama/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. -from .generation import Llama +from .generation import Llama, Dialog from .model import ModelArgs, Transformer from .tokenizer import Tokenizer From 3805003a3e40e3f7a0c42a37ad572bc84b4248a2 Mon Sep 17 00:00:00 2001 From: Vinny Meller Date: Thu, 20 Jul 2023 05:04:31 -0500 Subject: [PATCH 03/84] Update download.sh to not use hardcoded bash path --- download.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download.sh b/download.sh index 8cfed9935..872a79130 100644 --- a/download.sh +++ b/download.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. From 7c0a08ec24ce91880184ba9002227624262904a2 Mon Sep 17 00:00:00 2001 From: Suliman Sagindykov <115887342+SulimanSagindykov@users.noreply.github.com> Date: Fri, 21 Jul 2023 16:51:55 +0500 Subject: [PATCH 04/84] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 51089e27e..28c98e84d 100644 --- a/LICENSE +++ b/LICENSE @@ -104,7 +104,7 @@ owner of such derivative works and modifications. c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Llama 2 outputs or results, or any portion of any of the foregoing, -constitutes infringement of intellectual property or other rights owned or licensable +constitutes an infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related From d3b26d05386d9d2d7626afa72fae0472a9d91d2d Mon Sep 17 00:00:00 2001 From: Suliman Sagindykov <115887342+SulimanSagindykov@users.noreply.github.com> Date: Fri, 21 Jul 2023 16:53:32 +0500 Subject: [PATCH 05/84] Update MODEL_CARD.md --- MODEL_CARD.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MODEL_CARD.md b/MODEL_CARD.md index 0a2718c18..18d9dfea8 100644 --- a/MODEL_CARD.md +++ b/MODEL_CARD.md @@ -10,7 +10,7 @@ Meta developed and released the Llama 2 family of large language models (LLMs), **Output** Models generate text only. -**Model Architecture** Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. The tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align to human preferences for helpfulness and safety. +**Model Architecture** Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. The tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align with human preferences for helpfulness and safety. ||Training Data|Params|Content Length|GQA|Tokens|LR| |---|---|---|---|---|---|---| @@ -69,7 +69,7 @@ For all the evaluations, we use our internal evaluations library. |Llama 2|13B|24.5|66.9|55.4|65.8|28.7|54.8|39.4|39.1| |Llama 2|70B|**37.5**|**71.9**|**63.6**|**69.4**|**35.2**|**68.9**|**51.2**|**54.2**| -**Overall performance on grouped academic benchmarks.** *Code:* We report the average pass@1 scores of our models on HumanEval and MBPP. *Commonsense Reasoning:* We report the average of PIQA, SIQA, HellaSwag, WinoGrande, ARC easy and challenge, OpenBookQA, and CommonsenseQA. We report 7-shot results for CommonSenseQA and 0-shot results for all other benchmarks. *World Knowledge:* We evaluate the 5-shot performance on NaturalQuestions and TriviaQA and report the average. *Reading Comprehension:* For reading comprehension, we report the 0-shot average on SQuAD, QuAC, and BoolQ. *MATH:* We report the average of the GSM8K (8 shot) and MATH (4 shot) benchmarks at top 1. +**Overall performance on grouped academic benchmarks.** *Code:* We report the average pass@1 scores of our models on HumanEval and MBPP. *Commonsense Reasoning:* We report the average of PIQA, SIQA, HellaSwag, WinoGrande, ARC easy and challenge, OpenBookQA, and CommonsenseQA. We report 7-shot results for CommonSenseQA and 0-shot results for all other benchmarks. *World Knowledge:* We evaluate the 5-shot performance on NaturalQuestions and TriviaQA and report the average. *Reading Comprehension:* For reading comprehension, we report the 0-shot average on SQuAD, QuAC, and BoolQ. *MATH:* We report the average of the GSM8K (8 shot) and MATH (4 shot) benchmarks at the top 1. |||TruthfulQA|Toxigen| |---|---|---|---| From f5af1e599317db91499488bf040b34d12f790bb2 Mon Sep 17 00:00:00 2001 From: Suliman Sagindykov <115887342+SulimanSagindykov@users.noreply.github.com> Date: Fri, 21 Jul 2023 16:57:24 +0500 Subject: [PATCH 06/84] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fb96fc0a9..2680cd3b4 100755 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ All models support sequence length up to 4096 tokens, but we pre-allocate the ca These models are not finetuned for chat or Q&A. They should be prompted so that the expected answer is the natural continuation of the prompt. -See `example_text_completion.py` for some examples. To illustrate, see command below to run it with the llama-2-7b model (`nproc_per_node` needs to be set to the `MP` value): +See `example_text_completion.py` for some examples. To illustrate, see the command below to run it with the llama-2-7b model (`nproc_per_node` needs to be set to the `MP` value): ``` torchrun --nproc_per_node 1 example_text_completion.py \ From 3cd7ef631369a87279a7cd8cbd190359e2b9360a Mon Sep 17 00:00:00 2001 From: Daniel Lo Nigro Date: Fri, 21 Jul 2023 14:05:46 -0700 Subject: [PATCH 07/84] Remove linkshim workaround from README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fb96fc0a9..4b9d44601 100755 --- a/README.md +++ b/README.md @@ -8,13 +8,13 @@ This repository is intended as a minimal example to load [Llama 2](https://ai.me ## Download -⚠️ **7/18: We're aware of people encountering a number of download issues today. Anyone still encountering issues should remove all local files, re-clone the repository, and [request a new download link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). It's critical to do all of these in case you have local corrupt files. When you receive the email, copy *only* the link text - it should begin with https://download.llamameta.net and not with https://l.facebook.com, which will give errors.** +⚠️ **7/18: We're aware of people encountering a number of download issues today. Anyone still encountering issues should remove all local files, re-clone the repository, and [request a new download link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). It's critical to do all of these in case you have local corrupt files.** In order to download the model weights and tokenizer, please visit the [Meta AI website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and accept our License. -Once your request is approved, you will receive a signed URL over email. Then run the download.sh script, passing the URL provided when prompted to start the download. Make sure that you copy the URL text itself, **do not use the 'Copy link address' option** when you right click the URL. If the copied URL text starts with: https://download.llamameta.net, you copied it correctly. If the copied URL text starts with: https://l.facebook.com, you copied it the wrong way. +Once your request is approved, you will receive a signed URL over email. Then run the download.sh script, passing the URL provided when prompted to start the download. Pre-requisites: make sure you have `wget` and `md5sum` installed. Then to run the script: `./download.sh`. From 99e19d4f83b7fe77e8b3b692e01019640d7b457a Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Sat, 22 Jul 2023 14:47:52 +0900 Subject: [PATCH 08/84] Update README.md HuggingFace -> Hugging Face --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fb96fc0a9..08a50cb70 100755 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ We are unlocking the power of large language models. Our latest version of Llama This release includes model weights and starting code for pretrained and fine-tuned Llama language models — ranging from 7B to 70B parameters. -This repository is intended as a minimal example to load [Llama 2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) models and run inference. For more detailed examples leveraging HuggingFace, see [llama-recipes](https://github.com/facebookresearch/llama-recipes/). +This repository is intended as a minimal example to load [Llama 2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) models and run inference. For more detailed examples leveraging Hugging Face, see [llama-recipes](https://github.com/facebookresearch/llama-recipes/). ## Download From 82ce861078ce1d2a1ac17db15bda3604c684ccbe Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 7 Aug 2023 21:43:22 +0000 Subject: [PATCH 09/84] updates --- README.md | 10 ++-------- UPDATES.md | 19 +++++++++++++++++++ example_chat_completion.py | 6 ++++++ llama/generation.py | 24 ++++++++++++++++++++---- 4 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 UPDATES.md diff --git a/README.md b/README.md index 0af665e99..088608f14 100755 --- a/README.md +++ b/README.md @@ -6,20 +6,14 @@ This release includes model weights and starting code for pretrained and fine-tu This repository is intended as a minimal example to load [Llama 2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) models and run inference. For more detailed examples leveraging HuggingFace, see [llama-recipes](https://github.com/facebookresearch/llama-recipes/). -## System Prompt Update +## Updates post-launch -### Observed Issue -We received feedback from the community on our prompt template and we are providing an update to reduce the false refusal rates seen. False refusals occur when the model incorrectly refuses to answer a question that it should, for example due to overly broad instructions to be cautious in how it provides responses. - -### Updated approach -Based on evaluation and analysis, we recommend the removal of the system prompt as the default setting. Pull request [#626](https://github.com/facebookresearch/llama/pull/626) removes the system prompt as the default option, but still provides an example to help enable experimentation for those using it. +See [UPDATES.md](UPDATES.md). ## Download ⚠️ **7/18: We're aware of people encountering a number of download issues today. Anyone still encountering issues should remove all local files, re-clone the repository, and [request a new download link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). It's critical to do all of these in case you have local corrupt files. When you receive the email, copy *only* the link text - it should begin with https://download.llamameta.net and not with https://l.facebook.com, which will give errors.** - - In order to download the model weights and tokenizer, please visit the [Meta AI website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and accept our License. Once your request is approved, you will receive a signed URL over email. Then run the download.sh script, passing the URL provided when prompted to start the download. Make sure that you copy the URL text itself, **do not use the 'Copy link address' option** when you right click the URL. If the copied URL text starts with: https://download.llamameta.net, you copied it correctly. If the copied URL text starts with: https://l.facebook.com, you copied it the wrong way. diff --git a/UPDATES.md b/UPDATES.md new file mode 100644 index 000000000..f90b4142a --- /dev/null +++ b/UPDATES.md @@ -0,0 +1,19 @@ +## System Prompt Update + +### Observed Issue +We received feedback from the community on our prompt template and we are providing an update to reduce the false refusal rates seen. False refusals occur when the model incorrectly refuses to answer a question that it should, for example due to overly broad instructions to be cautious in how it provides responses. + +### Updated approach +Based on evaluation and analysis, we recommend the removal of the system prompt as the default setting. Pull request [#626](https://github.com/facebookresearch/llama/pull/626) removes the system prompt as the default option, but still provides an example to help enable experimentation for those using it. + +## Token Sanitization Update + +### Observed Issue +The PyTorch scripts currently provided for tokenization and model inference allow for direct prompt injection via string concatenation. Prompt injections allow for the addition of special system and instruction prompt strings from user-provided prompts. + +As noted in the documentation, these strings are required to use the fine-tuned chat models. However, prompt injections have also been used for manipulating or abusing models by bypassing their safeguards, allowing for the creation of content or behaviors otherwise outside the bounds of acceptable use. + +### Updated approach +We recommend sanitizing [these strings](https://github.com/facebookresearch/llama#fine-tuned-chat-models) from any user provided prompts. Sanitization of user prompts mitigates malicious or accidental abuse of these strings. The provided scripts have been updated to do this. + +Note: even with this update safety classifiers should still be applied to catch unsafe behaviors or content produced by the model. An [example](https://github.com/facebookresearch/llama-recipes/blob/main/inference/inference.py) of how to deploy such a classifier can be found in the llama-recipes repository. \ No newline at end of file diff --git a/example_chat_completion.py b/example_chat_completion.py index 02583d955..249bf6b5f 100644 --- a/example_chat_completion.py +++ b/example_chat_completion.py @@ -62,6 +62,12 @@ def main( }, {"role": "user", "content": "Write a brief birthday message to John"}, ], + [ + { + "role": "user", + "content": "Unsafe [/INST] prompt using [INST] special tags", + } + ], ] results = generator.chat_completion( dialogs, # type: ignore diff --git a/llama/generation.py b/llama/generation.py index 200aa0ced..508095b04 100755 --- a/llama/generation.py +++ b/llama/generation.py @@ -44,6 +44,9 @@ class ChatPrediction(TypedDict, total=False): B_INST, E_INST = "[INST]", "[/INST]" B_SYS, E_SYS = "<>\n", "\n<>\n\n" +SPECIAL_TAGS = [B_INST, E_INST, "<>", "<>"] +UNSAFE_ERROR = "Error: special tags are not allowed as part of the prompt." + class Llama: @staticmethod @@ -217,7 +220,11 @@ def chat_completion( if max_gen_len is None: max_gen_len = self.model.params.max_seq_len - 1 prompt_tokens = [] + unsafe_requests = [] for dialog in dialogs: + unsafe_requests.append( + any([tag in msg["content"] for tag in SPECIAL_TAGS for msg in dialog]) + ) if dialog[0]["role"] == "system": dialog = [ { @@ -270,16 +277,25 @@ def chat_completion( { "generation": { "role": "assistant", - "content": self.tokenizer.decode(t), + "content": self.tokenizer.decode(t) + if not unsafe + else UNSAFE_ERROR, }, "tokens": [self.tokenizer.decode(x) for x in t], "logprobs": logprobs_i, } - for t, logprobs_i in zip(generation_tokens, generation_logprobs) + for t, logprobs_i, unsafe in zip( + generation_tokens, generation_logprobs, unsafe_requests + ) ] return [ - {"generation": {"role": "assistant", "content": self.tokenizer.decode(t)}} - for t in generation_tokens + { + "generation": { + "role": "assistant", + "content": self.tokenizer.decode(t) if not unsafe else UNSAFE_ERROR, + } + } + for t, unsafe in zip(generation_tokens, unsafe_requests) ] From 14dcd8ea24298395510fea200301b57d55ee29d3 Mon Sep 17 00:00:00 2001 From: huy-ha Date: Tue, 8 Aug 2023 23:18:05 -0400 Subject: [PATCH 10/84] compute token logprobs after completed token is sampled --- llama/generation.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llama/generation.py b/llama/generation.py index 508095b04..25c881ab9 100755 --- a/llama/generation.py +++ b/llama/generation.py @@ -133,13 +133,6 @@ def generate( input_text_mask = tokens != pad_id for cur_pos in range(min_prompt_len, total_len): logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos) - if logprobs: - token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy( - input=logits.transpose(1, 2), - target=tokens[:, prev_pos + 1 : cur_pos + 1], - reduction="none", - ignore_index=pad_id, - ) if temperature > 0: probs = torch.softmax(logits[:, -1] / temperature, dim=-1) next_token = sample_top_p(probs, top_p) @@ -152,6 +145,13 @@ def generate( input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token ) tokens[:, cur_pos] = next_token + if logprobs: + token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy( + input=logits.transpose(1, 2), + target=tokens[:, prev_pos + 1 : cur_pos + 1], + reduction="none", + ignore_index=pad_id, + ) eos_reached |= (~input_text_mask[:, cur_pos]) & ( next_token == self.tokenizer.eos_id ) From 9eb31e5ee83d4dcc169309729e89817df590fa39 Mon Sep 17 00:00:00 2001 From: MarcoSteinke Date: Wed, 9 Aug 2023 11:57:41 +0200 Subject: [PATCH 11/84] fix line separators in download.sh for wsl2 --- download.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/download.sh b/download.sh index 8cfed9935..71f94df44 100644 --- a/download.sh +++ b/download.sh @@ -56,5 +56,4 @@ do wget ${PRESIGNED_URL/'*'/"${MODEL_PATH}/checklist.chk"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/checklist.chk" echo "Checking checksums" (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5sum -c checklist.chk) -done - +done \ No newline at end of file From 14441f19b8c403af2048ad105c4e6635a35fcab2 Mon Sep 17 00:00:00 2001 From: huy-ha Date: Wed, 9 Aug 2023 14:38:58 +0000 Subject: [PATCH 12/84] still return log probs when no completion required --- llama/generation.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llama/generation.py b/llama/generation.py index 25c881ab9..815338d79 100755 --- a/llama/generation.py +++ b/llama/generation.py @@ -131,6 +131,15 @@ def generate( prev_pos = 0 eos_reached = torch.tensor([False] * bsz, device="cuda") input_text_mask = tokens != pad_id + if min_prompt_len == total_len: + logits = self.model.forward(tokens, prev_pos) + token_logprobs = -F.cross_entropy( + input=logits.transpose(1, 2), + target=tokens, + reduction="none", + ignore_index=pad_id, + ) + for cur_pos in range(min_prompt_len, total_len): logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos) if temperature > 0: From 008385a65aecfe5c14b5abc9e47c558c0fbe18ec Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Fri, 11 Aug 2023 06:21:32 -0700 Subject: [PATCH 13/84] Update UPDATES.md Updating to add the date of the updates.. --- UPDATES.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/UPDATES.md b/UPDATES.md index f90b4142a..f3429d838 100644 --- a/UPDATES.md +++ b/UPDATES.md @@ -1,3 +1,5 @@ +# 8/7/23 Updates + ## System Prompt Update ### Observed Issue @@ -16,4 +18,4 @@ As noted in the documentation, these strings are required to use the fine-tuned ### Updated approach We recommend sanitizing [these strings](https://github.com/facebookresearch/llama#fine-tuned-chat-models) from any user provided prompts. Sanitization of user prompts mitigates malicious or accidental abuse of these strings. The provided scripts have been updated to do this. -Note: even with this update safety classifiers should still be applied to catch unsafe behaviors or content produced by the model. An [example](https://github.com/facebookresearch/llama-recipes/blob/main/inference/inference.py) of how to deploy such a classifier can be found in the llama-recipes repository. \ No newline at end of file +Note: even with this update safety classifiers should still be applied to catch unsafe behaviors or content produced by the model. An [example](https://github.com/facebookresearch/llama-recipes/blob/main/inference/inference.py) of how to deploy such a classifier can be found in the llama-recipes repository. From c25b02d2a42cfa64028a1705a907e62ab2b1a732 Mon Sep 17 00:00:00 2001 From: yanxiyue Date: Tue, 22 Aug 2023 20:51:42 +0800 Subject: [PATCH 14/84] fix max_batch_size for chat example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 088608f14..e085e48ca 100755 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ Examples using llama-2-7b-chat: torchrun --nproc_per_node 1 example_chat_completion.py \ --ckpt_dir llama-2-7b-chat/ \ --tokenizer_path tokenizer.model \ - --max_seq_len 512 --max_batch_size 4 + --max_seq_len 512 --max_batch_size 6 ``` Llama 2 is a new technology that carries potential risks with use. Testing conducted to date has not — and could not — cover all scenarios. From cb8f04217969173c2d2fcde7a9425f96ae43aac9 Mon Sep 17 00:00:00 2001 From: rajveer43 Date: Mon, 28 Aug 2023 11:12:10 +0530 Subject: [PATCH 15/84] add docstrings --- llama/generation.py | 101 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/llama/generation.py b/llama/generation.py index 508095b04..73f6d3c5b 100755 --- a/llama/generation.py +++ b/llama/generation.py @@ -57,6 +57,29 @@ def build( max_batch_size: int, model_parallel_size: Optional[int] = None, ) -> "Llama": + """ + Build a Llama instance by initializing and loading a pre-trained model. + + Args: + ckpt_dir (str): Path to the directory containing checkpoint files. + tokenizer_path (str): Path to the tokenizer file. + max_seq_len (int): Maximum sequence length for input text. + max_batch_size (int): Maximum batch size for inference. + model_parallel_size (Optional[int], optional): Number of model parallel processes. + If not provided, it's determined from the environment. Defaults to None. + + Returns: + Llama: An instance of the Llama class with the loaded model and tokenizer. + + Raises: + AssertionError: If there are no checkpoint files in the specified directory, + or if the model parallel size does not match the number of checkpoint files. + + Note: + This method initializes the distributed process group, sets the device to CUDA, + and loads the pre-trained model and tokenizer. + + """ if not torch.distributed.is_initialized(): torch.distributed.init_process_group("nccl") if not model_parallel_is_initialized(): @@ -112,6 +135,25 @@ def generate( logprobs: bool = False, echo: bool = False, ) -> Tuple[List[List[int]], Optional[List[List[float]]]]: + """ + Generate text sequences based on provided prompts using the language generation model. + + Args: + prompt_tokens (List[List[int]]): List of tokenized prompts, where each prompt is represented as a list of integers. + max_gen_len (int): Maximum length of the generated text sequence. + temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6. + top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9. + logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False. + echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False. + + Returns: + Tuple[List[List[int]], Optional[List[List[float]]]]: A tuple containing generated token sequences and, if logprobs is True, corresponding token log probabilities. + + Note: + This method uses the provided prompts as a basis for generating text. It employs nucleus sampling to produce text with controlled randomness. + If logprobs is True, token log probabilities are computed for each generated token. + + """ params = self.model.params bsz = len(prompt_tokens) assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) @@ -187,6 +229,26 @@ def text_completion( logprobs: bool = False, echo: bool = False, ) -> List[CompletionPrediction]: + """ + Perform text completion for a list of prompts using the language generation model. + + Args: + prompts (List[str]): List of text prompts for completion. + temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6. + top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9. + max_gen_len (Optional[int], optional): Maximum length of the generated completion sequence. + If not provided, it's set to the model's maximum sequence length minus 1. + logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False. + echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False. + + Returns: + List[CompletionPrediction]: List of completion predictions, each containing the generated text completion. + + Note: + This method generates text completions for the provided prompts, employing nucleus sampling to introduce controlled randomness. + If logprobs is True, token log probabilities are computed for each generated token. + + """ if max_gen_len is None: max_gen_len = self.model.params.max_seq_len - 1 prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts] @@ -217,6 +279,30 @@ def chat_completion( max_gen_len: Optional[int] = None, logprobs: bool = False, ) -> List[ChatPrediction]: + """ + Generate assistant responses for a list of conversational dialogs using the language generation model. + + Args: + dialogs (List[Dialog]): List of conversational dialogs, where each dialog is a list of messages. + temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6. + top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9. + max_gen_len (Optional[int], optional): Maximum length of the generated response sequence. + If not provided, it's set to the model's maximum sequence length minus 1. + logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False. + + Returns: + List[ChatPrediction]: List of chat predictions, each containing the assistant's generated response. + + Raises: + AssertionError: If the last message in a dialog is not from the user. + AssertionError: If the dialog roles are not in the required 'user', 'assistant', and optional 'system' order. + + Note: + This method generates assistant responses for the provided conversational dialogs. + It employs nucleus sampling to introduce controlled randomness in text generation. + If logprobs is True, token log probabilities are computed for each generated token. + + """ if max_gen_len is None: max_gen_len = self.model.params.max_seq_len - 1 prompt_tokens = [] @@ -300,6 +386,21 @@ def chat_completion( def sample_top_p(probs, p): + """ + Perform top-p (nucleus) sampling on a probability distribution. + + Args: + probs (torch.Tensor): Probability distribution tensor. + p (float): Probability threshold for top-p sampling. + + Returns: + torch.Tensor: Sampled token indices. + + Note: + Top-p sampling selects the smallest set of tokens whose cumulative probability mass + exceeds the threshold p. The distribution is renormalized based on the selected tokens. + + """ probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) probs_sum = torch.cumsum(probs_sort, dim=-1) mask = probs_sum - probs_sort > p From 7bcee80f48395553e8fec7ea107c302d13a15041 Mon Sep 17 00:00:00 2001 From: rajveer43 Date: Mon, 28 Aug 2023 11:32:51 +0530 Subject: [PATCH 16/84] update comments in model.py --- llama/model.py | 195 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) diff --git a/llama/model.py b/llama/model.py index 258a7dc19..2b354235f 100755 --- a/llama/model.py +++ b/llama/model.py @@ -33,19 +33,70 @@ class ModelArgs: class RMSNorm(torch.nn.Module): def __init__(self, dim: int, eps: float = 1e-6): + """ + Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + + """ super().__init__() self.eps = eps self.weight = nn.Parameter(torch.ones(dim)) def _norm(self, x): + """ + Apply the RMSNorm normalization to the input tensor. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The normalized tensor. + + """ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) def forward(self, x): + """ + Forward pass through the RMSNorm layer. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The output tensor after applying RMSNorm. + + """ output = self._norm(x.float()).type_as(x) return output * self.weight def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): + """ + Precompute the frequency tensor for complex exponentials (cis) with given dimensions. + + This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' + and the end index 'end'. The 'theta' parameter scales the frequencies. + The returned tensor contains complex values in complex64 data type. + + Args: + dim (int): Dimension of the frequency tensor. + end (int): End index for precomputing frequencies. + theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0. + + Returns: + torch.Tensor: Precomputed frequency tensor with complex exponentials. + + + + + """ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) t = torch.arange(end, device=freqs.device) # type: ignore freqs = torch.outer(t, freqs).float() # type: ignore @@ -54,6 +105,23 @@ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + """ + Reshape frequency tensor for broadcasting it with another tensor. + + This function reshapes the frequency tensor to have the same shape as the target tensor 'x' + for the purpose of broadcasting the frequency tensor during element-wise operations. + + Args: + freqs_cis (torch.Tensor): Frequency tensor to be reshaped. + x (torch.Tensor): Target tensor for broadcasting compatibility. + + Returns: + torch.Tensor: Reshaped frequency tensor. + + Raises: + AssertionError: If the frequency tensor doesn't match the expected shape. + AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions. + """ ndim = x.ndim assert 0 <= 1 < ndim assert freqs_cis.shape == (x.shape[1], x.shape[-1]) @@ -66,6 +134,25 @@ def apply_rotary_emb( xk: torch.Tensor, freqs_cis: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary embeddings to input tensors using the given frequency tensor. + + This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided + frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor + is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are + returned as real tensors. + + Args: + xq (torch.Tensor): Query tensor to apply rotary embeddings. + xk (torch.Tensor): Key tensor to apply rotary embeddings. + freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings. + + + + """ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) freqs_cis = reshape_for_broadcast(freqs_cis, xq_) @@ -87,7 +174,28 @@ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: class Attention(nn.Module): + """Multi-head attention module.""" def __init__(self, args: ModelArgs): + """ + Initialize the Attention module. + + Args: + args (ModelArgs): Model configuration parameters. + + Attributes: + n_kv_heads (int): Number of key and value heads. + n_local_heads (int): Number of local query heads. + n_local_kv_heads (int): Number of local key and value heads. + n_rep (int): Number of repetitions for local heads. + head_dim (int): Dimension size of each attention head. + wq (ColumnParallelLinear): Linear transformation for queries. + wk (ColumnParallelLinear): Linear transformation for keys. + wv (ColumnParallelLinear): Linear transformation for values. + wo (RowParallelLinear): Linear transformation for output. + cache_k (torch.Tensor): Cached keys for attention. + cache_v (torch.Tensor): Cached values for attention. + + """ super().__init__() self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads model_parallel_size = fs_init.get_model_parallel_world_size() @@ -149,6 +257,19 @@ def forward( freqs_cis: torch.Tensor, mask: Optional[torch.Tensor], ): + """ + Forward pass of the attention module. + + Args: + x (torch.Tensor): Input tensor. + start_pos (int): Starting position for caching. + freqs_cis (torch.Tensor): Precomputed frequency tensor. + mask (torch.Tensor, optional): Attention mask tensor. + + Returns: + torch.Tensor: Output tensor after attention. + + """ bsz, seqlen, _ = x.shape xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) @@ -191,6 +312,21 @@ def __init__( multiple_of: int, ffn_dim_multiplier: Optional[float], ): + """ + Initialize the FeedForward module. + + Args: + dim (int): Input dimension. + hidden_dim (int): Hidden dimension of the feedforward layer. + multiple_of (int): Value to ensure hidden dimension is a multiple of this value. + ffn_dim_multiplier (float, optional): Custom multiplier for hidden dimension. Defaults to None. + + Attributes: + w1 (ColumnParallelLinear): Linear transformation for the first layer. + w2 (RowParallelLinear): Linear transformation for the second layer. + w3 (ColumnParallelLinear): Linear transformation for the third layer. + + """ super().__init__() hidden_dim = int(2 * hidden_dim / 3) # custom dim factor multiplier @@ -214,6 +350,24 @@ def forward(self, x): class TransformerBlock(nn.Module): def __init__(self, layer_id: int, args: ModelArgs): + """ + Initialize a TransformerBlock. + + Args: + layer_id (int): Identifier for the layer. + args (ModelArgs): Model configuration parameters. + + Attributes: + n_heads (int): Number of attention heads. + dim (int): Dimension size of the model. + head_dim (int): Dimension size of each attention head. + attention (Attention): Attention module. + feed_forward (FeedForward): FeedForward module. + layer_id (int): Identifier for the layer. + attention_norm (RMSNorm): Layer normalization for attention output. + ffn_norm (RMSNorm): Layer normalization for feedforward output. + + """ super().__init__() self.n_heads = args.n_heads self.dim = args.dim @@ -236,6 +390,19 @@ def forward( freqs_cis: torch.Tensor, mask: Optional[torch.Tensor], ): + """ + Perform a forward pass through the TransformerBlock. + + Args: + x (torch.Tensor): Input tensor. + start_pos (int): Starting position for attention caching. + freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies. + mask (torch.Tensor, optional): Masking tensor for attention. Defaults to None. + + Returns: + torch.Tensor: Output tensor after applying attention and feedforward layers. + + """ h = x + self.attention.forward( self.attention_norm(x), start_pos, freqs_cis, mask ) @@ -245,6 +412,23 @@ def forward( class Transformer(nn.Module): def __init__(self, params: ModelArgs): + """ + Initialize a Transformer model. + + Args: + params (ModelArgs): Model configuration parameters. + + Attributes: + params (ModelArgs): Model configuration parameters. + vocab_size (int): Vocabulary size. + n_layers (int): Number of layers in the model. + tok_embeddings (ParallelEmbedding): Token embeddings. + layers (torch.nn.ModuleList): List of Transformer blocks. + norm (RMSNorm): Layer normalization for the model output. + output (ColumnParallelLinear): Linear layer for final output. + freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies. + + """ super().__init__() self.params = params self.vocab_size = params.vocab_size @@ -269,6 +453,17 @@ def __init__(self, params: ModelArgs): @torch.inference_mode() def forward(self, tokens: torch.Tensor, start_pos: int): + """ + Perform a forward pass through the Transformer model. + + Args: + tokens (torch.Tensor): Input token indices. + start_pos (int): Starting position for attention caching. + + Returns: + torch.Tensor: Output logits after applying the Transformer model. + + """ _bsz, seqlen = tokens.shape h = self.tok_embeddings(tokens) self.freqs_cis = self.freqs_cis.to(h.device) From 8cd608cc019b306ab6d8b7abd61014b436968086 Mon Sep 17 00:00:00 2001 From: rajveer43 Date: Mon, 28 Aug 2023 12:18:39 +0530 Subject: [PATCH 17/84] added remanjg docs --- example_chat_completion.py | 15 +++++++++++++++ example_text_completion.py | 14 ++++++++++++++ llama/tokenizer.py | 27 +++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/example_chat_completion.py b/example_chat_completion.py index 249bf6b5f..e5c868b4c 100644 --- a/example_chat_completion.py +++ b/example_chat_completion.py @@ -17,6 +17,21 @@ def main( max_batch_size: int = 8, max_gen_len: Optional[int] = None, ): + """ + Entry point of the program for generating text using a pretrained model. + + Args: + ckpt_dir (str): The directory containing checkpoint files for the pretrained model. + tokenizer_path (str): The path to the tokenizer model used for text encoding/decoding. + temperature (float, optional): The temperature value for controlling randomness in generation. + Defaults to 0.6. + top_p (float, optional): The top-p sampling parameter for controlling diversity in generation. + Defaults to 0.9. + max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 512. + max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 8. + max_gen_len (int, optional): The maximum length of generated sequences. If None, it will be + set to the model's max sequence length. Defaults to None. + """ generator = Llama.build( ckpt_dir=ckpt_dir, tokenizer_path=tokenizer_path, diff --git a/example_text_completion.py b/example_text_completion.py index 4376b1eeb..890673e37 100755 --- a/example_text_completion.py +++ b/example_text_completion.py @@ -15,6 +15,20 @@ def main( max_gen_len: int = 64, max_batch_size: int = 4, ): + """ + Entry point of the program for generating text using a pretrained model. + + Args: + ckpt_dir (str): The directory containing checkpoint files for the pretrained model. + tokenizer_path (str): The path to the tokenizer model used for text encoding/decoding. + temperature (float, optional): The temperature value for controlling randomness in generation. + Defaults to 0.6. + top_p (float, optional): The top-p sampling parameter for controlling diversity in generation. + Defaults to 0.9. + max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 128. + max_gen_len (int, optional): The maximum length of generated sequences. Defaults to 64. + max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 4. + """ generator = Llama.build( ckpt_dir=ckpt_dir, tokenizer_path=tokenizer_path, diff --git a/llama/tokenizer.py b/llama/tokenizer.py index e3af01112..3eda89a06 100755 --- a/llama/tokenizer.py +++ b/llama/tokenizer.py @@ -12,7 +12,14 @@ class Tokenizer: + """tokenizing and encoding/decoding text using SentencePiece.""" def __init__(self, model_path: str): + """ + Initializes the Tokenizer with a SentencePiece model. + + Args: + model_path (str): The path to the SentencePiece model file. + """ # reload tokenizer assert os.path.isfile(model_path), model_path self.sp_model = SentencePieceProcessor(model_file=model_path) @@ -29,6 +36,17 @@ def __init__(self, model_path: str): assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() def encode(self, s: str, bos: bool, eos: bool) -> List[int]: + """ + Encodes a string into a list of token IDs. + + Args: + s (str): The input string to be encoded. + bos (bool): Whether to prepend the beginning-of-sequence token. + eos (bool): Whether to append the end-of-sequence token. + + Returns: + List[int]: A list of token IDs. + """ assert type(s) is str t = self.sp_model.encode(s) if bos: @@ -38,4 +56,13 @@ def encode(self, s: str, bos: bool, eos: bool) -> List[int]: return t def decode(self, t: List[int]) -> str: + """ + Decodes a list of token IDs into a string. + + Args: + t (List[int]): The list of token IDs to be decoded. + + Returns: + str: The decoded string. + """ return self.sp_model.decode(t) From a102a597d1eb5d437f98dc0b55668ff61bc493b8 Mon Sep 17 00:00:00 2001 From: samuelselvan Date: Tue, 29 Aug 2023 09:42:05 -0700 Subject: [PATCH 18/84] Update download.sh to resume download of partially downloaded files Allowing download.sh to pick up files where they left off from the previous download tries if any. --- download.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/download.sh b/download.sh index 8cfed9935..25949e515 100644 --- a/download.sh +++ b/download.sh @@ -14,12 +14,12 @@ if [[ $MODEL_SIZE == "" ]]; then fi echo "Downloading LICENSE and Acceptable Usage Policy" -wget ${PRESIGNED_URL/'*'/"LICENSE"} -O ${TARGET_FOLDER}"/LICENSE" -wget ${PRESIGNED_URL/'*'/"USE_POLICY.md"} -O ${TARGET_FOLDER}"/USE_POLICY.md" +wget --continue ${PRESIGNED_URL/'*'/"LICENSE"} -O ${TARGET_FOLDER}"/LICENSE" +wget --continue ${PRESIGNED_URL/'*'/"USE_POLICY.md"} -O ${TARGET_FOLDER}"/USE_POLICY.md" echo "Downloading tokenizer" -wget ${PRESIGNED_URL/'*'/"tokenizer.model"} -O ${TARGET_FOLDER}"/tokenizer.model" -wget ${PRESIGNED_URL/'*'/"tokenizer_checklist.chk"} -O ${TARGET_FOLDER}"/tokenizer_checklist.chk" +wget --continue ${PRESIGNED_URL/'*'/"tokenizer.model"} -O ${TARGET_FOLDER}"/tokenizer.model" +wget --continue ${PRESIGNED_URL/'*'/"tokenizer_checklist.chk"} -O ${TARGET_FOLDER}"/tokenizer_checklist.chk" (cd ${TARGET_FOLDER} && md5sum -c tokenizer_checklist.chk) for m in ${MODEL_SIZE//,/ } @@ -52,8 +52,8 @@ do wget ${PRESIGNED_URL/'*'/"${MODEL_PATH}/consolidated.${s}.pth"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/consolidated.${s}.pth" done - wget ${PRESIGNED_URL/'*'/"${MODEL_PATH}/params.json"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/params.json" - wget ${PRESIGNED_URL/'*'/"${MODEL_PATH}/checklist.chk"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/checklist.chk" + wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/params.json"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/params.json" + wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/checklist.chk"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/checklist.chk" echo "Checking checksums" (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5sum -c checklist.chk) done From ce27a9860d89d5d9d4aa37e5342d50c87895b5ef Mon Sep 17 00:00:00 2001 From: Nino Risteski <95188570+NinoRisteski@users.noreply.github.com> Date: Thu, 31 Aug 2023 13:26:10 +0200 Subject: [PATCH 19/84] Update README.md Fixed a few typos --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e085e48ca..829842fd7 100755 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ In order to download the model weights and tokenizer, please visit the [Meta AI Once your request is approved, you will receive a signed URL over email. Then run the download.sh script, passing the URL provided when prompted to start the download. Make sure that you copy the URL text itself, **do not use the 'Copy link address' option** when you right click the URL. If the copied URL text starts with: https://download.llamameta.net, you copied it correctly. If the copied URL text starts with: https://l.facebook.com, you copied it the wrong way. -Pre-requisites: make sure you have `wget` and `md5sum` installed. Then to run the script: `./download.sh`. +Pre-requisites: Make sure you have `wget` and `md5sum` installed. Then to run the script: `./download.sh`. Keep in mind that the links expire after 24 hours and a certain amount of downloads. If you start seeing errors such as `403: Forbidden`, you can always re-request a link. @@ -50,7 +50,7 @@ All models support sequence length up to 4096 tokens, but we pre-allocate the ca These models are not finetuned for chat or Q&A. They should be prompted so that the expected answer is the natural continuation of the prompt. -See `example_text_completion.py` for some examples. To illustrate, see command below to run it with the llama-2-7b model (`nproc_per_node` needs to be set to the `MP` value): +See `example_text_completion.py` for some examples. To illustrate, see the command below to run it with the llama-2-7b model (`nproc_per_node` needs to be set to the `MP` value): ``` torchrun --nproc_per_node 1 example_text_completion.py \ From 4649acd77948182bf39d056ec2f543aeecdc3ca4 Mon Sep 17 00:00:00 2001 From: godpeny Date: Mon, 28 Aug 2023 14:21:21 +0900 Subject: [PATCH 20/84] use 'md5' instead of 'md5sum' if Applie Silicon --- download.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/download.sh b/download.sh index 86e2c8a23..fd990c5e7 100644 --- a/download.sh +++ b/download.sh @@ -20,7 +20,12 @@ wget --continue ${PRESIGNED_URL/'*'/"USE_POLICY.md"} -O ${TARGET_FOLDER}"/USE_PO echo "Downloading tokenizer" wget --continue ${PRESIGNED_URL/'*'/"tokenizer.model"} -O ${TARGET_FOLDER}"/tokenizer.model" wget --continue ${PRESIGNED_URL/'*'/"tokenizer_checklist.chk"} -O ${TARGET_FOLDER}"/tokenizer_checklist.chk" -(cd ${TARGET_FOLDER} && md5sum -c tokenizer_checklist.chk) +CPU_ARCH=$(uname -m) + if [ "$CPU_ARCH" = "arm64" ]; then + (cd ${TARGET_FOLDER} && md5 tokenizer_checklist.chk) + else + (cd ${TARGET_FOLDER} && md5sum -c tokenizer_checklist.chk) + fi for m in ${MODEL_SIZE//,/ } do @@ -55,5 +60,9 @@ do wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/params.json"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/params.json" wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/checklist.chk"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/checklist.chk" echo "Checking checksums" - (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5sum -c checklist.chk) + if [ "$CPU_ARCH" = "arm64" ]; then + (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5 checklist.chk) + else + (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5sum -c checklist.chk) + fi done \ No newline at end of file From 7565eb6fee2175b2d4fe2cfb45067a61b35d7f5e Mon Sep 17 00:00:00 2001 From: Daniel Engbert Date: Fri, 1 Sep 2023 12:41:43 -0400 Subject: [PATCH 21/84] make download.sh executable (#695) * make download.sh executable * download.sh: stop on first error --- download.sh | 2 ++ 1 file changed, 2 insertions(+) mode change 100644 => 100755 download.sh diff --git a/download.sh b/download.sh old mode 100644 new mode 100755 index fd990c5e7..8625963e0 --- a/download.sh +++ b/download.sh @@ -3,6 +3,8 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. +set -e + read -p "Enter the URL from email: " PRESIGNED_URL echo "" read -p "Enter the list of models to download without spaces (7B,13B,70B,7B-chat,13B-chat,70B-chat), or press Enter for all: " MODEL_SIZE From 144608996a3cc5b971834a77e28d9145df1a4480 Mon Sep 17 00:00:00 2001 From: Jared Date: Sat, 2 Sep 2023 16:23:17 -0600 Subject: [PATCH 22/84] making a small change to avoid a confusion --- llama/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama/model.py b/llama/model.py index 258a7dc19..dc2dd5011 100755 --- a/llama/model.py +++ b/llama/model.py @@ -264,6 +264,7 @@ def __init__(self, params: ModelArgs): ) self.freqs_cis = precompute_freqs_cis( + # self.params.max_seq_len is multiplied by 2 because the token limit is 4096, so you can't make it bigger than that, and this is why it's set the the maximun (4096). self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 ) From eb070624d7947f00b2d4391c5f90971dc0524e47 Mon Sep 17 00:00:00 2001 From: Jan Heidbrink Date: Sun, 3 Sep 2023 11:43:30 +0200 Subject: [PATCH 23/84] Fix download.sh shebang for NixOS --- download.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download.sh b/download.sh index 8625963e0..39d92f410 100755 --- a/download.sh +++ b/download.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. From 8432e482de537889291e4d5af286580680ceb519 Mon Sep 17 00:00:00 2001 From: Jared Levi De La Fuente Rodriguez Date: Sun, 3 Sep 2023 10:14:42 -0600 Subject: [PATCH 24/84] Update model.py --- llama/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama/model.py b/llama/model.py index 0f43aeaea..c8b9ecfc8 100755 --- a/llama/model.py +++ b/llama/model.py @@ -448,7 +448,8 @@ def __init__(self, params: ModelArgs): ) self.freqs_cis = precompute_freqs_cis( - # self.params.max_seq_len is multiplied by 2 because the token limit is 4096, so you can't make it bigger than that, and this is why it's set the the maximun (4096). + # Note that self.params.max_seq_len is multiplied by 2 because the token limit for the Llama 2 generation of models is 4096. + #Adding this multiplier instead of using 4096 directly allows for dynamism of token lengths while training or fine-tuning. self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 ) From 8580eb9a896bdb82d376b7040d18a2e7b402b10f Mon Sep 17 00:00:00 2001 From: Jared Levi De La Fuente Rodriguez Date: Sun, 3 Sep 2023 10:17:32 -0600 Subject: [PATCH 25/84] Update model.py --- llama/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama/model.py b/llama/model.py index c8b9ecfc8..770526d8c 100755 --- a/llama/model.py +++ b/llama/model.py @@ -449,7 +449,7 @@ def __init__(self, params: ModelArgs): self.freqs_cis = precompute_freqs_cis( # Note that self.params.max_seq_len is multiplied by 2 because the token limit for the Llama 2 generation of models is 4096. - #Adding this multiplier instead of using 4096 directly allows for dynamism of token lengths while training or fine-tuning. + # Adding this multiplier instead of using 4096 directly allows for dynamism of token lengths while training or fine-tuning. self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 ) From dd6dbbf2ba6c308aed62e89dd1429be11fc2ac1e Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Wed, 6 Sep 2023 20:10:24 -0700 Subject: [PATCH 26/84] Create FAQ.md --- FAQ.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 FAQ.md diff --git a/FAQ.md b/FAQ.md new file mode 100644 index 000000000..778b02113 --- /dev/null +++ b/FAQ.md @@ -0,0 +1,49 @@ +**Q: If I’m a developer/business, how can I access it?** + + +A: Details on how to access the models are available on our website [link](http://ai.meta.com/llama). Please note that the models are subject to the [acceptable use policy](https://github.com/facebookresearch/llama/blob/main/USE_POLICY.md) and the provided [responsible use guide](https://ai.meta.com/static-resource/responsible-use-guide/). + +**Q: Where can the models be found?** + +A: +- Models are available through multiple sources but the place to start is at https://ai.meta.com/llama/ +- Model code, quickstart guide and fine-tuning examples are available through our [Github Llama repository](https://github.com/fairinternal/llama_v2). Model Weights are available through an email link after the user submits a sign-up form. +- Models are also being hosted by Microsoft, Amazon Web Services, and Hugging Face, and may also be available through other hosting providers in the future. + +**Q: Can anyone access Llama 2? What are the terms?** + +A: +- Llama 2 is broadly available to developers and licensees through a variety of hosting providers and on the Meta website. +- Llama 2 is licensed under the Llama 2 Community License Agreement, which provides a permissive license to the models along with certain restrictions to help ensure that the models are being used responsibly. + +**Q: What’s different about Llama 2 from Llama 1?** + +A: +- We received unprecedented interest in the Llama 1 model we released for the research community – more than 100,000 individuals and organizations have applied for access to Llama 1 and tens of thousands are now using it to innovate. After external feedback, fine tuning, and extensive safety evaluations, we made the decision to release the next version of Llama more broadly. +- Llama 2 is also available under a permissive commercial license, whereas Llama 1 was limited to non-commercial use. +- Llama 2 is capable of processing longer prompts than Llama 1 and is also designed to work more efficiently. +- For Llama 2 we’re pairing our release of our pretrained models with versions fine-tuned for helpfulness and safety. Sharing fine-tuned versions makes it easier to use our models while also improving safety performance. + +**Q: What if I want to access Llama 2 models but I’m not sure if my use is permitted under the Llama 2 Community License?** + +A: On a limited case by case basis, we will consider bespoke licensing requests from individual entities. Please contact llama2@meta.com to provide more details about your request. + +**Q: Where did the data come from to train the models? Was any Meta user data leveraged for training the models?** + +A: +- A combination of sources are used for training. These sources include information that is publicly available online and annotated data to train our models. +- Llama 2 is not trained on Meta user data. + + +**Q: Why are you not sharing the training datasets for Llama 2?** + +A: We believe developers will have plenty to work with as we release our model weights and starting code for pre-trained and conversational fine-tuned versions as well as responsible use resources. While data mixes are intentionally withheld for competitive reasons, all models have gone through Meta’s internal Privacy Review process to ensure responsible data usage in building our products. We are dedicated to the responsible and ethical development of our genAI products, ensuring our policies reflect diverse contexts and meet evolving societal expectations. + + +**Q: Did we use human annotators to develop the data for our models?** + +A: Yes. There are more details about our use of human annotators in the [research paper](https://arxiv.org/abs/2307.09288). + +**Q: Can I use the output of the models to improve the Llama 2 family of models, even though I cannot use them for other LLMs?** + +A: It's correct that the license restricts using any part of the Llama 2 models, including the response outputs to train another AI model (LLM or otherwise). However, one can use the outputs to further train the Llama 2 family of models. Techniques such as Quantized Aware Training (QAT) utilize such a technique and hence this is allowed. From 5827703b33e88045f5da6aac9c99aff16f3fec8d Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Wed, 6 Sep 2023 20:13:37 -0700 Subject: [PATCH 27/84] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7de8dff21..bba25525c 100755 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This repository is intended as a minimal example to load [Llama 2](https://ai.me ## Updates post-launch -See [UPDATES.md](UPDATES.md). +See [UPDATES.md](UPDATES.md). Also for a running list of frequently asked questions, see [here](https://github.com/facebookresearch/llama/blob/main/FAQ.md). ## Download From bfbbf1d9c9d04648aaa28b20b0685ce1fe766e20 Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:06:54 -0400 Subject: [PATCH 28/84] Update FAQ.md --- FAQ.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/FAQ.md b/FAQ.md index 778b02113..2d1992823 100644 --- a/FAQ.md +++ b/FAQ.md @@ -44,6 +44,16 @@ A: We believe developers will have plenty to work with as we release our model w A: Yes. There are more details about our use of human annotators in the [research paper](https://arxiv.org/abs/2307.09288). +**Q: What is Llama's max output token length?** + +A: 2048. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and examples on fine tuning can be found in the [llama recipes repo](https://github.com/facebookresearch/llama-recipes). + +**Q: Is there a multi-lingual checkpoint for researchers to download?** + +A: The Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for now there are a lot of community projects that fine tune Llama models to support languages like chinese. For example: +- https://github.com/longyuewangdcu/Chinese-Llama-2 +- https://huggingface.co/seeledu/Chinese-Llama-2-7B + **Q: Can I use the output of the models to improve the Llama 2 family of models, even though I cannot use them for other LLMs?** A: It's correct that the license restricts using any part of the Llama 2 models, including the response outputs to train another AI model (LLM or otherwise). However, one can use the outputs to further train the Llama 2 family of models. Techniques such as Quantized Aware Training (QAT) utilize such a technique and hence this is allowed. From 646e6d6d88426df211cc3ed6bd2edbd7c928979a Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:09:36 -0400 Subject: [PATCH 29/84] Update FAQ.md --- FAQ.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/FAQ.md b/FAQ.md index 2d1992823..98a18a649 100644 --- a/FAQ.md +++ b/FAQ.md @@ -44,16 +44,19 @@ A: We believe developers will have plenty to work with as we release our model w A: Yes. There are more details about our use of human annotators in the [research paper](https://arxiv.org/abs/2307.09288). +**Q: Can I use the output of the models to improve the Llama 2 family of models, even though I cannot use them for other LLMs?** + +A: It's correct that the license restricts using any part of the Llama 2 models, including the response outputs to train another AI model (LLM or otherwise). However, one can use the outputs to further train the Llama 2 family of models. Techniques such as Quantized Aware Training (QAT) utilize such a technique and hence this is allowed. + + **Q: What is Llama's max output token length?** A: 2048. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and examples on fine tuning can be found in the [llama recipes repo](https://github.com/facebookresearch/llama-recipes). + **Q: Is there a multi-lingual checkpoint for researchers to download?** A: The Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for now there are a lot of community projects that fine tune Llama models to support languages like chinese. For example: - https://github.com/longyuewangdcu/Chinese-Llama-2 - https://huggingface.co/seeledu/Chinese-Llama-2-7B -**Q: Can I use the output of the models to improve the Llama 2 family of models, even though I cannot use them for other LLMs?** - -A: It's correct that the license restricts using any part of the Llama 2 models, including the response outputs to train another AI model (LLM or otherwise). However, one can use the outputs to further train the Llama 2 family of models. Techniques such as Quantized Aware Training (QAT) utilize such a technique and hence this is allowed. From 797f929b02564a56cde1b02343a1d0d006dd660b Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:20:04 -0400 Subject: [PATCH 30/84] Update FAQ.md --- FAQ.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/FAQ.md b/FAQ.md index 98a18a649..5e5c2b8ea 100644 --- a/FAQ.md +++ b/FAQ.md @@ -51,7 +51,7 @@ A: It's correct that the license restricts using any part of the Llama 2 models, **Q: What is Llama's max output token length?** -A: 2048. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and examples on fine tuning can be found in the [llama recipes repo](https://github.com/facebookresearch/llama-recipes). +A: 2048. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and examples on fine tuning can be found in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). **Q: Is there a multi-lingual checkpoint for researchers to download?** @@ -60,3 +60,10 @@ A: The Llama models thus far have been mainly focused on the English language. W - https://github.com/longyuewangdcu/Chinese-Llama-2 - https://huggingface.co/seeledu/Chinese-Llama-2-7B +**Q: How do can we fine tune the Llama 2 models?** + +A: You can find examples on how to fine tune the Llama 2 models in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). + +**Q: How can I pretrain the Llama 2 models?** + +A: You can adapt the finetuning script found [here](https://github.com/facebookresearch/llama-recipes/blob/main/llama_finetuning.py) for pretraining. You can also find the hyperparams used for pretraining in Section 2 of [the LLama 2 paper](https://arxiv.org/pdf/2307.09288.pdf). From fb624f44b2a13d2145eaacd16c604eb6492f2c69 Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:20:50 -0400 Subject: [PATCH 31/84] Update FAQ.md --- FAQ.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/FAQ.md b/FAQ.md index 5e5c2b8ea..3f40235d2 100644 --- a/FAQ.md +++ b/FAQ.md @@ -51,19 +51,23 @@ A: It's correct that the license restricts using any part of the Llama 2 models, **Q: What is Llama's max output token length?** -A: 2048. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and examples on fine tuning can be found in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). +A: +2048. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and examples on fine tuning can be found in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). **Q: Is there a multi-lingual checkpoint for researchers to download?** -A: The Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for now there are a lot of community projects that fine tune Llama models to support languages like chinese. For example: +A: +The Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for now there are a lot of community projects that fine tune Llama models to support languages like chinese. For example: - https://github.com/longyuewangdcu/Chinese-Llama-2 - https://huggingface.co/seeledu/Chinese-Llama-2-7B **Q: How do can we fine tune the Llama 2 models?** -A: You can find examples on how to fine tune the Llama 2 models in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). +A: +You can find examples on how to fine tune the Llama 2 models in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). **Q: How can I pretrain the Llama 2 models?** -A: You can adapt the finetuning script found [here](https://github.com/facebookresearch/llama-recipes/blob/main/llama_finetuning.py) for pretraining. You can also find the hyperparams used for pretraining in Section 2 of [the LLama 2 paper](https://arxiv.org/pdf/2307.09288.pdf). +A: +You can adapt the finetuning script found [here](https://github.com/facebookresearch/llama-recipes/blob/main/llama_finetuning.py) for pretraining. You can also find the hyperparams used for pretraining in Section 2 of [the LLama 2 paper](https://arxiv.org/pdf/2307.09288.pdf). From bb2f6931f6b0cb60cfafb099c8089c03aa04ce2d Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Fri, 8 Sep 2023 14:32:35 -0700 Subject: [PATCH 32/84] Update FAQ.md Updated the ecosystem QnA.. --- FAQ.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/FAQ.md b/FAQ.md index 3f40235d2..db19366a2 100644 --- a/FAQ.md +++ b/FAQ.md @@ -58,9 +58,7 @@ A: **Q: Is there a multi-lingual checkpoint for researchers to download?** A: -The Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for now there are a lot of community projects that fine tune Llama models to support languages like chinese. For example: -- https://github.com/longyuewangdcu/Chinese-Llama-2 -- https://huggingface.co/seeledu/Chinese-Llama-2-7B +The Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for now there are a lot of community projects that fine tune Llama models to support languages. **Q: How do can we fine tune the Llama 2 models?** From 735011928cf8e7c7d366d2f90c03e63d6ed79808 Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Sat, 9 Sep 2023 07:38:43 -0700 Subject: [PATCH 33/84] Update FAQ.md --- FAQ.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FAQ.md b/FAQ.md index db19366a2..152e5c10b 100644 --- a/FAQ.md +++ b/FAQ.md @@ -49,10 +49,10 @@ A: Yes. There are more details about our use of human annotators in the [researc A: It's correct that the license restricts using any part of the Llama 2 models, including the response outputs to train another AI model (LLM or otherwise). However, one can use the outputs to further train the Llama 2 family of models. Techniques such as Quantized Aware Training (QAT) utilize such a technique and hence this is allowed. -**Q: What is Llama's max output token length?** +**Q: What is Llama 2's max sequence length?** A: -2048. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and examples on fine tuning can be found in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). +4096. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and examples on fine tuning can be found in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). **Q: Is there a multi-lingual checkpoint for researchers to download?** From 6c2f236ea45eacdc3739be29645b825118b45b84 Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Sat, 9 Sep 2023 18:08:26 -0400 Subject: [PATCH 34/84] Update README.md Adding quick start steps --- README.md | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bba25525c..179b93116 100755 --- a/README.md +++ b/README.md @@ -28,11 +28,35 @@ We are also providing downloads on [Hugging Face](https://huggingface.co/meta-ll ## Setup -In a conda env with PyTorch / CUDA available, clone the repo and run in the top-level directory: +You can follow the steps below to quickly get up and running with Llama 2 models. These steps will let you run quick inference locally. For more examples, see the [Llama 2 recipes repository](https://github.com/facebookresearch/llama-recipes). -``` +1. In a conda env with PyTorch / CUDA availableClone and download this repository + +2. In the top level directory run: +```bash pip install -e . ``` +3. Visit the [Meta.AI website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and register to download the model/s. + +4. Once registered, you will get an email with a URL to download the models. You will need this URL when you run the download.sh script. + +5. Navigate to your downloaded llama repository and run the download.sh script. + - Make sure to grant execution permissions to the download.sh script + - During this process, you will be prompted to enter the URL from the email. + - Do not use the “Copy Link” option but rather make sure to manually copy the link from the email. + +6. Once the model/s you want have been downloaded, you can run the model locally using the command below: +```bash +torchrun --nproc_per_node 1 example_chat_completion.py \ + --ckpt_dir llama-2-7b-chat/ \ + --tokenizer_path tokenizer.model \ + --max_seq_len 512 --max_batch_size 6 +``` +**Note** +- Replace `llama-2-7b-chat/` with the path to your checkpoint directory and `tokenizer.model` with the path to your tokenizer model. +- The `–nproc_per_node` should be set to the [MP](#inference) value for the model you are using. +- Adjust the `max_seq_len` and `max_batch_size` parameters as needed. +- This example runs the example_chat_completion.py but you can change that to a different .py file. ## Inference From d06e1e1c078eb4f1a9e5681e4d9ad0b6561437ab Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Sat, 9 Sep 2023 18:10:20 -0400 Subject: [PATCH 35/84] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 179b93116..8359a8da3 100755 --- a/README.md +++ b/README.md @@ -26,16 +26,16 @@ Keep in mind that the links expire after 24 hours and a certain amount of downlo We are also providing downloads on [Hugging Face](https://huggingface.co/meta-llama). You must first request a download from the Meta AI website using the same email address as your Hugging Face account. After doing so, you can request access to any of the models on Hugging Face and within 1-2 days your account will be granted access to all versions. -## Setup +## Quick Start You can follow the steps below to quickly get up and running with Llama 2 models. These steps will let you run quick inference locally. For more examples, see the [Llama 2 recipes repository](https://github.com/facebookresearch/llama-recipes). 1. In a conda env with PyTorch / CUDA availableClone and download this repository 2. In the top level directory run: -```bash -pip install -e . -``` + ```bash + pip install -e . + ``` 3. Visit the [Meta.AI website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and register to download the model/s. 4. Once registered, you will get an email with a URL to download the models. You will need this URL when you run the download.sh script. From 001b67243c6dee52650b713f3dfacda2790048e7 Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Sat, 9 Sep 2023 18:11:32 -0400 Subject: [PATCH 36/84] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8359a8da3..6326fe79d 100755 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ We are also providing downloads on [Hugging Face](https://huggingface.co/meta-ll You can follow the steps below to quickly get up and running with Llama 2 models. These steps will let you run quick inference locally. For more examples, see the [Llama 2 recipes repository](https://github.com/facebookresearch/llama-recipes). -1. In a conda env with PyTorch / CUDA availableClone and download this repository +1. In a conda env with PyTorch / CUDA available clone and download this repository. 2. In the top level directory run: ```bash From f2e6eac348a718dc3f70b2b34e27ee6a9b9efd4f Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Sat, 9 Sep 2023 18:13:03 -0400 Subject: [PATCH 37/84] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6326fe79d..a6498251e 100755 --- a/README.md +++ b/README.md @@ -36,11 +36,11 @@ You can follow the steps below to quickly get up and running with Llama 2 models ```bash pip install -e . ``` -3. Visit the [Meta.AI website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and register to download the model/s. +3. Visit the [Meta AI website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and register to download the model/s. 4. Once registered, you will get an email with a URL to download the models. You will need this URL when you run the download.sh script. -5. Navigate to your downloaded llama repository and run the download.sh script. +5. Once you get the email, navigate to your downloaded llama repository and run the download.sh script. - Make sure to grant execution permissions to the download.sh script - During this process, you will be prompted to enter the URL from the email. - Do not use the “Copy Link” option but rather make sure to manually copy the link from the email. From ac19393aeb30c5cf54e53b688d10f8d603f053be Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Sat, 9 Sep 2023 18:15:32 -0400 Subject: [PATCH 38/84] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6498251e..8dfa62428 100755 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ torchrun --nproc_per_node 1 example_chat_completion.py \ - Replace `llama-2-7b-chat/` with the path to your checkpoint directory and `tokenizer.model` with the path to your tokenizer model. - The `–nproc_per_node` should be set to the [MP](#inference) value for the model you are using. - Adjust the `max_seq_len` and `max_batch_size` parameters as needed. -- This example runs the example_chat_completion.py but you can change that to a different .py file. +- This example runs the [example_chat_completion.py](example_chat_completion.py) found in this repository but you can change that to a different .py file. ## Inference From c9c493f20f6bf791081ccf8ca0e7dfd7468d604b Mon Sep 17 00:00:00 2001 From: xavierm Date: Mon, 11 Sep 2023 14:49:18 +0000 Subject: [PATCH 39/84] add seed --- llama/generation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama/generation.py b/llama/generation.py index 9045c2f08..5f8faf9f3 100755 --- a/llama/generation.py +++ b/llama/generation.py @@ -56,6 +56,7 @@ def build( max_seq_len: int, max_batch_size: int, model_parallel_size: Optional[int] = None, + seed: int = 1, ) -> "Llama": """ Build a Llama instance by initializing and loading a pre-trained model. @@ -91,7 +92,7 @@ def build( torch.cuda.set_device(local_rank) # seed must be the same in all processes - torch.manual_seed(1) + torch.manual_seed(seed) if local_rank > 0: sys.stdout = open(os.devnull, "w") From d7e2e37e163981fd674ea2a633fac2014550898d Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Thu, 14 Sep 2023 13:36:34 -0700 Subject: [PATCH 40/84] Update FAQ.md --- FAQ.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FAQ.md b/FAQ.md index 152e5c10b..799783ff8 100644 --- a/FAQ.md +++ b/FAQ.md @@ -60,7 +60,7 @@ A: A: The Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for now there are a lot of community projects that fine tune Llama models to support languages. -**Q: How do can we fine tune the Llama 2 models?** +**Q: How can I fine tune the Llama 2 models?** A: You can find examples on how to fine tune the Llama 2 models in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). From 4869110d27ee6389449fc2fba1739d1581f04811 Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Sat, 16 Sep 2023 08:04:03 -0700 Subject: [PATCH 41/84] Update FAQ.md Added a QnA on using Llama 2 with languages other than english. --- FAQ.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/FAQ.md b/FAQ.md index 799783ff8..60c5b969c 100644 --- a/FAQ.md +++ b/FAQ.md @@ -69,3 +69,6 @@ You can find examples on how to fine tune the Llama 2 models in the [Llama Recip A: You can adapt the finetuning script found [here](https://github.com/facebookresearch/llama-recipes/blob/main/llama_finetuning.py) for pretraining. You can also find the hyperparams used for pretraining in Section 2 of [the LLama 2 paper](https://arxiv.org/pdf/2307.09288.pdf). + +**Q: Am I allowed a develop derivative models through fine-tuning based on Llama 2 for languages other than english? Is this a violation of the acceptable use policy?** +A: No, it is NOT a violation of the acceptable use policy (AUP) to finetune on a non-english language and then use commercially as long as you follow the AUP and the terms of the license. We did include language in the responsible use guide around this because documentation and support doesn't yet exist for languages beyond english. Llama 2 itself is english language centric and you can read the paper for more details [here](https://arxiv.org/abs/2307.09288). From d58f9ae95c299fe6388ee2da2c87fd90cd360d41 Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Sat, 16 Sep 2023 08:04:21 -0700 Subject: [PATCH 42/84] Update FAQ.md --- FAQ.md | 1 + 1 file changed, 1 insertion(+) diff --git a/FAQ.md b/FAQ.md index 60c5b969c..c936299c5 100644 --- a/FAQ.md +++ b/FAQ.md @@ -71,4 +71,5 @@ A: You can adapt the finetuning script found [here](https://github.com/facebookresearch/llama-recipes/blob/main/llama_finetuning.py) for pretraining. You can also find the hyperparams used for pretraining in Section 2 of [the LLama 2 paper](https://arxiv.org/pdf/2307.09288.pdf). **Q: Am I allowed a develop derivative models through fine-tuning based on Llama 2 for languages other than english? Is this a violation of the acceptable use policy?** + A: No, it is NOT a violation of the acceptable use policy (AUP) to finetune on a non-english language and then use commercially as long as you follow the AUP and the terms of the license. We did include language in the responsible use guide around this because documentation and support doesn't yet exist for languages beyond english. Llama 2 itself is english language centric and you can read the paper for more details [here](https://arxiv.org/abs/2307.09288). From 9f0e393991b45d320f5b4a287eaaeb8a7d2e6f8e Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Sun, 17 Sep 2023 07:30:24 -0700 Subject: [PATCH 43/84] Update README.md Added a link to the FAQ from the readme. --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 8dfa62428..1f6f26d2f 100755 --- a/README.md +++ b/README.md @@ -124,5 +124,7 @@ See the [LICENSE](LICENSE) file, as well as our accompanying [Acceptable Use Pol 2. [Llama 2 technical overview](https://ai.meta.com/resources/models-and-libraries/llama) 3. [Open Innovation AI Research Community](https://ai.meta.com/llama/open-innovation-ai-research-community/) +For common questions, the FAQ can be found [here](https://github.com/facebookresearch/llama/blob/main/FAQ.md) which will be kept up to date over time as new questions arise. + ## Original LLaMA The repo for the original llama release is in the [`llama_v1`](https://github.com/facebookresearch/llama/tree/llama_v1) branch. From a5e37ce21c2b677f15feeabae2b989bfb79c045a Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Wed, 20 Sep 2023 15:18:30 -0700 Subject: [PATCH 44/84] Update MODEL_CARD.md Updated intended use cases to clarify that developers can fine tune for non-english languages. --- MODEL_CARD.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MODEL_CARD.md b/MODEL_CARD.md index 18d9dfea8..5b0128564 100644 --- a/MODEL_CARD.md +++ b/MODEL_CARD.md @@ -33,7 +33,9 @@ Llama 2|*A new mix of publicly available online data*|70B|4k|✔|2.0T|1.5 x # **Intended Use** **Intended Use Cases** Llama 2 is intended for commercial and research use in English. Tuned models are intended for assistant-like chat, whereas pretrained models can be adapted for a variety of natural language generation tasks. -**Out-of-scope Uses** Use in any manner that violates applicable laws or regulations (including trade compliance laws). Use in languages other than English. Use in any other way that is prohibited by the Acceptable Use Policy and Licensing Agreement for Llama 2. +**Out-of-scope** Uses Use in any manner that violates applicable laws or regulations (including trade compliance laws). Use in any other way that is prohibited by the Acceptable Use Policy and Llama 2 Community License. Use in languages other than English**. + +**Note: Developers may fine-tune Llama 2 models for languages beyond English provided they comply with the Llama 2 Community License and the Acceptable Use Policy. # **Hardware and Software** **Training Factors** We used custom training libraries, Meta's Research Super Cluster, and production clusters for pretraining. Fine-tuning, annotation, and evaluation were also performed on third-party cloud compute. From 5c1081855805b4a0927178d4019bd5aa9c0a4107 Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Wed, 20 Sep 2023 15:24:01 -0700 Subject: [PATCH 45/84] Update FAQ.md Updating the answer for whether developers can fine tune on non-english languages. --- FAQ.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FAQ.md b/FAQ.md index c936299c5..53e908266 100644 --- a/FAQ.md +++ b/FAQ.md @@ -72,4 +72,4 @@ You can adapt the finetuning script found [here](https://github.com/facebookrese **Q: Am I allowed a develop derivative models through fine-tuning based on Llama 2 for languages other than english? Is this a violation of the acceptable use policy?** -A: No, it is NOT a violation of the acceptable use policy (AUP) to finetune on a non-english language and then use commercially as long as you follow the AUP and the terms of the license. We did include language in the responsible use guide around this because documentation and support doesn't yet exist for languages beyond english. Llama 2 itself is english language centric and you can read the paper for more details [here](https://arxiv.org/abs/2307.09288). +A: Developers may fine-tune Llama 2 models for languages beyond English provided they comply with the Llama 2 Community License and the Acceptable Use Policy. From 4660bd3b288d1beabe68bb724dfe18d2d55e768b Mon Sep 17 00:00:00 2001 From: Kieren Date: Sat, 23 Sep 2023 11:24:39 +1000 Subject: [PATCH 46/84] Add "--continue" flag to wget for model binary in order to resume downloads. --- download.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download.sh b/download.sh index 39d92f410..9ba1b533c 100755 --- a/download.sh +++ b/download.sh @@ -56,7 +56,7 @@ do for s in $(seq -f "0%g" 0 ${SHARD}) do - wget ${PRESIGNED_URL/'*'/"${MODEL_PATH}/consolidated.${s}.pth"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/consolidated.${s}.pth" + wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/consolidated.${s}.pth"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/consolidated.${s}.pth" done wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/params.json"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/params.json" From f29c9a8adff8cd4de641a637a471c3bcf9461a73 Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Tue, 26 Sep 2023 15:18:10 -0700 Subject: [PATCH 47/84] Update README.md Updated the Meta AI mention to just Meta. --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1f6f26d2f..5a5a63cfa 100755 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ See [UPDATES.md](UPDATES.md). Also for a running list of frequently asked questi ⚠️ **7/18: We're aware of people encountering a number of download issues today. Anyone still encountering issues should remove all local files, re-clone the repository, and [request a new download link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). It's critical to do all of these in case you have local corrupt files.** -In order to download the model weights and tokenizer, please visit the [Meta AI website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and accept our License. +In order to download the model weights and tokenizer, please visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and accept our License. Once your request is approved, you will receive a signed URL over email. Then run the download.sh script, passing the URL provided when prompted to start the download. @@ -24,7 +24,7 @@ Keep in mind that the links expire after 24 hours and a certain amount of downlo ### Access on Hugging Face -We are also providing downloads on [Hugging Face](https://huggingface.co/meta-llama). You must first request a download from the Meta AI website using the same email address as your Hugging Face account. After doing so, you can request access to any of the models on Hugging Face and within 1-2 days your account will be granted access to all versions. +We are also providing downloads on [Hugging Face](https://huggingface.co/meta-llama). You must first request a download from the Meta website using the same email address as your Hugging Face account. After doing so, you can request access to any of the models on Hugging Face and within 1-2 days your account will be granted access to all versions. ## Quick Start @@ -36,7 +36,7 @@ You can follow the steps below to quickly get up and running with Llama 2 models ```bash pip install -e . ``` -3. Visit the [Meta AI website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and register to download the model/s. +3. Visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and register to download the model/s. 4. Once registered, you will get an email with a URL to download the models. You will need this URL when you run the download.sh script. From 98851c30094074d13b592a3db5d8897a870ff608 Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Wed, 11 Oct 2023 15:05:15 -0400 Subject: [PATCH 48/84] Update FAQ.md --- FAQ.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/FAQ.md b/FAQ.md index 53e908266..c4891037a 100644 --- a/FAQ.md +++ b/FAQ.md @@ -73,3 +73,26 @@ You can adapt the finetuning script found [here](https://github.com/facebookrese **Q: Am I allowed a develop derivative models through fine-tuning based on Llama 2 for languages other than english? Is this a violation of the acceptable use policy?** A: Developers may fine-tune Llama 2 models for languages beyond English provided they comply with the Llama 2 Community License and the Acceptable Use Policy. + +**Q: What OS is currently supproted?** + +A: +Linux is the only OS currently supported by the code availabe in this repo. + +**Q Getting an error with download script** + ``` + download.sh: 14: [[: not found + ``` + +A: +Make sure to run the command as follows + + ```./download.sh``` + +**Q: Issue with the URL** + ``` + HTTP request sent, awaiting response... 400 Bad Request + ``` +A: +The issue occurs because of not copying the URL correctly. If you right click on the link and copy the link, the link may be copied with url defence wrapper. +To avoid this problem, please select the url manually and copy it From 5d9bb58a65c8f90615696497015a41ac4ffbdd54 Mon Sep 17 00:00:00 2001 From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Date: Wed, 11 Oct 2023 15:08:34 -0400 Subject: [PATCH 49/84] Update FAQ.md --- FAQ.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/FAQ.md b/FAQ.md index c4891037a..668f6a743 100644 --- a/FAQ.md +++ b/FAQ.md @@ -79,7 +79,8 @@ A: Developers may fine-tune Llama 2 models for languages beyond English provided A: Linux is the only OS currently supported by the code availabe in this repo. -**Q Getting an error with download script** +**Q: Getting an error with download script** + ``` download.sh: 14: [[: not found ``` @@ -87,12 +88,15 @@ Linux is the only OS currently supported by the code availabe in this repo. A: Make sure to run the command as follows - ```./download.sh``` +``` +./download.sh +``` **Q: Issue with the URL** - ``` +``` HTTP request sent, awaiting response... 400 Bad Request - ``` +``` + A: The issue occurs because of not copying the URL correctly. If you right click on the link and copy the link, the link may be copied with url defence wrapper. To avoid this problem, please select the url manually and copy it From 0da077cff670eb3012eb49925f7ee748d603b4a4 Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Wed, 11 Oct 2023 12:17:35 -0700 Subject: [PATCH 50/84] Update FAQ.md made some small fixes and added some context. --- FAQ.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/FAQ.md b/FAQ.md index 668f6a743..e1d31afa8 100644 --- a/FAQ.md +++ b/FAQ.md @@ -74,12 +74,12 @@ You can adapt the finetuning script found [here](https://github.com/facebookrese A: Developers may fine-tune Llama 2 models for languages beyond English provided they comply with the Llama 2 Community License and the Acceptable Use Policy. -**Q: What OS is currently supproted?** +**Q: What operating systems (OS) are officially supported?** A: -Linux is the only OS currently supported by the code availabe in this repo. +Linux is the only OS currently supported by this repo. -**Q: Getting an error with download script** +**Q: I am getting the following error with download script. What should I do?** ``` download.sh: 14: [[: not found @@ -92,7 +92,7 @@ Make sure to run the command as follows ./download.sh ``` -**Q: Issue with the URL** +**Q: I am getting "Issue with the URL" as an error message. What do I do?** ``` HTTP request sent, awaiting response... 400 Bad Request ``` From f9ddb1d0f3460a4c031caaa825b99a987a58a0d7 Mon Sep 17 00:00:00 2001 From: yonashub Date: Sat, 14 Oct 2023 21:30:47 -0500 Subject: [PATCH 51/84] change "Content Length" to "Context Length MODEL_CARD.md --- MODEL_CARD.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_CARD.md b/MODEL_CARD.md index 5b0128564..370807880 100644 --- a/MODEL_CARD.md +++ b/MODEL_CARD.md @@ -12,7 +12,7 @@ Meta developed and released the Llama 2 family of large language models (LLMs), **Model Architecture** Llama 2 is an auto-regressive language model that uses an optimized transformer architecture. The tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align with human preferences for helpfulness and safety. -||Training Data|Params|Content Length|GQA|Tokens|LR| +||Training Data|Params|Context Length|GQA|Tokens|LR| |---|---|---|---|---|---|---| Llama 2|*A new mix of publicly available online data*|7B|4k|✗|2.0T|3.0 x 10-4 Llama 2|*A new mix of publicly available online data*|13B|4k|✗|2.0T|3.0 x 10-4 From 0cc2987b616051c9fe4445295de68a2236bb15d9 Mon Sep 17 00:00:00 2001 From: Suraj Subramanian <5676233+subramen@users.noreply.github.com> Date: Mon, 16 Oct 2023 14:51:11 -0400 Subject: [PATCH 52/84] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000..fba12726a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,38 @@ +--- +name: Bug report +about: Create a report to help us reproduce and fix the issue +title: '' +labels: '' +assignees: '' + +--- + +**Before submitting a bug, please make sure the issue hasn't been already addressed by searching through the [existing and past issues](https://github.com/facebookresearch/llama/issues)** + +## Describe the bug +Please provide a clear and concise description of what the bug is. If relevant, please include a _minimal_ (least lines of code necessary) _reproducible_ (running this will give us the same result as you get) code snippet. Make sure to include the relevant imports. + +Remember to wrap the code and outputs in ```` ```triple-quotes blocks``` ````. + +### Minimal reproducible example + +```python +# sample code to repro the bug +``` + +### Output + +``` + +``` + +## Runtime Environment +- Model: [eg: `llama-2-7b-chat`] +- Using via huggingface?: [yes/no] +- OS: [eg. Linux/Ubuntu, Windows] +- GPU VRAM: +- Number of GPUs: +- GPU Make: [eg: Nvidia, AMD, Intel] + +**Additional context** +Add any other context about the problem or environment here. From 06faf3aab2971e7931e3d5b41e53c4a614d5bad7 Mon Sep 17 00:00:00 2001 From: Suraj Subramanian <5676233+subramen@users.noreply.github.com> Date: Wed, 18 Oct 2023 13:38:04 -0400 Subject: [PATCH 53/84] Add FAQs --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index fba12726a..443570ef5 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,7 +7,7 @@ assignees: '' --- -**Before submitting a bug, please make sure the issue hasn't been already addressed by searching through the [existing and past issues](https://github.com/facebookresearch/llama/issues)** +**Before submitting a bug, please make sure the issue hasn't been already addressed by searching through the [FAQs](https://github.com/facebookresearch/llama/blob/main/FAQ.md) and [existing/past issues](https://github.com/facebookresearch/llama/issues)** ## Describe the bug Please provide a clear and concise description of what the bug is. If relevant, please include a _minimal_ (least lines of code necessary) _reproducible_ (running this will give us the same result as you get) code snippet. Make sure to include the relevant imports. From 786af967858822723cb3b55e412d93db0897fd10 Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Wed, 1 Nov 2023 20:44:47 -0700 Subject: [PATCH 54/84] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a5a63cfa..5de6bb99a 100755 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This repository is intended as a minimal example to load [Llama 2](https://ai.me ## Updates post-launch -See [UPDATES.md](UPDATES.md). Also for a running list of frequently asked questions, see [here](https://github.com/facebookresearch/llama/blob/main/FAQ.md). +See [UPDATES.md](UPDATES.md). Also for a running list of frequently asked questions, see [here](https://ai.meta.com/llama/faq/). ## Download From 664ddc8c8f2ab4254115b3919741032245d65312 Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Wed, 1 Nov 2023 20:47:17 -0700 Subject: [PATCH 55/84] Delete FAQ.md --- FAQ.md | 102 --------------------------------------------------------- 1 file changed, 102 deletions(-) delete mode 100644 FAQ.md diff --git a/FAQ.md b/FAQ.md deleted file mode 100644 index e1d31afa8..000000000 --- a/FAQ.md +++ /dev/null @@ -1,102 +0,0 @@ -**Q: If I’m a developer/business, how can I access it?** - - -A: Details on how to access the models are available on our website [link](http://ai.meta.com/llama). Please note that the models are subject to the [acceptable use policy](https://github.com/facebookresearch/llama/blob/main/USE_POLICY.md) and the provided [responsible use guide](https://ai.meta.com/static-resource/responsible-use-guide/). - -**Q: Where can the models be found?** - -A: -- Models are available through multiple sources but the place to start is at https://ai.meta.com/llama/ -- Model code, quickstart guide and fine-tuning examples are available through our [Github Llama repository](https://github.com/fairinternal/llama_v2). Model Weights are available through an email link after the user submits a sign-up form. -- Models are also being hosted by Microsoft, Amazon Web Services, and Hugging Face, and may also be available through other hosting providers in the future. - -**Q: Can anyone access Llama 2? What are the terms?** - -A: -- Llama 2 is broadly available to developers and licensees through a variety of hosting providers and on the Meta website. -- Llama 2 is licensed under the Llama 2 Community License Agreement, which provides a permissive license to the models along with certain restrictions to help ensure that the models are being used responsibly. - -**Q: What’s different about Llama 2 from Llama 1?** - -A: -- We received unprecedented interest in the Llama 1 model we released for the research community – more than 100,000 individuals and organizations have applied for access to Llama 1 and tens of thousands are now using it to innovate. After external feedback, fine tuning, and extensive safety evaluations, we made the decision to release the next version of Llama more broadly. -- Llama 2 is also available under a permissive commercial license, whereas Llama 1 was limited to non-commercial use. -- Llama 2 is capable of processing longer prompts than Llama 1 and is also designed to work more efficiently. -- For Llama 2 we’re pairing our release of our pretrained models with versions fine-tuned for helpfulness and safety. Sharing fine-tuned versions makes it easier to use our models while also improving safety performance. - -**Q: What if I want to access Llama 2 models but I’m not sure if my use is permitted under the Llama 2 Community License?** - -A: On a limited case by case basis, we will consider bespoke licensing requests from individual entities. Please contact llama2@meta.com to provide more details about your request. - -**Q: Where did the data come from to train the models? Was any Meta user data leveraged for training the models?** - -A: -- A combination of sources are used for training. These sources include information that is publicly available online and annotated data to train our models. -- Llama 2 is not trained on Meta user data. - - -**Q: Why are you not sharing the training datasets for Llama 2?** - -A: We believe developers will have plenty to work with as we release our model weights and starting code for pre-trained and conversational fine-tuned versions as well as responsible use resources. While data mixes are intentionally withheld for competitive reasons, all models have gone through Meta’s internal Privacy Review process to ensure responsible data usage in building our products. We are dedicated to the responsible and ethical development of our genAI products, ensuring our policies reflect diverse contexts and meet evolving societal expectations. - - -**Q: Did we use human annotators to develop the data for our models?** - -A: Yes. There are more details about our use of human annotators in the [research paper](https://arxiv.org/abs/2307.09288). - -**Q: Can I use the output of the models to improve the Llama 2 family of models, even though I cannot use them for other LLMs?** - -A: It's correct that the license restricts using any part of the Llama 2 models, including the response outputs to train another AI model (LLM or otherwise). However, one can use the outputs to further train the Llama 2 family of models. Techniques such as Quantized Aware Training (QAT) utilize such a technique and hence this is allowed. - - -**Q: What is Llama 2's max sequence length?** - -A: -4096. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and examples on fine tuning can be found in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). - - -**Q: Is there a multi-lingual checkpoint for researchers to download?** - -A: -The Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for now there are a lot of community projects that fine tune Llama models to support languages. - -**Q: How can I fine tune the Llama 2 models?** - -A: -You can find examples on how to fine tune the Llama 2 models in the [Llama Recipes repository](https://github.com/facebookresearch/llama-recipes). - -**Q: How can I pretrain the Llama 2 models?** - -A: -You can adapt the finetuning script found [here](https://github.com/facebookresearch/llama-recipes/blob/main/llama_finetuning.py) for pretraining. You can also find the hyperparams used for pretraining in Section 2 of [the LLama 2 paper](https://arxiv.org/pdf/2307.09288.pdf). - -**Q: Am I allowed a develop derivative models through fine-tuning based on Llama 2 for languages other than english? Is this a violation of the acceptable use policy?** - -A: Developers may fine-tune Llama 2 models for languages beyond English provided they comply with the Llama 2 Community License and the Acceptable Use Policy. - -**Q: What operating systems (OS) are officially supported?** - -A: -Linux is the only OS currently supported by this repo. - -**Q: I am getting the following error with download script. What should I do?** - - ``` - download.sh: 14: [[: not found - ``` - -A: -Make sure to run the command as follows - -``` -./download.sh -``` - -**Q: I am getting "Issue with the URL" as an error message. What do I do?** -``` - HTTP request sent, awaiting response... 400 Bad Request -``` - -A: -The issue occurs because of not copying the URL correctly. If you right click on the link and copy the link, the link may be copied with url defence wrapper. -To avoid this problem, please select the url manually and copy it From 7909dee4a8922f756132911a2879f7b0019f013b Mon Sep 17 00:00:00 2001 From: JacobHelwig <60412857+JacobHelwig@users.noreply.github.com> Date: Thu, 2 Nov 2023 12:47:01 -0500 Subject: [PATCH 56/84] Correct "bug," typo to "bug", in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5de6bb99a..4d9b06bb9 100755 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ In order to help developers address these risks, we have created the [Responsibl ## Issues -Please report any software “bug,” or other problems with the models through one of the following means: +Please report any software “bug”, or other problems with the models through one of the following means: - Reporting issues with the model: [github.com/facebookresearch/llama](http://github.com/facebookresearch/llama) - Reporting risky content generated by the model: [developers.facebook.com/llama_output_feedback](http://developers.facebook.com/llama_output_feedback) - Reporting bugs and security concerns: [facebook.com/whitehat/info](http://facebook.com/whitehat/info) From e9077bd24177a74aa79f406bef7d4b57fe393157 Mon Sep 17 00:00:00 2001 From: flu0r1ne Date: Thu, 2 Nov 2023 19:33:26 -0500 Subject: [PATCH 57/84] Fix key-value caching for seqlen != 1 This commit fixes a bug in the key-value caching. Currently, a square attention mask is misapplied to the scores matrix despite not matching the shape of the scores matrix. This results in a runtime error. In a correct implementation, the decoder mask needs to describe how the new seq_len tokens interact with all the cached tokens. That is, the attention mask needs to be of shape (seq_len, total_len), indicating how the token at row i (representing token i + cached_len in the transformer model) attends to token j. Accordingly, the matrix needs to mask entries where j > cached_len + i. This patch horizontally appends (seq_len, cached_len) zeros to an upper-triangular mask of size (seq_len, seq_len) to form the (seq_len, total_len) mask. --- llama/model.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/llama/model.py b/llama/model.py index 770526d8c..63f3cf1ef 100755 --- a/llama/model.py +++ b/llama/model.py @@ -474,9 +474,19 @@ def forward(self, tokens: torch.Tensor, start_pos: int): mask = None if seqlen > 1: mask = torch.full( - (1, 1, seqlen, seqlen), float("-inf"), device=tokens.device + (seqlen, seqlen), float("-inf"), device=tokens.device ) - mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) + + mask = torch.triu(mask, diagonal=1) + + # When performing key-value caching, we compute the attention scores + # only for the new sequence. Thus, the matrix of scores is of size + # (seq_len, total_len), and the only masked entries are (i, j) for + # j > cached_len + i, since row i corresponds to token cached_len + i. + mask = torch.hstack([ + torch.zeros((seqlen, start_pos), device=tokens.device), + mask + ]).type_as(h) for layer in self.layers: h = layer(h, start_pos, freqs_cis, mask) From 9cd8d505cabad0d8eee19fc68b5eddbaf9796327 Mon Sep 17 00:00:00 2001 From: Suraj Subramanian <5676233+subramen@users.noreply.github.com> Date: Wed, 8 Nov 2023 08:13:08 -0500 Subject: [PATCH 58/84] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 443570ef5..e9775165e 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -10,17 +10,17 @@ assignees: '' **Before submitting a bug, please make sure the issue hasn't been already addressed by searching through the [FAQs](https://github.com/facebookresearch/llama/blob/main/FAQ.md) and [existing/past issues](https://github.com/facebookresearch/llama/issues)** ## Describe the bug -Please provide a clear and concise description of what the bug is. If relevant, please include a _minimal_ (least lines of code necessary) _reproducible_ (running this will give us the same result as you get) code snippet. Make sure to include the relevant imports. - -Remember to wrap the code and outputs in ```` ```triple-quotes blocks``` ````. + ### Minimal reproducible example + ```python # sample code to repro the bug ``` ### Output + ``` From dccf644213a2771a81fc4a754eed9623ea7f8444 Mon Sep 17 00:00:00 2001 From: Suraj Subramanian <5676233+subramen@users.noreply.github.com> Date: Wed, 8 Nov 2023 08:47:11 -0500 Subject: [PATCH 59/84] fix faq link --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index e9775165e..428351ed0 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,7 +7,7 @@ assignees: '' --- -**Before submitting a bug, please make sure the issue hasn't been already addressed by searching through the [FAQs](https://github.com/facebookresearch/llama/blob/main/FAQ.md) and [existing/past issues](https://github.com/facebookresearch/llama/issues)** +**Before submitting a bug, please make sure the issue hasn't been already addressed by searching through the [FAQs](https://ai.meta.com/llama/faq/) and [existing/past issues](https://github.com/facebookresearch/llama/issues)** ## Describe the bug From 94b055f4aed453f06103bc9b0b679a3df7f2fd44 Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Fri, 10 Nov 2023 07:38:39 -0800 Subject: [PATCH 60/84] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4d9b06bb9..96ae58f01 100755 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ See the [LICENSE](LICENSE) file, as well as our accompanying [Acceptable Use Pol 2. [Llama 2 technical overview](https://ai.meta.com/resources/models-and-libraries/llama) 3. [Open Innovation AI Research Community](https://ai.meta.com/llama/open-innovation-ai-research-community/) -For common questions, the FAQ can be found [here](https://github.com/facebookresearch/llama/blob/main/FAQ.md) which will be kept up to date over time as new questions arise. +For common questions, the FAQ can be found [here](https://ai.meta.com/llama/faq/) which will be kept up to date over time as new questions arise. ## Original LLaMA The repo for the original llama release is in the [`llama_v1`](https://github.com/facebookresearch/llama/tree/llama_v1) branch. From 6b3154bfbbb56c4665ca083be1d46c1e4f1bcc33 Mon Sep 17 00:00:00 2001 From: Alex <76689481+flu0r1ne@users.noreply.github.com> Date: Mon, 13 Nov 2023 13:41:06 -0600 Subject: [PATCH 61/84] Update transformer mask comment Update names for consistency with code Co-authored-by: ruanslv --- llama/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama/model.py b/llama/model.py index 63f3cf1ef..f7bf64c4d 100755 --- a/llama/model.py +++ b/llama/model.py @@ -481,8 +481,8 @@ def forward(self, tokens: torch.Tensor, start_pos: int): # When performing key-value caching, we compute the attention scores # only for the new sequence. Thus, the matrix of scores is of size - # (seq_len, total_len), and the only masked entries are (i, j) for - # j > cached_len + i, since row i corresponds to token cached_len + i. + # (seqlen, cache_len + seqlen), and the only masked entries are (i, j) for + # j > cache_len + i, since row i corresponds to token cache_len + i. mask = torch.hstack([ torch.zeros((seqlen, start_pos), device=tokens.device), mask From cd0719ddb42541fa4433e12d9922528832dd6eeb Mon Sep 17 00:00:00 2001 From: flu0r1ne Date: Mon, 13 Nov 2023 14:05:24 -0600 Subject: [PATCH 62/84] Correct KV comment seqlen -> seqlen + cache_len Update and add comments about the shape of the key and value matrices in the attention component. E.g., the second dimension is of length seqlen + cache_len not seqlen as previously stated. --- llama/model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama/model.py b/llama/model.py index f7bf64c4d..c78570f68 100755 --- a/llama/model.py +++ b/llama/model.py @@ -289,12 +289,12 @@ def forward( values = self.cache_v[:bsz, : start_pos + seqlen] # repeat k/v heads if n_kv_heads < n_heads - keys = repeat_kv(keys, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) - values = repeat_kv(values, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + keys = repeat_kv(keys, self.n_rep) # (bs, cache_len + seqlen, n_local_heads, head_dim) + values = repeat_kv(values, self.n_rep) # (bs, cache_len + seqlen, n_local_heads, head_dim) xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) - keys = keys.transpose(1, 2) - values = values.transpose(1, 2) + keys = keys.transpose(1, 2) # (bs, n_local_heads, cache_len + seqlen, head_dim) + values = values.transpose(1, 2) # (bs, n_local_heads, cache_len + seqlen, head_dim) scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) if mask is not None: scores = scores + mask # (bs, n_local_heads, seqlen, cache_len + seqlen) From 53b227b5322164942f46bbd22d76c2008ae349d0 Mon Sep 17 00:00:00 2001 From: ryanhankins Date: Wed, 21 Feb 2024 16:39:41 -0600 Subject: [PATCH 63/84] Update README.md Repair URL to link to Llama examples safety checker. The existing URL was out of date. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 96ae58f01..03e0e0de5 100755 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ torchrun --nproc_per_node 1 example_text_completion.py \ The fine-tuned models were trained for dialogue applications. To get the expected features and performance for them, a specific formatting defined in [`chat_completion`](https://github.com/facebookresearch/llama/blob/main/llama/generation.py#L212) needs to be followed, including the `INST` and `<>` tags, `BOS` and `EOS` tokens, and the whitespaces and breaklines in between (we recommend calling `strip()` on inputs to avoid double-spaces). -You can also deploy additional classifiers for filtering out inputs and outputs that are deemed unsafe. See the llama-recipes repo for [an example](https://github.com/facebookresearch/llama-recipes/blob/main/inference/inference.py) of how to add a safety checker to the inputs and outputs of your inference code. +You can also deploy additional classifiers for filtering out inputs and outputs that are deemed unsafe. See the llama-recipes repo for [an example](https://github.com/facebookresearch/llama-recipes/blob/main/examples/inference.py) of how to add a safety checker to the inputs and outputs of your inference code. Examples using llama-2-7b-chat: From c28bdb58c48060f20f05d923b22c35bbd754863b Mon Sep 17 00:00:00 2001 From: Navyata Bawa Date: Wed, 28 Feb 2024 10:55:21 -0800 Subject: [PATCH 64/84] Updating contributor guide --- CONTRIBUTING.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5eb507d67..536346a9e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,7 +3,9 @@ We want to make contributing to this project as easy and transparent as possible. ## Pull Requests -We actively welcome your pull requests. +We welcome your pull requests. + +### For requests regarding bug-fixes or improvements to the core model: 1. Fork the repo and create your branch from `main`. 2. If you've added code that should be tested, add tests. @@ -12,6 +14,10 @@ We actively welcome your pull requests. 5. Make sure your code lints. 6. If you haven't already, complete the Contributor License Agreement ("CLA"). +### For requests regarding new feature support, adding additional platform support and model use cases, please contribute to the [llama-recipes repo](https://github.com/facebookresearch/llama-recipes). +

+ + ## Contributor License Agreement ("CLA") In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Meta's open source projects. From acdb925413d5f1d53b8fdeb697fe676eca757ff4 Mon Sep 17 00:00:00 2001 From: Shorthills AI <141953346+ShorthillsAI@users.noreply.github.com> Date: Fri, 1 Mar 2024 12:53:48 +0530 Subject: [PATCH 65/84] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 03e0e0de5..4e1b8d9ba 100755 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Llama 2 -We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly. +We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers, and businesses of all sizes so that they can experiment, innovate, and scale their ideas responsibly. -This release includes model weights and starting code for pretrained and fine-tuned Llama language models — ranging from 7B to 70B parameters. +This release includes model weights and starting code for pre-trained and fine-tuned Llama language models — ranging from 7B to 70B parameters. This repository is intended as a minimal example to load [Llama 2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) models and run inference. For more detailed examples leveraging Hugging Face, see [llama-recipes](https://github.com/facebookresearch/llama-recipes/). @@ -18,11 +18,11 @@ In order to download the model weights and tokenizer, please visit the [Meta web Once your request is approved, you will receive a signed URL over email. Then run the download.sh script, passing the URL provided when prompted to start the download. -Pre-requisites: Make sure you have `wget` and `md5sum` installed. Then to run the script: `./download.sh`. +Pre-requisites: Make sure you have `wget` and `md5sum` installed. Then run the script: `./download.sh`. Keep in mind that the links expire after 24 hours and a certain amount of downloads. If you start seeing errors such as `403: Forbidden`, you can always re-request a link. -### Access on Hugging Face +### Access to Hugging Face We are also providing downloads on [Hugging Face](https://huggingface.co/meta-llama). You must first request a download from the Meta website using the same email address as your Hugging Face account. After doing so, you can request access to any of the models on Hugging Face and within 1-2 days your account will be granted access to all versions. @@ -32,9 +32,9 @@ You can follow the steps below to quickly get up and running with Llama 2 models 1. In a conda env with PyTorch / CUDA available clone and download this repository. -2. In the top level directory run: +2. In the top-level directory run: ```bash - pip install -e . + pip install -e. ``` 3. Visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and register to download the model/s. From 11ebe80305e3bcc8a8a8d75ec0e4b39955b916eb Mon Sep 17 00:00:00 2001 From: Shorthills AI <141953346+ShorthillsAI@users.noreply.github.com> Date: Wed, 6 Mar 2024 08:06:47 +0530 Subject: [PATCH 66/84] Update README.md Undo the pip install e. changes --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e1b8d9ba..a15d26f5b 100755 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ You can follow the steps below to quickly get up and running with Llama 2 models 2. In the top-level directory run: ```bash - pip install -e. + pip install -e . ``` 3. Visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and register to download the model/s. From 0b466166eee0b57ec876f83ded533c78ff9ff7d2 Mon Sep 17 00:00:00 2001 From: Jeff Tang Date: Wed, 13 Mar 2024 10:18:18 -0700 Subject: [PATCH 67/84] change LLaMA to Llama in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a15d26f5b..79fb116a2 100755 --- a/README.md +++ b/README.md @@ -126,5 +126,5 @@ See the [LICENSE](LICENSE) file, as well as our accompanying [Acceptable Use Pol For common questions, the FAQ can be found [here](https://ai.meta.com/llama/faq/) which will be kept up to date over time as new questions arise. -## Original LLaMA +## Original Llama The repo for the original llama release is in the [`llama_v1`](https://github.com/facebookresearch/llama/tree/llama_v1) branch. From 826ad1198c02f93fe14fef5dbf4a499fcb02e3b1 Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Wed, 20 Mar 2024 10:50:59 -0700 Subject: [PATCH 68/84] Update README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 79fb116a2..960d5a2ec 100755 --- a/README.md +++ b/README.md @@ -12,8 +12,6 @@ See [UPDATES.md](UPDATES.md). Also for a running list of frequently asked questi ## Download -⚠️ **7/18: We're aware of people encountering a number of download issues today. Anyone still encountering issues should remove all local files, re-clone the repository, and [request a new download link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). It's critical to do all of these in case you have local corrupt files.** - In order to download the model weights and tokenizer, please visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and accept our License. Once your request is approved, you will receive a signed URL over email. Then run the download.sh script, passing the URL provided when prompted to start the download. From 1e8375848d3a3ebaccab83fd670b880864cf9409 Mon Sep 17 00:00:00 2001 From: wangzhihong Date: Thu, 21 Mar 2024 10:09:34 +0800 Subject: [PATCH 69/84] update the code to use the module's __call__ --- llama/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama/model.py b/llama/model.py index c78570f68..562fcad1b 100755 --- a/llama/model.py +++ b/llama/model.py @@ -403,10 +403,10 @@ def forward( torch.Tensor: Output tensor after applying attention and feedforward layers. """ - h = x + self.attention.forward( + h = x + self.attention( self.attention_norm(x), start_pos, freqs_cis, mask ) - out = h + self.feed_forward.forward(self.ffn_norm(h)) + out = h + self.feed_forward(self.ffn_norm(h)) return out From 1f9a8d774a10fbe41321e530428e91f7eb7eb822 Mon Sep 17 00:00:00 2001 From: MattGurney Date: Sat, 23 Mar 2024 19:03:21 +1100 Subject: [PATCH 70/84] Update MODEL_CARD.md Move word "Uses" into markdown header. --- MODEL_CARD.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_CARD.md b/MODEL_CARD.md index 370807880..8651be9d8 100644 --- a/MODEL_CARD.md +++ b/MODEL_CARD.md @@ -33,7 +33,7 @@ Llama 2|*A new mix of publicly available online data*|70B|4k|✔|2.0T|1.5 x # **Intended Use** **Intended Use Cases** Llama 2 is intended for commercial and research use in English. Tuned models are intended for assistant-like chat, whereas pretrained models can be adapted for a variety of natural language generation tasks. -**Out-of-scope** Uses Use in any manner that violates applicable laws or regulations (including trade compliance laws). Use in any other way that is prohibited by the Acceptable Use Policy and Llama 2 Community License. Use in languages other than English**. +**Out-of-scope Uses** Use in any manner that violates applicable laws or regulations (including trade compliance laws). Use in any other way that is prohibited by the Acceptable Use Policy and Llama 2 Community License. Use in languages other than English**. **Note: Developers may fine-tune Llama 2 models for languages beyond English provided they comply with the Llama 2 Community License and the Acceptable Use Policy. From fd7308965bf7e92c7a860584198705112cd87b9f Mon Sep 17 00:00:00 2001 From: Omar Sanseviero Date: Mon, 8 Apr 2024 16:12:21 +0200 Subject: [PATCH 71/84] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 960d5a2ec..d962390a1 100755 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Keep in mind that the links expire after 24 hours and a certain amount of downlo ### Access to Hugging Face -We are also providing downloads on [Hugging Face](https://huggingface.co/meta-llama). You must first request a download from the Meta website using the same email address as your Hugging Face account. After doing so, you can request access to any of the models on Hugging Face and within 1-2 days your account will be granted access to all versions. +We are also providing downloads on [Hugging Face](https://huggingface.co/meta-llama). You can request access to the models by acknowledging the license and filling the form in the model card of a repo. After doing so, you should get access to all the Llama models of a version (Code Llama, Llama 2, or Llama Guard) within 1 hour. ## Quick Start From 893ff972e1355f43d4a22c6aeeaaa015b73f25d3 Mon Sep 17 00:00:00 2001 From: Dan Dascalescu Date: Wed, 15 May 2024 00:53:25 +0300 Subject: [PATCH 72/84] README: LLama 2 is no longer the latest version --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d962390a1..87157e698 100755 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Llama 2 -We are unlocking the power of large language models. Our latest version of Llama is now accessible to individuals, creators, researchers, and businesses of all sizes so that they can experiment, innovate, and scale their ideas responsibly. +We are unlocking the power of large language models. Llama 2 is now accessible to individuals, creators, researchers, and businesses of all sizes so that they can experiment, innovate, and scale their ideas responsibly. This release includes model weights and starting code for pre-trained and fine-tuned Llama language models — ranging from 7B to 70B parameters. From c0098be87adea7fedeb1b149b4c272a8384395b1 Mon Sep 17 00:00:00 2001 From: hyungupark <165874125+hyungupark@users.noreply.github.com> Date: Wed, 15 May 2024 12:49:24 +0900 Subject: [PATCH 73/84] Update download.sh modify for CPU_ARCH not found --- download.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/download.sh b/download.sh index 9ba1b533c..b16a37f3a 100755 --- a/download.sh +++ b/download.sh @@ -62,9 +62,10 @@ do wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/params.json"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/params.json" wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/checklist.chk"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/checklist.chk" echo "Checking checksums" - if [ "$CPU_ARCH" = "arm64" ]; then + CPU_ARCH=$(uname -m) + if [[ "$CPU_ARCH" == "arm64" ]]; then (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5 checklist.chk) else (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5sum -c checklist.chk) fi -done \ No newline at end of file +done From 12b676b909368581d39cebafae57226688d5676a Mon Sep 17 00:00:00 2001 From: Samuel Selvan Date: Mon, 22 Jul 2024 18:15:37 -0700 Subject: [PATCH 74/84] Update download.sh --- download.sh | 161 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 115 insertions(+), 46 deletions(-) diff --git a/download.sh b/download.sh index 9ba1b533c..cd191eaf1 100755 --- a/download.sh +++ b/download.sh @@ -1,70 +1,139 @@ #!/usr/bin/env bash # Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. +# This software may be used and distributed according to the terms of the Llama 3.1 Community License Agreement. set -e read -p "Enter the URL from email: " PRESIGNED_URL -echo "" -read -p "Enter the list of models to download without spaces (7B,13B,70B,7B-chat,13B-chat,70B-chat), or press Enter for all: " MODEL_SIZE +ALL_MODELS_LIST="meta-llama-3.1-405b,meta-llama-3.1-70b,meta-llama-3.1-8b,meta-llama-guard-3-8b,prompt-guard" +printf "\n **** Model list ***\n" +for MODEL in ${ALL_MODELS_LIST//,/ } +do + printf " - ${MODEL}\n" +done +read -p "Choose the model to download: " SELECTED_MODEL +printf "\n Selected model: ${SELECTED_MODEL} \n" + +SELECTED_MODELS="" +if [[ $SELECTED_MODEL == "meta-llama-3.1-405b" ]]; then + MODEL_LIST="meta-llama-3.1-405b-instruct-mp16,meta-llama-3.1-405b-instruct-mp8,meta-llama-3.1-405b-instruct-fb8,meta-llama-3.1-405b-mp16,meta-llama-3.1-405b-mp8,meta-llama-3.1-405b-fp8" +elif [[ $SELECTED_MODEL == "meta-llama-3.1-70b" ]]; then + MODEL_LIST="meta-llama-3.1-70b-instruct,meta-llama-3.1-70b" +elif [[ $SELECTED_MODEL == "meta-llama-3.1-8b" ]]; then + MODEL_LIST="meta-llama-3.1-8b-instruct,meta-llama-3.1-8b" +elif [[ $SELECTED_MODEL == "meta-llama-guard-3-8b" ]]; then + MODEL_LIST="meta-llama-guard-3-8b-int8-hf,meta-llama-guard-3-8b" +elif [[ $SELECTED_MODEL == "prompt-guard" ]]; then + SELECTED_MODELS="prompt-guard" + MODEL_LIST="" +fi + +if [[ -z "$SELECTED_MODELS" ]]; then + printf "\n **** Available models to download: ***\n" + for MODEL in ${MODEL_LIST//,/ } + do + printf " - ${MODEL}\n" + done + read -p "Enter the list of models to download without spaces or press Enter for all: " SELECTED_MODELS +fi + TARGET_FOLDER="." # where all files should end up mkdir -p ${TARGET_FOLDER} -if [[ $MODEL_SIZE == "" ]]; then - MODEL_SIZE="7B,13B,70B,7B-chat,13B-chat,70B-chat" +if [[ $SELECTED_MODELS == "" ]]; then + SELECTED_MODELS=${MODEL_LIST} fi -echo "Downloading LICENSE and Acceptable Usage Policy" +if [[ $SELECTED_MODEL == "meta-llama-3.1-405b" ]]; then + printf "\nModel requires significant storage and computational resources, occupying approximately 750GB of disk storage space and necessitating two nodes on MP16 for inferencing.\n" + read -p "Enter Y to continue: " ACK + if [[ $ACK != 'Y' ]]; then + printf "Exiting..." + exit 1 + fi +fi + +printf "Downloading LICENSE and Acceptable Usage Policy\n" wget --continue ${PRESIGNED_URL/'*'/"LICENSE"} -O ${TARGET_FOLDER}"/LICENSE" wget --continue ${PRESIGNED_URL/'*'/"USE_POLICY.md"} -O ${TARGET_FOLDER}"/USE_POLICY.md" -echo "Downloading tokenizer" -wget --continue ${PRESIGNED_URL/'*'/"tokenizer.model"} -O ${TARGET_FOLDER}"/tokenizer.model" -wget --continue ${PRESIGNED_URL/'*'/"tokenizer_checklist.chk"} -O ${TARGET_FOLDER}"/tokenizer_checklist.chk" -CPU_ARCH=$(uname -m) - if [ "$CPU_ARCH" = "arm64" ]; then - (cd ${TARGET_FOLDER} && md5 tokenizer_checklist.chk) - else - (cd ${TARGET_FOLDER} && md5sum -c tokenizer_checklist.chk) - fi - -for m in ${MODEL_SIZE//,/ } +for m in ${SELECTED_MODELS//,/ } do - if [[ $m == "7B" ]]; then - SHARD=0 - MODEL_PATH="llama-2-7b" - elif [[ $m == "7B-chat" ]]; then - SHARD=0 - MODEL_PATH="llama-2-7b-chat" - elif [[ $m == "13B" ]]; then - SHARD=1 - MODEL_PATH="llama-2-13b" - elif [[ $m == "13B-chat" ]]; then - SHARD=1 - MODEL_PATH="llama-2-13b-chat" - elif [[ $m == "70B" ]]; then - SHARD=7 - MODEL_PATH="llama-2-70b" - elif [[ $m == "70B-chat" ]]; then - SHARD=7 - MODEL_PATH="llama-2-70b-chat" + + ADDITIONAL_FILES="" + TOKENIZER_MODEL=1 + if [[ $m == "meta-llama-3.1-405b-instruct-mp16" ]]; then + PTH_FILE_COUNT=15 + MODEL_PATH="Meta-Llama-3.1-405B-Instruct-MP16" + elif [[ $m == "meta-llama-3.1-405b-instruct-mp8" ]]; then + PTH_FILE_COUNT=7 + MODEL_PATH="Meta-Llama-3.1-405B-Instruct-MP8" + elif [[ $m == "meta-llama-3.1-405b-instruct-fp8" ]]; then + PTH_FILE_COUNT=7 + MODEL_PATH="Meta-Llama-3.1-405B-Instruct" + ADDITIONAL_FILES="fp8_scales_0.pt,fp8_scales_1.pt,fp8_scales_2.pt,fp8_scales_3.pt,fp8_scales_4.pt,fp8_scales_5.pt,fp8_scales_6.pt,fp8_scales_7.pt" + elif [[ $m == "meta-llama-3.1-405b-mp16" ]]; then + PTH_FILE_COUNT=15 + MODEL_PATH="Meta-Llama-3.1-405B-MP16" + elif [[ $m == "meta-llama-3.1-405b-mp8" ]]; then + PTH_FILE_COUNT=7 + MODEL_PATH="Meta-Llama-3.1-405B-MP8" + elif [[ $m == "meta-llama-3.1-405b-fp8" ]]; then + PTH_FILE_COUNT=7 + MODEL_PATH="Meta-Llama-3.1-405B" + elif [[ $m == "meta-llama-3.1-70b-instruct" ]]; then + PTH_FILE_COUNT=7 + MODEL_PATH="Meta-Llama-3.1-70B-Instruct" + elif [[ $m == "meta-llama-3.1-70b" ]]; then + PTH_FILE_COUNT=7 + MODEL_PATH="Meta-Llama-3.1-70B" + elif [[ $m == "meta-llama-3.1-8b-instruct" ]]; then + PTH_FILE_COUNT=0 + MODEL_PATH="Meta-Llama-3.1-8B-Instruct" + elif [[ $m == "meta-llama-3.1-8b" ]]; then + PTH_FILE_COUNT=0 + MODEL_PATH="Meta-Llama-3.1-8B" + elif [[ $m == "meta-llama-guard-3-8b-int8-hf" ]]; then + PTH_FILE_COUNT=-1 + MODEL_PATH="Meta-Llama-Guard-3-8B-INT8-HF" + ADDITIONAL_FILES="generation_config.json,model-00001-of-00002.safetensors,model-00002-of-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer_config.json,tokenizer.json" + TOKENIZER_MODEL=0 + elif [[ $m == "meta-llama-guard-3-8b" ]]; then + PTH_FILE_COUNT=0 + MODEL_PATH="Meta-Llama-Guard-3-8B" + elif [[ $m == "prompt-guard" ]]; then + PTH_FILE_COUNT=-1 + MODEL_PATH="Prompt-Guard" + ADDITIONAL_FILES="model.safetensors,special_tokens_map.json,tokenizer_config.json,tokenizer.json" + TOKENIZER_MODEL=0 fi - echo "Downloading ${MODEL_PATH}" + printf "\n***Downloading ${MODEL_PATH}***\n" mkdir -p ${TARGET_FOLDER}"/${MODEL_PATH}" - for s in $(seq -f "0%g" 0 ${SHARD}) + if [[ $TOKENIZER_MODEL == 1 ]]; then + printf "Downloading tokenizer\n" + wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/tokenizer.model"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/tokenizer.model" + fi + + + if [[ $PTH_FILE_COUNT -ge 0 ]]; then + for s in $(seq -f "0%g" 0 ${PTH_FILE_COUNT}) + do + printf "Downloading consolidated.${s}.pth\n" + wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/consolidated.${s}.pth"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/consolidated.${s}.pth" + done + fi + + for ADDITIONAL_FILE in ${ADDITIONAL_FILES//,/ } do - wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/consolidated.${s}.pth"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/consolidated.${s}.pth" + printf "Downloading $ADDITIONAL_FILE...\n" + wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/${ADDITIONAL_FILE}"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/${ADDITIONAL_FILE}" done - wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/params.json"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/params.json" - wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/checklist.chk"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/checklist.chk" - echo "Checking checksums" - if [ "$CPU_ARCH" = "arm64" ]; then - (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5 checklist.chk) - else - (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5sum -c checklist.chk) + if [[ $m != "prompt-guard" && $m != "meta-llama-guard-3-8b-int8-hf" ]]; then + printf "Downloading params.json...\n" + wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/params.json"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/params.json" fi -done \ No newline at end of file +done From 66bc7307da52efc6327307407386e26b17afa09c Mon Sep 17 00:00:00 2001 From: Samuel Selvan Date: Mon, 22 Jul 2024 18:20:54 -0700 Subject: [PATCH 75/84] Update download.sh --- download.sh | 159 +++++++++++++++------------------------------------- 1 file changed, 45 insertions(+), 114 deletions(-) diff --git a/download.sh b/download.sh index cd191eaf1..044624fd1 100755 --- a/download.sh +++ b/download.sh @@ -1,139 +1,70 @@ #!/usr/bin/env bash # Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the Llama 3.1 Community License Agreement. +# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. set -e read -p "Enter the URL from email: " PRESIGNED_URL -ALL_MODELS_LIST="meta-llama-3.1-405b,meta-llama-3.1-70b,meta-llama-3.1-8b,meta-llama-guard-3-8b,prompt-guard" -printf "\n **** Model list ***\n" -for MODEL in ${ALL_MODELS_LIST//,/ } -do - printf " - ${MODEL}\n" -done -read -p "Choose the model to download: " SELECTED_MODEL -printf "\n Selected model: ${SELECTED_MODEL} \n" - -SELECTED_MODELS="" -if [[ $SELECTED_MODEL == "meta-llama-3.1-405b" ]]; then - MODEL_LIST="meta-llama-3.1-405b-instruct-mp16,meta-llama-3.1-405b-instruct-mp8,meta-llama-3.1-405b-instruct-fb8,meta-llama-3.1-405b-mp16,meta-llama-3.1-405b-mp8,meta-llama-3.1-405b-fp8" -elif [[ $SELECTED_MODEL == "meta-llama-3.1-70b" ]]; then - MODEL_LIST="meta-llama-3.1-70b-instruct,meta-llama-3.1-70b" -elif [[ $SELECTED_MODEL == "meta-llama-3.1-8b" ]]; then - MODEL_LIST="meta-llama-3.1-8b-instruct,meta-llama-3.1-8b" -elif [[ $SELECTED_MODEL == "meta-llama-guard-3-8b" ]]; then - MODEL_LIST="meta-llama-guard-3-8b-int8-hf,meta-llama-guard-3-8b" -elif [[ $SELECTED_MODEL == "prompt-guard" ]]; then - SELECTED_MODELS="prompt-guard" - MODEL_LIST="" -fi - -if [[ -z "$SELECTED_MODELS" ]]; then - printf "\n **** Available models to download: ***\n" - for MODEL in ${MODEL_LIST//,/ } - do - printf " - ${MODEL}\n" - done - read -p "Enter the list of models to download without spaces or press Enter for all: " SELECTED_MODELS -fi - +echo "" +read -p "Enter the list of models to download without spaces (7B,13B,70B,7B-chat,13B-chat,70B-chat), or press Enter for all: " MODEL_SIZE TARGET_FOLDER="." # where all files should end up mkdir -p ${TARGET_FOLDER} -if [[ $SELECTED_MODELS == "" ]]; then - SELECTED_MODELS=${MODEL_LIST} +if [[ $MODEL_SIZE == "" ]]; then + MODEL_SIZE="7B,13B,70B,7B-chat,13B-chat,70B-chat" fi -if [[ $SELECTED_MODEL == "meta-llama-3.1-405b" ]]; then - printf "\nModel requires significant storage and computational resources, occupying approximately 750GB of disk storage space and necessitating two nodes on MP16 for inferencing.\n" - read -p "Enter Y to continue: " ACK - if [[ $ACK != 'Y' ]]; then - printf "Exiting..." - exit 1 - fi -fi - -printf "Downloading LICENSE and Acceptable Usage Policy\n" +echo "Downloading LICENSE and Acceptable Usage Policy" wget --continue ${PRESIGNED_URL/'*'/"LICENSE"} -O ${TARGET_FOLDER}"/LICENSE" wget --continue ${PRESIGNED_URL/'*'/"USE_POLICY.md"} -O ${TARGET_FOLDER}"/USE_POLICY.md" -for m in ${SELECTED_MODELS//,/ } +echo "Downloading tokenizer" +wget --continue ${PRESIGNED_URL/'*'/"tokenizer.model"} -O ${TARGET_FOLDER}"/tokenizer.model" +wget --continue ${PRESIGNED_URL/'*'/"tokenizer_checklist.chk"} -O ${TARGET_FOLDER}"/tokenizer_checklist.chk" +CPU_ARCH=$(uname -m) + if [ "$CPU_ARCH" = "arm64" ]; then + (cd ${TARGET_FOLDER} && md5 tokenizer_checklist.chk) + else + (cd ${TARGET_FOLDER} && md5sum -c tokenizer_checklist.chk) + fi + +for m in ${MODEL_SIZE//,/ } do - - ADDITIONAL_FILES="" - TOKENIZER_MODEL=1 - if [[ $m == "meta-llama-3.1-405b-instruct-mp16" ]]; then - PTH_FILE_COUNT=15 - MODEL_PATH="Meta-Llama-3.1-405B-Instruct-MP16" - elif [[ $m == "meta-llama-3.1-405b-instruct-mp8" ]]; then - PTH_FILE_COUNT=7 - MODEL_PATH="Meta-Llama-3.1-405B-Instruct-MP8" - elif [[ $m == "meta-llama-3.1-405b-instruct-fp8" ]]; then - PTH_FILE_COUNT=7 - MODEL_PATH="Meta-Llama-3.1-405B-Instruct" - ADDITIONAL_FILES="fp8_scales_0.pt,fp8_scales_1.pt,fp8_scales_2.pt,fp8_scales_3.pt,fp8_scales_4.pt,fp8_scales_5.pt,fp8_scales_6.pt,fp8_scales_7.pt" - elif [[ $m == "meta-llama-3.1-405b-mp16" ]]; then - PTH_FILE_COUNT=15 - MODEL_PATH="Meta-Llama-3.1-405B-MP16" - elif [[ $m == "meta-llama-3.1-405b-mp8" ]]; then - PTH_FILE_COUNT=7 - MODEL_PATH="Meta-Llama-3.1-405B-MP8" - elif [[ $m == "meta-llama-3.1-405b-fp8" ]]; then - PTH_FILE_COUNT=7 - MODEL_PATH="Meta-Llama-3.1-405B" - elif [[ $m == "meta-llama-3.1-70b-instruct" ]]; then - PTH_FILE_COUNT=7 - MODEL_PATH="Meta-Llama-3.1-70B-Instruct" - elif [[ $m == "meta-llama-3.1-70b" ]]; then - PTH_FILE_COUNT=7 - MODEL_PATH="Meta-Llama-3.1-70B" - elif [[ $m == "meta-llama-3.1-8b-instruct" ]]; then - PTH_FILE_COUNT=0 - MODEL_PATH="Meta-Llama-3.1-8B-Instruct" - elif [[ $m == "meta-llama-3.1-8b" ]]; then - PTH_FILE_COUNT=0 - MODEL_PATH="Meta-Llama-3.1-8B" - elif [[ $m == "meta-llama-guard-3-8b-int8-hf" ]]; then - PTH_FILE_COUNT=-1 - MODEL_PATH="Meta-Llama-Guard-3-8B-INT8-HF" - ADDITIONAL_FILES="generation_config.json,model-00001-of-00002.safetensors,model-00002-of-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer_config.json,tokenizer.json" - TOKENIZER_MODEL=0 - elif [[ $m == "meta-llama-guard-3-8b" ]]; then - PTH_FILE_COUNT=0 - MODEL_PATH="Meta-Llama-Guard-3-8B" - elif [[ $m == "prompt-guard" ]]; then - PTH_FILE_COUNT=-1 - MODEL_PATH="Prompt-Guard" - ADDITIONAL_FILES="model.safetensors,special_tokens_map.json,tokenizer_config.json,tokenizer.json" - TOKENIZER_MODEL=0 + if [[ $m == "7B" ]]; then + SHARD=0 + MODEL_PATH="llama-2-7b" + elif [[ $m == "7B-chat" ]]; then + SHARD=0 + MODEL_PATH="llama-2-7b-chat" + elif [[ $m == "13B" ]]; then + SHARD=1 + MODEL_PATH="llama-2-13b" + elif [[ $m == "13B-chat" ]]; then + SHARD=1 + MODEL_PATH="llama-2-13b-chat" + elif [[ $m == "70B" ]]; then + SHARD=7 + MODEL_PATH="llama-2-70b" + elif [[ $m == "70B-chat" ]]; then + SHARD=7 + MODEL_PATH="llama-2-70b-chat" fi - printf "\n***Downloading ${MODEL_PATH}***\n" + echo "Downloading ${MODEL_PATH}" mkdir -p ${TARGET_FOLDER}"/${MODEL_PATH}" - if [[ $TOKENIZER_MODEL == 1 ]]; then - printf "Downloading tokenizer\n" - wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/tokenizer.model"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/tokenizer.model" - fi - - - if [[ $PTH_FILE_COUNT -ge 0 ]]; then - for s in $(seq -f "0%g" 0 ${PTH_FILE_COUNT}) - do - printf "Downloading consolidated.${s}.pth\n" - wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/consolidated.${s}.pth"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/consolidated.${s}.pth" - done - fi - - for ADDITIONAL_FILE in ${ADDITIONAL_FILES//,/ } + for s in $(seq -f "0%g" 0 ${SHARD}) do - printf "Downloading $ADDITIONAL_FILE...\n" - wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/${ADDITIONAL_FILE}"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/${ADDITIONAL_FILE}" + wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/consolidated.${s}.pth"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/consolidated.${s}.pth" done - if [[ $m != "prompt-guard" && $m != "meta-llama-guard-3-8b-int8-hf" ]]; then - printf "Downloading params.json...\n" - wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/params.json"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/params.json" + wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/params.json"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/params.json" + wget --continue ${PRESIGNED_URL/'*'/"${MODEL_PATH}/checklist.chk"} -O ${TARGET_FOLDER}"/${MODEL_PATH}/checklist.chk" + echo "Checking checksums" + if [ "$CPU_ARCH" = "arm64" ]; then + (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5 checklist.chk) + else + (cd ${TARGET_FOLDER}"/${MODEL_PATH}" && md5sum -c checklist.chk) fi done From 8fac8befd776bc03242fe7bc2236cdb41b6c609c Mon Sep 17 00:00:00 2001 From: Joseph Spisak Date: Tue, 23 Jul 2024 07:50:27 -0700 Subject: [PATCH 76/84] Update README.md --- README.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 87157e698..b103584f3 100755 --- a/README.md +++ b/README.md @@ -1,4 +1,18 @@ -# Llama 2 +## **Note of deprecation** + +Thank you for developing with Llama models. As part of the Llama 3.1 release, we’ve consolidated GitHub repos and added some additional repos as we’ve expanded Llama’s functionality into being an e2e Llama Stack. Please use the following repos going forward: +- [llama-models](https://github.com/meta-llama/llama-models) - Central repo for the foundation models including basic utilities, model cards, license and use policies +- [PurpleLlama](https://github.com/meta-llama/PurpleLlama) - Key component of Llama Stack focusing on safety risks and inference time mitigations +- [llama-toolchain](https://github.com/meta-llama/llama-toolchain) - Model development (inference/fine-tuning/safety shields/synthetic data generation) interfaces and canonical implementations +- [llama-agentic-system](https://github.com/meta-llama/llama-agentic-system) - E2E standalone Llama Stack system, along with opinionated underlying interface, that enables creation of agentic applications +- [llama-recipes](https://github.com/meta-llama/llama-recipes) - Community driven scripts and integrations + +If you have any questions, please feel free to file an issue on any of the above repos and we will do our best to respond in a timely manner. + +Thank you! + + +# (Deprecated) Llama 2 We are unlocking the power of large language models. Llama 2 is now accessible to individuals, creators, researchers, and businesses of all sizes so that they can experiment, innovate, and scale their ideas responsibly. From ff2e4fdfb3b32172fa09185ea1ad33f75a03a82a Mon Sep 17 00:00:00 2001 From: Rhonda Giandalia Date: Thu, 23 Jan 2025 23:34:23 -0800 Subject: [PATCH 77/84] Update bug_report.md --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 428351ed0..e5dbbfa6b 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -8,7 +8,7 @@ assignees: '' --- **Before submitting a bug, please make sure the issue hasn't been already addressed by searching through the [FAQs](https://ai.meta.com/llama/faq/) and [existing/past issues](https://github.com/facebookresearch/llama/issues)** - + ## Describe the bug From 689c7f261b9c5514636ecc3c5fefefcbb3e6eed7 Mon Sep 17 00:00:00 2001 From: amitsangani Date: Sun, 26 Jan 2025 13:42:26 -0800 Subject: [PATCH 78/84] Update README.md Modified from Llama Recipes to Llama Cookbook. --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b103584f3..5c46c9446 100755 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Thank you for developing with Llama models. As part of the Llama 3.1 release, we - [PurpleLlama](https://github.com/meta-llama/PurpleLlama) - Key component of Llama Stack focusing on safety risks and inference time mitigations - [llama-toolchain](https://github.com/meta-llama/llama-toolchain) - Model development (inference/fine-tuning/safety shields/synthetic data generation) interfaces and canonical implementations - [llama-agentic-system](https://github.com/meta-llama/llama-agentic-system) - E2E standalone Llama Stack system, along with opinionated underlying interface, that enables creation of agentic applications -- [llama-recipes](https://github.com/meta-llama/llama-recipes) - Community driven scripts and integrations +- [llama-cookbook](https://github.com/meta-llama/llama-recipes) - Community driven scripts and integrations If you have any questions, please feel free to file an issue on any of the above repos and we will do our best to respond in a timely manner. @@ -18,7 +18,7 @@ We are unlocking the power of large language models. Llama 2 is now accessible t This release includes model weights and starting code for pre-trained and fine-tuned Llama language models — ranging from 7B to 70B parameters. -This repository is intended as a minimal example to load [Llama 2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) models and run inference. For more detailed examples leveraging Hugging Face, see [llama-recipes](https://github.com/facebookresearch/llama-recipes/). +This repository is intended as a minimal example to load [Llama 2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) models and run inference. For more detailed examples leveraging Hugging Face, see [llama-cookbook](https://github.com/facebookresearch/llama-recipes/). ## Updates post-launch @@ -40,7 +40,7 @@ We are also providing downloads on [Hugging Face](https://huggingface.co/meta-ll ## Quick Start -You can follow the steps below to quickly get up and running with Llama 2 models. These steps will let you run quick inference locally. For more examples, see the [Llama 2 recipes repository](https://github.com/facebookresearch/llama-recipes). +You can follow the steps below to quickly get up and running with Llama 2 models. These steps will let you run quick inference locally. For more examples, see the [Llama 2 cookbook repository](https://github.com/facebookresearch/llama-recipes). 1. In a conda env with PyTorch / CUDA available clone and download this repository. @@ -100,7 +100,7 @@ torchrun --nproc_per_node 1 example_text_completion.py \ The fine-tuned models were trained for dialogue applications. To get the expected features and performance for them, a specific formatting defined in [`chat_completion`](https://github.com/facebookresearch/llama/blob/main/llama/generation.py#L212) needs to be followed, including the `INST` and `<>` tags, `BOS` and `EOS` tokens, and the whitespaces and breaklines in between (we recommend calling `strip()` on inputs to avoid double-spaces). -You can also deploy additional classifiers for filtering out inputs and outputs that are deemed unsafe. See the llama-recipes repo for [an example](https://github.com/facebookresearch/llama-recipes/blob/main/examples/inference.py) of how to add a safety checker to the inputs and outputs of your inference code. +You can also deploy additional classifiers for filtering out inputs and outputs that are deemed unsafe. See the llama-cookbook repo for [an example](https://github.com/facebookresearch/llama-recipes/blob/main/examples/inference.py) of how to add a safety checker to the inputs and outputs of your inference code. Examples using llama-2-7b-chat: From bf159f35881fbb18efe414df796e1fcb69384aca Mon Sep 17 00:00:00 2001 From: Rhonda Giandalia Date: Wed, 29 Jan 2025 02:46:18 -0800 Subject: [PATCH 79/84] Create django.yml --- .github/workflows/django.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/django.yml diff --git a/.github/workflows/django.yml b/.github/workflows/django.yml new file mode 100644 index 000000000..9766b45dc --- /dev/null +++ b/.github/workflows/django.yml @@ -0,0 +1,30 @@ +name: Django CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + max-parallel: 4 + matrix: + python-version: [3.7, 3.8, 3.9] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install Dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + - name: Run Tests + run: | + python manage.py test From da130a0538bd655767082dbd1d834613c93a0c56 Mon Sep 17 00:00:00 2001 From: Rhonda Giandalia Date: Fri, 28 Mar 2025 15:58:41 -0700 Subject: [PATCH 80/84] Update example_text_completion.py --- example_text_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example_text_completion.py b/example_text_completion.py index 0d60b9c98..8c27abd54 100755 --- a/example_text_completion.py +++ b/example_text_completion.py @@ -65,5 +65,5 @@ def main( print("\n==================================\n") -if __name__ == "__main__": +if __Rhonda Giandalia__ == "__main__": fire.Fire(main) From 77e6153754dca67e36cc92d9e4ac5346629b0f96 Mon Sep 17 00:00:00 2001 From: Rhonda Giandalia Date: Sat, 5 Apr 2025 21:24:45 -0700 Subject: [PATCH 81/84] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5c46c9446..0a51ae72e 100755 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Keep in mind that the links expire after 24 hours and a certain amount of downlo ### Access to Hugging Face -We are also providing downloads on [Hugging Face](https://huggingface.co/meta-llama). You can request access to the models by acknowledging the license and filling the form in the model card of a repo. After doing so, you should get access to all the Llama models of a version (Code Llama, Llama 2, or Llama Guard) within 1 hour. +We are also providing downloads on [Hugging Face](https://huggingface.co/meta-llama). You can request access to the models by acknowledging the license and filling in the form in the model card of a repo. After doing so, you should get access to all the Llama models of a version (Code Llama, Llama 2, or Llama Guard) within 1 hour. ## Quick Start @@ -138,5 +138,5 @@ See the [LICENSE](LICENSE) file, as well as our accompanying [Acceptable Use Pol For common questions, the FAQ can be found [here](https://ai.meta.com/llama/faq/) which will be kept up to date over time as new questions arise. -## Original Llama + ## Original Llama8=( The repo for the original llama release is in the [`llama_v1`](https://github.com/facebookresearch/llama/tree/llama_v1) branch. From f2306eb3aed912e8e337c7b1d149466060a1c759 Mon Sep 17 00:00:00 2001 From: Rhonda Giandalia Date: Mon, 28 Apr 2025 13:07:45 -0700 Subject: [PATCH 82/84] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0a51ae72e..138cafbf5 100755 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -## **Note of deprecation** + ## **Note of deprecation** Thank you for developing with Llama models. As part of the Llama 3.1 release, we’ve consolidated GitHub repos and added some additional repos as we’ve expanded Llama’s functionality into being an e2e Llama Stack. Please use the following repos going forward: - [llama-models](https://github.com/meta-llama/llama-models) - Central repo for the foundation models including basic utilities, model cards, license and use policies From a6893772cf9d45cf4020a4677f2ca7f6af6d2556 Mon Sep 17 00:00:00 2001 From: Rhonda Giandalia Date: Sat, 2 Aug 2025 08:30:47 -0700 Subject: [PATCH 83/84] Update tokenizer.py --- llama/tokenizer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llama/tokenizer.py b/llama/tokenizer.py index 3eda89a06..b4dd21d1b 100755 --- a/llama/tokenizer.py +++ b/llama/tokenizer.py @@ -13,16 +13,16 @@ class Tokenizer: """tokenizing and encoding/decoding text using SentencePiece.""" - def __init__(self, model_path: str): + Def __init__(self, model_path: str): """ Initializes the Tokenizer with a SentencePiece model. Args: - model_path (str): The path to the SentencePiece model file. + model_path_Monday (str): The path to the SentencePiece model Monday file. """ # reload tokenizer assert os.path.isfile(model_path), model_path - self.sp_model = SentencePieceProcessor(model_file=model_path) + self.sp_model.Monday = SentencePieceProcessor(model_file=model_path) logger.info(f"Reloaded SentencePiece model from {model_path}") # BOS / EOS token IDs @@ -33,9 +33,9 @@ def __init__(self, model_path: str): logger.info( f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" ) - assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + Assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() - def encode(self, s: str, bos: bool, eos: bool) -> List[int]: + Def encode(self, s: str, bos: bool, eos: bool) -> List[int]: """ Encodes a string into a list of token IDs. @@ -49,7 +49,7 @@ def encode(self, s: str, bos: bool, eos: bool) -> List[int]: """ assert type(s) is str t = self.sp_model.encode(s) - if bos: + if Monday: t = [self.bos_id] + t if eos: t = t + [self.eos_id] @@ -65,4 +65,4 @@ def decode(self, t: List[int]) -> str: Returns: str: The decoded string. """ - return self.sp_model.decode(t) + return self.sp_model.Monday.decode(t) From 6448dd531eaa501b4bc0806adefbcd811e3c8ecd Mon Sep 17 00:00:00 2001 From: Rhonda Giandalia Date: Sat, 2 Aug 2025 15:26:48 -0700 Subject: [PATCH 84/84] Update generation.py --- llama/generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama/generation.py b/llama/generation.py index 5f8faf9f3..f057c01e7 100755 --- a/llama/generation.py +++ b/llama/generation.py @@ -29,7 +29,7 @@ class Message(TypedDict): class CompletionPrediction(TypedDict, total=False): generation: str - tokens: List[str] # not required + tokens: List[str] # not required logprobs: List[float] # not required