microsoft · kunal-vaishnavi · Aug 23, 2023 · Aug 6, 2023 · Aug 6, 2023 · Aug 6, 2023
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
@@ -463,6 +463,9 @@ file(GLOB onnxruntime_python_transformers_models_bert_src CONFIGURE_DEPENDS
 file(GLOB onnxruntime_python_transformers_models_gpt2_src CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/gpt2/*.py"
 )
+file(GLOB onnxruntime_python_transformers_models_llama_src CONFIGURE_DEPENDS
+    "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/llama/*.py"
+)
 file(GLOB onnxruntime_python_transformers_models_longformer_src CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/longformer/*.py"
 )
@@ -534,6 +537,7 @@ add_custom_command(
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/bart
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/bert
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/gpt2
+  COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/llama
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/longformer
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/stable_diffusion
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/t5
@@ -625,6 +629,9 @@ add_custom_command(
   COMMAND ${CMAKE_COMMAND} -E copy
       ${onnxruntime_python_transformers_models_gpt2_src}
       $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/gpt2/
+  COMMAND ${CMAKE_COMMAND} -E copy
+      ${onnxruntime_python_transformers_models_llama_src}
+      $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/llama/
   COMMAND ${CMAKE_COMMAND} -E copy
       ${onnxruntime_python_transformers_models_longformer_src}
       $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/longformer/

diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py
@@ -170,7 +170,7 @@ def prepare_environment(cache_dir, output_dir, use_gpu, provider=None):
 
     logger.info(f"PyTorch Version:{torch.__version__}")
     logger.info(f"Transformers Version:{transformers.__version__}")
-    logger.info(f"Onnxruntime Version:{onnxruntime.__version__}")
+    logger.info(f"OnnxRuntime Version:{onnxruntime.__version__}")
 
     # Support three major versions of PyTorch and OnnxRuntime, and up to 9 months of transformers.
     assert version.parse(torch.__version__) >= version.parse("1.10.0")

diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md
@@ -0,0 +1,161 @@
+# LLaMA
+
+## Exporting LLaMA
+
+There are several ways to export LLaMA models, such as LLaMA and LLaMA-2. The examples below use LLaMA-2 7B.
+
+Option 1: from convert_to_onnx
+```
+# From source:
+$ git clone https://github.com/microsoft/onnxruntime
+$ cd onnxruntime/onnxruntime/python/tools/transformers/models/llama
+$ python3 convert_to_onnx.py -m meta-llama/Llama-2-7b-hf --output llama2-7b
+
+# From wheel:
+$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b
+```
+
+To make this option compatible with [Hugging Face's Optimum](https://github.com/huggingface/optimum), you will need to create `config.json` and `generation_config.json` for your model and store them in the same directory as your ONNX models. For example, you can find those JSON files for LLaMA-2 7B on Hugging Face [here](https://huggingface.co/meta-llama/Llama-2-7b-hf).
+
+Option 2: from [Microsoft's custom export](https://github.com/microsoft/Llama-2-Onnx)
+
+Option 3: from [Hugging Face's Optimum](https://github.com/huggingface/optimum)
+
+## Examples of Exporting LLaMA
+
+Here are some additional examples for exporting LLaMA.
+
+### Export Saved Model on Disk
+```
+# From source:
+$ python3 convert_to_onnx.py -m meta-llama/Llama-2-7b-hf --input ./Llama-2-7b-hf --output ./llama2-7b
+
+# From wheel:
+$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input ./Llama-2-7b-hf --output ./llama2-7b
+```
+
+### Export with Different Precision
+
+FP16:
+```
+# From source:
+$ python3 convert_to_onnx.py -m meta-llama/Llama-2-7b-hf --output llama2-7b --precision fp16
+
+# From wheel:
+$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b --precision fp16
+```
+
+INT8:
+```
+# From source:
+$ python3 convert_to_onnx.py -m meta-llama/Llama-2-7b-hf --output llama2-7b --precision int8 --smooth_quant
+
+# From wheel:
+$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b --precision int8 --smooth_quant
+```
+
+Note: [Intel's Neural Compressor](https://github.com/intel/neural-compressor) takes time to run the SmoothQuant quantization algorithm on LLMs. On an [Azure Standard_NC24s_v3 VM](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series), it takes about 30-45 minutes for each of the exported ONNX models.
+
+## Benchmark LLaMA
+
+Here are some examples of how you can benchmark LLaMA.
+
+Note: In the examples below, `PyTorch` refers to running in PyTorch without `torch.compile` and `PyTorch 2.0` refers to running in PyTorch with `torch.compile`.
+
+### Variants
+
+1. PyTorch (without `torch.compile`), FP32
+```
+python3 benchmark.py \
+    --benchmark-type hf-pt \
+    --model-name meta-llama/Llama-2-7b-hf \
+    --model-size 7b \
+    --precision fp32 \
+    --batch-sizes "1 2" \
+    --sequence-lengths "8 16" \
+    --device cpu \
+    --auth
+```
+
+2. PyTorch 2.0 (with `torch.compile`), FP16
+```
+python3 benchmark.py \
+    --benchmark-type hf-pt2 \
+    --model-name meta-llama/Llama-2-7b-hf \
+    --model-size 7b \
+    --precision fp16 \
+    --batch-sizes "1 2" \
+    --sequence-lengths "8 16" \
+    --device cuda \
+    --auth
+```
+
+3. Optimum + ONNX Runtime, FP32, export via Optimum or convert_to_onnx
+```
+python3 benchmark.py \
+    --benchmark-type hf-ort \
+    --hf-ort-model-path ./Llama-2-7b-hf-onnx/ \
+    --model-name meta-llama/Llama-2-7b-hf \
+    --model-size 7b \
+    --precision fp32 \
+    --batch-sizes "1 2" \
+    --sequence-lengths "8 16" \
+    --device cpu \
+    --auth
+```
+
+4. Optimum + ONNX Runtime, FP16, export via convert_to_onnx
+```
+python3 benchmark.py \
+    --benchmark-type hf-ort \
+    --hf-ort-model-path ./llama2-7b/ \
+    --model-name meta-llama/Llama-2-7b-hf \
+    --model-size 7b \
+    --precision fp16 \
+    --batch-sizes "1 2" \
+    --sequence-lengths "8 16" \
+    --device cuda \
+    --auth
+```
+
+5. Optimum + ONNX Runtime, INT8, export via convert_to_onnx
+```
+python3 benchmark.py \
+    --benchmark-type hf-ort \
+    --hf-ort-model-path ./llama2-7b/ \
+    --model-name meta-llama/Llama-2-7b-hf \
+    --model-size 7b \
+    --precision int8 \
+    --batch-sizes "1 2" \
+    --sequence-lengths "8 16" \
+    --device cpu \
+    --auth
+```
+
+6. ONNX Runtime, FP32, Microsoft custom export
+```
+python3 benchmark.py \
+    --benchmark-type ort \
+    --ort-model-path llama-2-onnx/7B_float32/ONNX/LlamaV2_7B_float32.onnx \
+    --model-name meta-llama/Llama-2-7b-hf \
+    --model-size 7b \
+    --precision fp32 \
+    --batch-sizes "1 2" \
+    --sequence-lengths "8 16" \
+    --device cpu
+```
+
+7. ONNX Runtime, FP16, Microsoft custom export
+```
+python3 benchmark.py \
+    --benchmark-type ort \
+    --ort-model-path ./llama-2-onnx/7B_float16/ONNX/LlamaV2_7B_float16.onnx \
+    --model-name meta-llama/Llama-2-7b-hf \
+    --model-size 7b \
+    --precision fp16 \
+    --batch-sizes "1 2" \
+    --sequence-lengths "8 16" \
+    --device cuda
+```
+
+You can also profile each of these variants by adding the `--profile` flag and providing one batch size and sequence length combination.
diff --git a/onnxruntime/python/tools/transformers/models/llama/__init__.py b/onnxruntime/python/tools/transformers/models/llama/__init__.py
@@ -0,0 +1,8 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import os
+import sys
+
+sys.path.append(os.path.dirname(__file__))