
Commit 736b813

Update on "Docs for lower smaller models to mps/coreml/qnn"

Differential Revision: [D56340028](https://our.internmc.facebook.com/intern/diff/D56340028/) [ghstack-poisoned]

2 parents 691bc59 + 4c3f49a

File tree: 10 files changed (+249, -24 lines)

backends/qualcomm/builders/node_visitor.py

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@
     QNN_uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_16,
 }
 QNN_TENSOR_TYPE_MAP = {
+    torch.bool: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8,
     torch.float32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
     torch.int8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_8,
     torch.int16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_16,

backends/qualcomm/partition/common_defs.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,8 @@
     exir_ops.edge.aten.clone.default,
     exir_ops.edge.aten.index.Tensor,
     exir_ops.edge.aten.full.default,
+    exir_ops.edge.aten.slice_scatter.default,
+    exir_ops.edge.aten.index_put.default,
 ]
 
 allow_list_operator = [
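
The two ops added to the deny list here typically appear when a KV cache is written in place during export, so listing them keeps the QNN partitioner from claiming nodes it cannot yet lower. A minimal sketch of where `aten.index_put` comes from (shapes and names are hypothetical, not the actual llama KV-cache code):

```python
# Hypothetical sketch: in-place KV-cache writes like this would typically lower
# to aten.index_put in the exported graph, which is why the op is kept on the
# not_supported_operator list for the QNN partition for now.
import torch

cache = torch.zeros(1, 4, 32, 16)      # (bsz, n_heads, max_seq_len, head_dim)
input_pos = torch.tensor([5])          # position being written this step
new_k = torch.randn(1, 4, 1, 16)       # freshly computed key for that position

cache[:, :, input_pos] = new_k         # advanced-index assignment -> aten.index_put
```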

examples/models/llama2/README.md

Lines changed: 22 additions & 5 deletions
@@ -24,11 +24,12 @@ For Llama3, we can use the same process. Note that it's only supported in the Ex
 ## Quantization:
 We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://github.com/pytorch-labs/ao/).
 
-We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness). Below are the results for two different groupsizes.
+We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness). Below are the results for two different groupsizes, with max_seq_len 2048, and 1000 samples.
 
-|Llama 2 | Baseline (FP32) | Groupwise 4-bit (128) | Groupwise 4-bit (256)
+|Model | Baseline (FP32) | Groupwise 4-bit (128) | Groupwise 4-bit (256)
 |--------|-----------------| ---------------------- | ---------------
-|Wikitext Perplexity | 9.16 | 10.2 | 10.7
+|Llama 2 7B | 9.2 | 10.2 | 10.7
+|Llama 3 8B | 7.9 | 9.4 | 9.7
 
 Note that groupsize less than 128 was not enabled, since such model were still too large. This is because our current efforts have focused on enabling FP32 and support for FP16 is under way. What this implies for model size is that 1) embedding table is in FP32 and 2) quantized weights scales are FP32.
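
For intuition, here is a minimal, self-contained sketch of the groupwise weight scheme described above: symmetric int4 weights with one scale per output-channel group. The group size, shapes, and helper names are illustrative only; the real flow uses the torchao/ExecuTorch quantizers referenced in the README.

```python
# Minimal sketch of groupwise symmetric int4 weight quantization (illustrative
# only; not the production quantization path).
import torch

def quantize_groupwise_int4(w: torch.Tensor, group_size: int):
    out_ch, in_ch = w.shape
    wg = w.reshape(out_ch, in_ch // group_size, group_size)
    # One scale per (output channel, group), derived from the max magnitude.
    scales = wg.abs().amax(dim=-1, keepdim=True) / 7.0  # int4 range is [-8, 7]
    q = torch.clamp(torch.round(wg / scales), -8, 7).to(torch.int8)
    return q, scales

def dequantize_groupwise_int4(q: torch.Tensor, scales: torch.Tensor):
    return (q.float() * scales).reshape(q.shape[0], -1)

w = torch.randn(4, 16)
q, scales = quantize_groupwise_int4(w, group_size=8)
w_hat = dequantize_groupwise_int4(q, scales)
print((w - w_hat).abs().max())  # small reconstruction error
```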

@@ -56,7 +57,7 @@ Performance was measured on Samsung Galaxy S22, S24, One Plus 12 and iPhone 15 m
 - For Llama7b, your device may require at least 32GB RAM. If this is a constraint for you, please try the smaller stories model.
 
 ## Step 1: Setup
-1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch
+1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_requirements.sh --pybind xnnpack`
 2. Run `examples/models/llama2/install_requirements.sh` to install a few dependencies.
 
 ## Step 2: Prepare model
@@ -102,6 +103,16 @@ If you want to deploy and run a smaller model for educational purposes. From `ex
 python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
 ```
 
+### Option C: Download and export Llama3 8B model
+
+You can export and run the original Llama3 8B model.
+
+1. Llama3 pretrained parameters can be downloaded from [Meta's official llama3 repository](https://github.com/meta-llama/llama3/).
+
+2. Export model and generate `.pte` file
+```
+python -m examples.models.llama2.export_llama --checkpoint <consolidated.00.pth> -p <params.json> -d=fp32 -X -qmode 8da4w -kv --use_sdpa_with_kv_cache --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" --group_size 128 --metadata '{"get_bos_id":128000, "get_eos_id":128001}' --embedding-quantize 4,32
+```
 
 ## (Optional) Finetuning
@@ -147,6 +158,7 @@ The Wikitext results generated above used: `{max_seq_len: 2048, limit: 1000}`
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
+    -DEXECUTORCH_BUILD_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_CUSTOM=ON \
     -Bcmake-out .
@@ -162,17 +174,22 @@ The Wikitext results generated above used: `{max_seq_len: 2048, limit: 1000}`
     -DEXECUTORCH_BUILD_CUSTOM=ON \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
+    -DEXECUTORCH_BUILD_QUANTIZED=ON \
     -Bcmake-out/examples/models/llama2 \
     examples/models/llama2
 
 cmake --build cmake-out/examples/models/llama2 -j16 --config Release
 ```
 
+For Llama3, add `-DEXECUTORCH_USE_TIKTOKEN=ON` option when building the llama runner.
+
 3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/main.cpp#L18-L40).
 ```
 cmake-out/examples/models/llama2/llama_main --model_path=<model pte file> --tokenizer_path=<tokenizer.bin> --prompt=<prompt>
 ```
 
+For Llama3, you can pass the original `tokenizer.model` (without converting to `.bin` file).
+
 ## Step 5: Run benchmark on Android phone
 
 **1. Build llama runner binary for Android**
@@ -280,7 +297,7 @@ This example tries to reuse the Python code, with minimal modifications to make
 ```
 git clean -xfd
 pip uninstall executorch
-./install_requirements.sh <options>
+./install_requirements.sh --pybind xnnpack
 
 rm -rf cmake-out
 ```

examples/models/llama2/eval_llama_lib.py

Lines changed: 3 additions & 4 deletions
@@ -42,12 +42,11 @@ def __init__(
         tokenizer: Union[SentencePieceTokenizer, Tiktoken],
         max_seq_length: Optional[int] = None,
     ):
-        super().__init__()
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        super().__init__(device=device)
         self._model = model
         self._tokenizer = tokenizer
-        self._device = (
-            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-        )
+        self._device = torch.device(device)
         self._max_seq_length = 2048 if max_seq_length is None else max_seq_length
 
     @property

examples/models/llama2/export_llama_lib.py

Lines changed: 146 additions & 11 deletions
@@ -9,6 +9,7 @@
 import argparse
 import copy
 import logging
+import math
 import os
 import shlex

@@ -143,6 +144,80 @@ def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module:
     return module
 
 
+class SDPASimple(torch.nn.Module):
+
+    def __init__(
+        self,
+        kv_cache: KVCache,
+        dim: int,
+        head_dim: int,
+        n_rep: int,
+    ):
+        super().__init__()
+        self.kv_cache = kv_cache
+        self.dim = dim
+        self.head_dim = head_dim
+        self.n_rep = n_rep
+
+    def forward(
+        self,
+        input_pos: torch.Tensor,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        bsz,
+        seqlen,
+        mask,
+    ):
+        q = q.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+
+        k, v = self.kv_cache.update(input_pos, k, v)
+        attn_mask = mask[None, None, input_pos]
+
+        k = k.repeat_interleave(self.n_rep, dim=1)
+        v = v.repeat_interleave(self.n_rep, dim=1)
+        scale_factor = 1 / math.sqrt(q.size(-1))
+        attn_weight = q @ k.transpose(-2, -1) * scale_factor
+        attn_weight += attn_mask
+        attn_weight = torch.softmax(attn_weight, dim=-1)
+        y = attn_weight @ v
+
+        return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
+
+
+def replace_sdpa_with_simple_sdpa(module: torch.nn.Module):
+    for name, child in module.named_children():
+        if isinstance(child, SDPA):
+            setattr(
+                module,
+                name,
+                SDPASimple(child.kv_cache, child.dim, child.head_dim, child.n_rep),
+            )
+        else:
+            replace_sdpa_with_simple_sdpa(child)
+    return module
+
+
+def replace_causal_mask(module: torch.nn.Module):
+    for buffer_fqn_name, buffer in module.named_buffers():
+        buffer_name = buffer_fqn_name.split(".")[-1]
+        if buffer_name == "mask":
+            max_seq_len = buffer.shape[-1]
+            mask = torch.full(
+                (max_seq_len, max_seq_len),
+                float("-inf"),
+                device="cpu",
+            )
+
+            mask = torch.triu(mask, diagonal=1)
+            module.register_buffer(buffer_name, mask)
+    for _, child in module.named_children():
+        replace_causal_mask(child)
+    return module
+
+
 def quantize(
     model: torch.nn.Module,
     qmode: str,
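
The decomposed attention in `SDPASimple` is the standard softmax(QK^T/sqrt(d) + mask)V computation, just written with ops that the QNN backend can lower. Below is a self-contained parity check against PyTorch's fused op, assuming plain tensors with no KV cache and `n_rep == 1` (no grouped-query repeat); it is a sketch, not the repository's test_simple_sdpa test.

```python
# Editorial sketch: verify that the decomposed attention math used by SDPASimple
# matches torch's fused scaled_dot_product_attention on plain tensors.
import math
import torch
import torch.nn.functional as F

bsz, n_heads, seqlen, head_dim = 1, 4, 8, 16
q = torch.randn(bsz, n_heads, seqlen, head_dim)
k = torch.randn(bsz, n_heads, seqlen, head_dim)
v = torch.randn(bsz, n_heads, seqlen, head_dim)

# Causal mask built the same way as replace_causal_mask: -inf above the diagonal.
mask = torch.triu(torch.full((seqlen, seqlen), float("-inf")), diagonal=1)

# Decomposed attention, as in SDPASimple.forward.
scale = 1 / math.sqrt(head_dim)
attn_weight = torch.softmax(q @ k.transpose(-2, -1) * scale + mask, dim=-1)
y_simple = attn_weight @ v

# Fused reference.
y_fused = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

assert torch.allclose(y_simple, y_fused, atol=1e-5)
```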
@@ -280,6 +355,13 @@ def build_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--pt2e_quantize",
         default=None,
+        choices=[
+            "xnnpack_dynamic",
+            "xnnpack_dynamic_qc4",
+            "qnn_8a8w",
+            "qnn_16a16w",
+            "qnn_16a4w",
+        ],
         help="Use PT2E quantization. Comma separated options. e.g. xnnpack_dynamic (for per channel 8 bit weight), xnnpack_dynamic_qc4 (for per channel 4 bit weight), embedding.",
     )
     parser.add_argument(
@@ -539,7 +621,10 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
         bitwidth = int(bitwidth)
         transforms.append(
             lambda model: EmbeddingQuantHandler(
-                model, bitwidth=bitwidth, group_size=group_size
+                model,
+                bitwidth=bitwidth,
+                group_size=group_size,
+                packed=(bitwidth == 4),
             ).quantized_model()
         )
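
The new `packed=(bitwidth == 4)` argument requests 4-bit packing of the quantized embedding table, so two int4 values share one byte instead of each occupying an int8. A rough sketch of what that packing means (the bit layout here is illustrative, not the actual kernel format):

```python
# Editorial sketch of packed int4 storage: two 4-bit values per byte, halving
# storage versus one int8 per value. Packing scheme is illustrative only.
import torch

vals = torch.tensor([-3, 7, 0, -8], dtype=torch.int8)  # int4 range [-8, 7]
u = (vals + 8).to(torch.uint8)                          # shift to [0, 15]
packed = (u[0::2] << 4) | u[1::2]                       # two nibbles per byte

hi = (packed >> 4).to(torch.int8) - 8                   # unpack and shift back
lo = (packed & 0xF).to(torch.int8) - 8
assert torch.equal(torch.stack([hi, lo], dim=1).flatten(), vals)
```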

@@ -549,6 +634,9 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
     if args.use_sdpa_with_kv_cache:
         transforms.append(replace_sdpa_with_custom_op)
 
+    if args.qnn and args.use_kv_cache:
+        transforms.append(replace_sdpa_with_simple_sdpa)
+        transforms.append(replace_causal_mask)
     return (
         load_llama_model(
             modelname=modelname,
@@ -572,13 +660,16 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
     # export_to_edge
     pt2e_quant_params = _get_pt2e_quantization_params(args)
     quantizers = get_pt2e_quantizers(pt2e_quant_params, args)
-    if args.qnn:
-        assert (
-            args.quantization_mode is None
-        ), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
+    quant_dtype = None
+    if args.qnn and args.pt2e_quantize:
         try:
             # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.quantizer.quantizer`
-            from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
+            from executorch.backends.qualcomm.quantizer.quantizer import (
+                get_16a4w_qnn_ptq_config,
+                get_default_16bit_qnn_ptq_config,
+                QnnQuantizer,
+                QuantDtype,
+            )
 
             # reset quantizers and pt2e_quant_params from xnnpack backend
             pt2e_quant_params = None
@@ -588,10 +679,41 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
                 "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html"
             )
 
+        backend, quant_config = args.pt2e_quantize.split("_")
+        assert (
+            backend == "qnn"
+        ), f"The quantization config is for backend {backend} instead of qnn."
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
         qnn_quantizer = QnnQuantizer()
         # more custom quantization are supported including 16a4w etc. default to 8bit quantized
         custom_annotations = ()
+        if quant_config == "8a8w":
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            quant_dtype = QuantDtype.use_8a8w
+            pass
+        elif quant_config == "16a16w":
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            quant_dtype = QuantDtype.use_16a16w
+            qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS)
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            qnn_quantizer.set_bit16_op_quant_config(get_default_16bit_qnn_ptq_config())
+        elif quant_config == "16a4w":
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            quant_dtype = QuantDtype.use_16a4w
+            qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS)
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            qnn_quantizer.set_bit16_op_quant_config(get_16a4w_qnn_ptq_config())
+            qnn_quantizer.set_per_channel_weight_dtype(
+                weight_dtype_for_16bit_act="int4"
+            )
+        else:
+            raise AssertionError(
+                f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w."
+            )
+
+        assert (
+            args.quantization_mode is None
+        ), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
         qnn_quantizer.add_custom_quant_annotations(custom_annotations)
         quantizers.append(qnn_quantizer)
@@ -708,25 +830,38 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
                 "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html"
             )
 
-        # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
-        backend_options = generate_htp_compiler_spec(use_fp16=False)
+        use_fp16 = True
+        skip_node_op_set = {}
+        if args.pt2e_quantize:
+            use_fp16 = False
+            # TODO: fix the lowering error without skipping nodes
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            if quant_dtype == QuantDtype.use_8a8w:
+                raise NotImplementedError("8a8w for llama is still under development")
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            elif quant_dtype == QuantDtype.use_16a16w:
+                raise NotImplementedError("16a16w for llama is still under development")
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            elif quant_dtype == QuantDtype.use_16a4w:
+                raise NotImplementedError("16a4w for llama is still under development")
         partitioners.append(
             # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
             QnnPartitioner(
                 # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
                 generate_qnn_executorch_compiler_spec(
                     # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
                     soc_model=QcomChipset.SM8650,  # default to SM8650
-                    backend_options=backend_options,
+                    # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+                    backend_options=generate_htp_compiler_spec(use_fp16=use_fp16),
                     debug=False,
                     saver=False,
                 ),
                 skip_node_id_set={},
-                skip_node_op_set={},
+                skip_node_op_set=skip_node_op_set,
            )
         )
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
-        _transform(builder_exported_to_edge.export_program())
+        _transform(builder_exported_to_edge.edge_manager.exported_program())
 
     if args.generate_etrecord:
         if not builder_exported_to_edge.edge_manager:

examples/models/llama2/tests/TARGETS

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
+
+oncall("executorch")
+
+python_unittest(
+    name = "test_simple_sdpa",
+    srcs = [
+        "test_simple_sdpa.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/examples/models/llama2:export_library",
+        "//executorch/examples/models/llama2:llama_transformer",
+    ],
+)
