diff --git a/backends/qualcomm/_passes/insert_requantize.py b/backends/qualcomm/_passes/insert_requantize.py index 11aad02a0cf..83b729f3c44 100644 --- a/backends/qualcomm/_passes/insert_requantize.py +++ b/backends/qualcomm/_passes/insert_requantize.py @@ -89,15 +89,9 @@ def _single_output_annotation( requantize_dict = n.meta.pop(QCOM_REQUANTIZE) # {quant_attr: user_node_name_list} group_quant_attr_dict = self._invert_dict(requantize_dict) - # TODO: If users of the node contain output node, - # we replace the node with to_copy op. However, it would - # be problem when the node has multiple to_copy ops - add_output = len(group_quant_attr_dict) == 1 for hashable_quant_attr, user_nodes in group_quant_attr_dict.items(): user_nodes_copy = user_nodes.copy() - if add_output: - user_nodes_copy.append("output") self._insert_to_copy(gm, n, dict(hashable_quant_attr), user_nodes_copy) def _insert(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule: diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py index d1c1757cc10..33237f3bebe 100644 --- a/backends/qualcomm/quantizer/custom_annotation.py +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -14,17 +14,80 @@ QuantizationConfig, ) from executorch.exir.dialects._ops import ops as exir_ops -from torch.ao.quantization.observer import MinMaxObserver +from torch.ao.quantization.observer import FixedQParamsObserver, MinMaxObserver from torch.ao.quantization.quantizer import ( QuantizationAnnotation, + QuantizationSpec, SharedQuantizationSpec, ) from torch.fx import Node -def annotate_matmul_16a8w( # noqa: C901 - gm: torch.fx.GraphModule, traverse_input1=True -) -> None: +def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None: + def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None: + input_qspec_map = {} + input_act = node.args[0] + input_spec = quantization_config.input_activation + input_qspec_map[input_act] = input_spec + + weight = node.args[1] + input_qspec_map[weight] = quantization_config.weight + + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quantization_config.output_activation, + _annotated=True, + ) + + quantization_config_16a8w_per_channel = get_ptq_per_channel_quant_config( + torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver + ) + for node in gm.graph.nodes: + if node.op == "call_function" and node.target == torch.ops.aten.conv2d.default: + if "nn_module_stack" in node.meta: + module_values_list = list(node.meta["nn_module_stack"].values()) + full_qualified_name = module_values_list[-1][0] + if full_qualified_name == "output.conv": + annotate_conv2d( + node, quantization_config=quantization_config_16a8w_per_channel + ) + + +def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict): + for node in gm.graph.nodes: + if node.op == "output": + for index, prefill_output in enumerate(node.args[0]): + kv_quant_attr = kv_quant_attrs[index] + fixed_observer = FixedQParamsObserver.with_args( + scale=kv_quant_attr[0], + zero_point=kv_quant_attr[1], + quant_min=kv_quant_attr[2], + quant_max=kv_quant_attr[3], + dtype=kv_quant_attr[4], + qscheme=torch.torch.per_tensor_affine, + ) + + fixed_output_spec = QuantizationSpec( + quant_min=kv_quant_attr[2], + quant_max=kv_quant_attr[3], + dtype=kv_quant_attr[4], + ch_axis=0, + observer_or_fake_quant_ctr=fixed_observer, + ) + + input_qspec_map = {} + for input in prefill_output.args: + if 
isinstance(input, Node): + input_qspec_map[input] = fixed_output_spec + + prefill_output.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=fixed_output_spec, + _annotated=True, + ) + + +def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901 """ This function is specific for matmul op 16a8w. For k, we will tag such as the below, and @@ -142,8 +205,7 @@ def annotate_matmul_input1(node: Node): for node in gm.graph.nodes: if node.op == "call_function" and node.target == torch.ops.aten.matmul.default: annotate_matmul(node, quantization_config_16a8w) - if traverse_input1: - annotate_matmul_input1(node.args[1]) + annotate_matmul_input1(node.args[1]) def custom_annotate_llama_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901 diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 30ed34032f4..a6af5335331 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -3529,7 +3529,7 @@ def test_stories_single_llama(self): cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama2/llama.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", "--artifact", self.artifact_dir, "--build_folder", @@ -3556,6 +3556,8 @@ def test_stories_single_llama(self): "16a4w", "--temperature", "0", + "--llama_model", + "stories110m", ] if self.host: cmds.extend(["--host", self.host]) diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index a8e16bb5c9f..55969f937ee 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -84,11 +84,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) # build qnn_executor_runner add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/executor_runner) -# build qnn_llama_runner for llama2 -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama2) - -# build qnn_llama_runner for llama3.2 -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama3_2) +# build qnn_llama_runner for llama +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama) # build qaihub_llama2_7b_runner and qaihub_llama3_8b_runner add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama) diff --git a/examples/qualcomm/README.md b/examples/qualcomm/README.md index 3d5eb429397..bdac58d2bfc 100644 --- a/examples/qualcomm/README.md +++ b/examples/qualcomm/README.md @@ -4,10 +4,10 @@ This directory contains examples for some AI models. We have seperated the example scripts into the following subfolders, please refer to [README.md](../../backends/qualcomm/README.md) for the example scripts' directory structure: -1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script. +1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). 
On the other hand, if a model has its own runner, such as [llama](./oss_scripts/llama/qnn_llama_runner.cpp), use the customized runner to execute the model. The customized runner should be located in the same folder as the model's Python script. 2. oss_scripts: OSS stands for Open Source Software. This folder contains python scripts for open source models. Some models under this folder might also have their own customized runner. - For example, [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model. + For example, [llama](./oss_scripts/llama/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model. 3. qaihub_scripts: QAIHub stands for [Qualcomm AI Hub](https://aihub.qualcomm.com/). On QAIHub, users can find pre-compiled context binaries, a format used by QNN to save its models. This provides users with a new option for model deployment. Different from oss_scripts & scripts, where the example scripts convert a model from nn.Module to ExecuTorch .pte files, qaihub_scripts provides example scripts for converting pre-compiled context binaries to ExecuTorch .pte files. Additionally, users can find customized example runners specific to the QAIHub models for execution. For example, [qaihub_llama2_7b](./qaihub_scripts/llama2/qaihub_llama2_7b.py) is a script converting context binaries to ExecuTorch .pte files, and [qaihub_llama2_7b_runner](./qaihub_scripts/llama2/qaihub_llama2_7b_runner.cpp) is a customized example runner to execute llama2 .pte files. Please be aware that context binaries downloaded from QAIHub are tied to a specific QNN SDK version. Before executing the scripts and runner, please ensure that you are using the QNN SDK version that matches the context binary. Please refer to [Check context binary version](#check-context-binary-version) for a tutorial on how to check the QNN version of a context binary.
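For context, the sketch below is illustrative only and not part of this patch. It shows how the new annotators introduced in `custom_annotation.py` above are combined into the `custom_annotations` tuple that `llama.py`, further down in this diff, passes to the quantizer in hybrid mode. The helper name `build_custom_annotations` is hypothetical; `llama_model` and `kv_quant_attrs` mirror the names used in `llama.py`.

```python
# Illustrative sketch (assumed helper, not part of this patch): composing the
# custom annotations the same way llama.py does for hybrid mode.
from functools import partial
from typing import Optional

from executorch.backends.qualcomm.quantizer.custom_annotation import (
    annotate_linear_16a8w_in_affine_layer,
    annotate_matmul_16a8w,
    annotate_prefill_kv_output,
)


def build_custom_annotations(llama_model: str, kv_quant_attrs: Optional[dict]):
    # The matmul 16a8w annotation is applied for every supported model.
    custom_annotations = (annotate_matmul_16a8w,)
    # stories110m additionally tags its affine output layer (lowered to conv2d).
    if llama_model == "stories110m":
        custom_annotations += (annotate_linear_16a8w_in_affine_layer,)
    # In hybrid mode, the KV-mode graph is quantized first; its output quant
    # attrs are then pinned onto the prefill graph's outputs.
    if kv_quant_attrs is not None:
        custom_annotations += (
            partial(annotate_prefill_kv_output, kv_quant_attrs=kv_quant_attrs),
        )
    return custom_annotations
```

In the patch itself, `kv_quant_attrs` is collected from the output node of the already-quantized KV graph, presumably so the prefill and KV graphs agree on the quantization encoding of the shared cache.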
diff --git a/examples/qualcomm/oss_scripts/llama3_2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt similarity index 65% rename from examples/qualcomm/oss_scripts/llama3_2/CMakeLists.txt rename to examples/qualcomm/oss_scripts/llama/CMakeLists.txt index 93b35a697c6..c92711d9eb8 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt @@ -18,38 +18,35 @@ target_link_libraries( ) target_link_options_shared_lib(custom_ops) -# preprocess qnn runner src files for llama3.2 -set(_llama3_2_runner__srcs ${_llama_runner__srcs}) -list(TRANSFORM _llama3_2_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") -list(FILTER _llama3_2_runner__srcs EXCLUDE REGEX ".*(/runner/).*") +# preprocess qnn runner src files for llama +set(_llama_runner__srcs ${_llama_runner__srcs}) +list(TRANSFORM _llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") +list(FILTER _llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*") list( PREPEND - _llama3_2_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/qnn_llama3_2_runner.cpp + _llama_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.h ) -list( - APPEND _llama3_2_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp -) list( APPEND - _llama3_2_runner__srcs + _llama_runner__srcs + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama/tokenizer/llama_tiktoken.cpp ) -# build qnn llama3.2 1b runner -add_executable(qnn_llama3_2_runner ${_llama3_2_runner__srcs}) +# build qnn llama runner +add_executable(qnn_llama_runner ${_llama_runner__srcs}) target_include_directories( - qnn_llama3_2_runner PUBLIC ${_common_include_directories} + qnn_llama_runner PUBLIC ${_common_include_directories} ) target_link_libraries( - qnn_llama3_2_runner + qnn_llama_runner qnn_executorch_backend executorch_core extension_data_loader @@ -60,8 +57,8 @@ target_link_libraries( custom_ops ) target_compile_options( - qnn_llama3_2_runner PUBLIC ${_common_compile_options} + qnn_llama_runner PUBLIC ${_common_compile_options} ) set_target_properties( - qnn_llama3_2_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" + qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" ) diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md new file mode 100644 index 00000000000..79c20180d69 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -0,0 +1,70 @@ +# Summary + +## Overview +This file provides you the instructions to run LLAMA model with different parameters via Qualcomm HTP backend. We currently support the following models: + 1. LLAMA2 Stories 110M + 2. LLAMA3.2 1B + 3. LLAMA3.2 3B (WIP) +We offer the following modes to execute the model: + +Prefill Mode: This is also known as batch prefill mode, where the model takes in a list of tokens as input and generates the next token along with the key-value (KV) cache for all tokens. This mode is efficient for generating the initial sequence of tokens (usually the user's prompt). + +KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt. 
+ +Hybrid Mode: Hybrid mode leverages the strengths of both batch prefill and KV cache modes to optimize token generation speed. Initially, it uses prefill mode to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens. + + +## Instructions +### Note +1. For hybrid mode, the export time will be longer and can take up to 1-4 hours to complete, depending on the specific model users are exporting. +2. When exporting a hybrid mode model, memory consumption will be higher. Taking LLAMA3.2 1B as an example, please ensure the device has at least 80 GB of memory and swap space. + + +### Step 1: Setup +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. +2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. + +### Step 2: Prepare Model + +#### LLAMA2 +Download and prepare the stories110M model: + +```bash +# tokenizer.model & stories110M.pt: +wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" +wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" + +# tokenizer.bin: +python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin + +# params.json: +echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json +``` + +#### LLAMA3.2 +Follow the [instructions](https://www.llama.com/) to download models. +At the end of this step, users should have the following files ready: `consolidated.00.pth`, `params.json`, and `tokenizer.model`. + + +### Step 3: Run default examples using hybrid mode. +#### LLAMA2 +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M.pt --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --llama_model stories110m --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "Once upon a time" +``` + +#### LLAMA3.2 +Default example using hybrid mode. +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" +``` + +### Additional Configs when running the script +If you would like to compile the model only, we have provided the flag `--compile_only`. Taking LLAMA3.2 as an example: +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --compile_only +``` + +On the other hand, if you already have a pre-compiled .pte model, you can perform inference by providing the flag `--pre_gen_pte` and specifying the folder that contains the .pte model.
Taking LLAMA3.2 as an example: +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +``` \ No newline at end of file diff --git a/examples/qualcomm/oss_scripts/llama2/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS similarity index 80% rename from examples/qualcomm/oss_scripts/llama2/TARGETS rename to examples/qualcomm/oss_scripts/llama/TARGETS index b0f5ea7f640..7043e8b9275 100644 --- a/examples/qualcomm/oss_scripts/llama2/TARGETS +++ b/examples/qualcomm/oss_scripts/llama/TARGETS @@ -5,7 +5,6 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") oncall("executorch") - python_library( name = "static_llama", srcs = [ @@ -19,9 +18,12 @@ python_library( python_binary( name = "llama", srcs = ["llama.py"], - main_function = "executorch.examples.qualcomm.oss_scripts.llama2.llama.main", + main_function = "executorch.examples.qualcomm.oss_scripts.llama.llama.main", + preload_deps = [ + "//executorch/extension/llm/custom_ops:model_sharding_py", + ], deps = [ - ":static_llama", + "//executorch/examples/qualcomm/oss_scripts/llama:static_llama", "//caffe2:torch", "//executorch/extension/pybindings:aten_lib", "//executorch/backends/qualcomm/partition:partition", @@ -38,6 +40,8 @@ runtime.command_alias( name = "llama_qnn", env = { "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), + # Place holder to pass the QNN_SDK_ROOT check in executorch/examples/qualcomm/utils.py + "QNN_SDK_ROOT": "", }, exe = ":llama", ) diff --git a/examples/qualcomm/oss_scripts/llama3_2/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py similarity index 84% rename from examples/qualcomm/oss_scripts/llama3_2/llama.py rename to examples/qualcomm/oss_scripts/llama/llama.py index a18690e941d..0af0f55b88f 100755 --- a/examples/qualcomm/oss_scripts/llama3_2/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -22,7 +22,9 @@ from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.quantizer.custom_annotation import ( + annotate_linear_16a8w_in_affine_layer, annotate_matmul_16a8w, + annotate_prefill_kv_output, ) from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype @@ -38,7 +40,8 @@ get_soc_to_chipset_map, update_spill_fill_size, ) -from executorch.examples.qualcomm.oss_scripts.llama2.model.static_llama import ( +from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken +from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import ( LlamaModel, ModelArgs, ) @@ -55,6 +58,9 @@ from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from executorch.extension.llm.custom_ops import model_sharding from executorch.extension.llm.export.builder import DType +from executorch.extension.llm.tokenizer.tokenizer import ( + Tokenizer as SentencePieceTokenizer, +) from executorch.extension.llm.tokenizer.utils import get_tokenizer from torch.ao.quantization.observer import MinMaxObserver @@ -70,21 +76,28 @@ def _kv_calibrate( example_inputs, user_prompts, module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", + tokenizer, max_seq_len=512, ): - sp_model = 
get_tokenizer(tokenizer_model_path) _, atten_mask, _, k_caches, v_caches = example_inputs # TODO: change criteria & support batch inputs if necessary pos = torch.tensor(0, dtype=torch.int32) max_cache_len = max_seq_len - 1 - token_list = sp_model.encode( - user_prompts, bos=True, eos=False, allowed_special="all" - ) + + token_list = [] + # Llama2 tokenizer has no special tokens + if isinstance(tokenizer, SentencePieceTokenizer): + token_list = tokenizer.encode(user_prompts, bos=True, eos=False) + elif isinstance(tokenizer, Tiktoken): + token_list = tokenizer.encode( + user_prompts, bos=True, eos=False, allowed_special="all" + ) + else: + raise RuntimeError("Unkown tokenizer") with torch.no_grad(): - while token_list[-1] != sp_model.eos_id and pos < max_cache_len: + while token_list[-1] != tokenizer.eos_id and pos < max_cache_len: logits, new_k_caches, new_v_caches = module( torch.full((1, 1), token_list[pos], dtype=torch.int32), atten_mask, @@ -106,28 +119,36 @@ def _kv_calibrate( if pos >= len(token_list): token_list.append(torch.argmax(logits[:, -1], dim=-1).item()) - print(f"calibration data:\n{sp_model.decode(token_list)}") + print(f"kv calibration data:\n{tokenizer.decode(token_list)}") def _prefill_calibrate( example_inputs, user_prompts, module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", + tokenizer, max_seq_len=512, ): - sp_model = get_tokenizer(tokenizer_model_path) _, atten_mask = example_inputs max_cache_len = max_seq_len - 1 # TODO: change criteria & support batch inputs if necessary - token_list = sp_model.encode( - user_prompts, bos=True, eos=False, allowed_special="all" - ) + + token_list = [] + # Llama2 tokenizer has no special tokens + if isinstance(tokenizer, SentencePieceTokenizer): + token_list = tokenizer.encode(user_prompts, bos=True, eos=False) + elif isinstance(tokenizer, Tiktoken): + token_list = tokenizer.encode( + user_prompts, bos=True, eos=False, allowed_special="all" + ) + else: + raise RuntimeError("Unkown tokenizer") + pos = len(token_list) with torch.no_grad(): - while token_list[-1] != sp_model.eos_id and pos < max_cache_len: + while token_list[-1] != tokenizer.eos_id and pos < max_cache_len: tmp_token_list = torch.tensor(token_list).reshape(1, -1) if pos < max_cache_len: tmp_token_list = torch.cat( @@ -144,14 +165,14 @@ def _prefill_calibrate( token_list.append(torch.argmax(logits[:, pos - 1], dim=-1).item()) pos += 1 - print(f"calibration data:\n{sp_model.decode(token_list)}") + print(f"prefill calibration data:\n{tokenizer.decode(token_list)}") def calibrate( example_inputs, user_prompts, module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", + tokenizer, max_seq_len=512, ): if len(example_inputs) == 2: @@ -159,7 +180,7 @@ def calibrate( example_inputs, user_prompts, module, - tokenizer_model_path, + tokenizer, max_seq_len, ) elif len(example_inputs) == 5: @@ -167,7 +188,7 @@ def calibrate( example_inputs, user_prompts, module, - tokenizer_model_path, + tokenizer, max_seq_len, ) else: @@ -280,7 +301,7 @@ def _tag_ios(self, gm: torch.fx.GraphModule, fixed_point_type): return quant_attrs - def quantize(self, quant_dtype, args, custom_annotations=()): + def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()): self.quant_dtype = quant_dtype quantizer = make_quantizer( quant_dtype=quant_dtype, @@ -303,7 +324,7 @@ def quantize(self, quant_dtype, args, custom_annotations=()): self.get_example_inputs(self.llama_meta["get_use_kv_cache"]), args.prompt, fx_graph_module, - 
tokenizer_model_path=args.tokenizer_model, + tokenizer=tokenizer, max_seq_len=self.llama_meta["get_max_seq_len"], ) @@ -366,7 +387,7 @@ def lowering_modules( if num_sharding > 0: update_spill_fill_size(edge_prog_mgr.exported_program()) exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config) - with open(f"{work_space}/{pte_filename}.pte", "wb") as file: + with open(f"{work_space}/{self.pte_filename}.pte", "wb") as file: exec_prog_mgr.write_to_file(file) def get_example_inputs(self, use_kv_cache=True): @@ -376,7 +397,7 @@ def get_quant_attrs(self): return self.quant_attrs -def compile(args, pte_filename): +def compile(args, pte_filename, tokenizer): os.makedirs(args.artifact, exist_ok=True) start_ts = time.time() @@ -407,13 +428,13 @@ def compile(args, pte_filename): ) elif args.model_mode == "hybrid": llama_instance_list.append( - LlamaModel(prefill_config, output_new_cache_only=False) + LlamaModel(kv_config, output_new_cache_only=True) ) llama_instance_list.append( - LlamaModel(kv_config, output_new_cache_only=True) + LlamaModel(prefill_config, output_new_cache_only=False) ) else: - raise RuntimeError(f"No such model_mode {args.model_mode}.") + raise RuntimeError(f"Unknown model_mode: {args.model_mode}.") if "model" in state_dict: state_dict = state_dict["model"] @@ -467,17 +488,35 @@ def compile(args, pte_filename): if args.ptq: start_quantize_ts = time.time() - for llama_instance in llama_instance_list: - llama_instance.quantize( - quant_dtype=quant_dtype, - args=args, - custom_annotations=( - partial( - annotate_matmul_16a8w, - traverse_input1=llama_instance.llama_meta["get_use_kv_cache"], - ), - ), + custom_annotations = (annotate_matmul_16a8w,) + if args.llama_model == "stories110m": + custom_annotations = custom_annotations + ( + annotate_linear_16a8w_in_affine_layer, ) + if args.ptq != None: + kv_quant_attrs = {} + for i, llama_instance in enumerate(llama_instance_list): + llama_instance.quantize( + quant_dtype=quant_dtype, + args=args, + tokenizer=tokenizer, + custom_annotations=custom_annotations, + ) + # If hybrid mode, we store kv output quant_attrs and apply to prefill output quant_attrs later + if i == 0 and args.model_mode == "hybrid": + output_indices = 0 + for node in llama_instance.llama_model.graph.nodes: + if node.op == "output": + for output in node.args[0]: + kv_quant_attrs[output_indices] = output.args[1:] + output_indices += 1 + break + custom_annotations = custom_annotations + ( + partial( + annotate_prefill_kv_output, + kv_quant_attrs=kv_quant_attrs, + ), + ) end_quantize_ts = time.time() logging.info(f"Time for quantizing: {end_quantize_ts - start_quantize_ts}") @@ -520,7 +559,7 @@ def compile(args, pte_filename): backend_options = generate_htp_compiler_spec( use_fp16=use_fp16, use_multi_contexts=args.num_sharding > 0 ) - graph_names = ["prefill_forward", "kv_forward"] + graph_names = ["kv_forward", "prefill_forward"] compiler_specs = [ generate_qnn_executorch_compiler_spec( soc_model=get_soc_to_chipset_map()[args.model], @@ -633,7 +672,7 @@ def compile(args, pte_filename): call_delegate_inputs_dict=call_delegate_inputs_dict, outputs_dict=outputs_dict, backend_config=executorch_config, - constant_methods=llama_instance_list[1].llama_meta, # kv method meta + constant_methods=llama_instance_list[0].llama_meta, # kv method meta ) with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: exec_prog.write_to_file(file) @@ -662,7 +701,7 @@ def compile(args, pte_filename): input_nodes_dict=input_nodes_dict, output_nodes_dict=output_nodes_dict, 
backend_config=executorch_config, - constant_methods=llama_instance_list[1].llama_meta, # kv method meta + constant_methods=llama_instance_list[0].llama_meta, # kv method meta ) with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: prog_mgr.write_to_file(file) @@ -672,7 +711,7 @@ def compile(args, pte_filename): return quant_attrs -def inference(args, quant_attrs, pte_filename, pre_gen_pte=""): +def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_pte=""): workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" if args.model_mode == "prefill": @@ -682,14 +721,14 @@ def inference(args, quant_attrs, pte_filename, pre_gen_pte=""): elif args.model_mode == "hybrid": eval_mode = 2 else: - raise RuntimeError(f"No such model_mode {args.model_mode}.") + raise RuntimeError(f"Unknown model_mode: {args.model_mode}.") seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len runner_args = " ".join( [ f"--model_path {pte_filename}.pte", "--output_path outputs/outputs.txt", - f"--tokenizer_path {os.path.basename(args.tokenizer_model)}", + f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}", f'--prompt "{args.prompt}"', f"--seq_len {seq_len}", f"--eval_mode {eval_mode}", @@ -702,7 +741,7 @@ def inference(args, quant_attrs, pte_filename, pre_gen_pte=""): runner_cmd = " ".join( [ f"cd {workspace} &&", - f"./qnn_llama3_2_runner {runner_args}", + f"./qnn_llama_runner {runner_args}", ] ) @@ -720,10 +759,10 @@ def inference(args, quant_attrs, pte_filename, pre_gen_pte=""): host_id=args.host, soc_model=args.model, shared_buffer=args.shared_buffer, - runner=f"examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner", + runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner", ) # No pregen inputs, input_list is not required - adb.push(inputs=[], input_list="", files=[args.tokenizer_model]) + adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path]) adb.execute(custom_runner_cmd=runner_cmd) # collect output data @@ -756,8 +795,8 @@ def main(): parser.add_argument( "-a", "--artifact", - help="path for storing generated artifacts and output by this example. Default ./llama3_2_qnn", - default="./llama3_2_qnn", + help="path for storing generated artifacts and output by this example. Default ./llama_qnn", + default="./llama_qnn", type=str, ) @@ -768,6 +807,13 @@ def main(): type=str, ) + parser.add_argument( + "--llama_model", + choices=["stories110m", "llama3_2"], + help="The Llama model to export. Current available options are: [stories110m, llama3_2]", + required=True, + ) + parser.add_argument( "--checkpoint", help="Pass llama checkpoint.", @@ -783,10 +829,9 @@ def main(): ) parser.add_argument( - "--model_size", - help="Determine what runner be used. For llama 3.2, we only support 1B/3B. ", - choices=["1B", "3B"], - required=True, + "--tokenizer_bin", + help="For Llama2. Pass Llama2 tokenizer binary.", + required=False, type=str, ) @@ -806,7 +851,7 @@ def main(): parser.add_argument( "--system_prompt", - help="Tells the model what kind of assistant it should be. For example, You are a helpful AI assistant for travel tips and recommendations. Default is None", + help="For Llama3. Tells the model what kind of assistant it should be. For example, You are a helpful AI assistant for travel tips and recommendations. 
Default is None", default="", type=str, ) @@ -829,7 +874,7 @@ def main(): parser.add_argument( "--pre_gen_pte", - help="Run the Pre-generated llama in the given directory", + help="Run the pre-generated llama in the given directory.", type=str, ) @@ -867,26 +912,46 @@ def main(): exit("Cannot set both compile_only and pre_gen_pte as true") if args.model_mode == "kv": - pte_filename = "kv_llama3_2_qnn" + pte_filename = "kv_llama_qnn" elif args.model_mode == "prefill": - pte_filename = "prefill_llama3_2_qnn" + pte_filename = "prefill_llama_qnn" elif args.model_mode == "hybrid": assert ( args.kv_seq_len >= args.prefill_seq_len ), "Please ensure kv_seq_len is >= prefill_seq_len" - pte_filename = "hybrid_llama3_2_qnn" + pte_filename = "hybrid_llama_qnn" else: - raise RuntimeError(f"No such model_mode {args.model_mode}.") + raise RuntimeError(f"Unknown model_mode: {args.model_mode}.") + + tokenizer = get_tokenizer(args.tokenizer_model) + runtime_tokenizer_path = "" + if args.llama_model == "stories110m": + assert isinstance( + tokenizer, SentencePieceTokenizer + ), f"Wrong tokenizer provided for stories110m." + assert ( + args.tokenizer_bin is not None + ), "Please provide tokenizer_bin for stories110m." + runtime_tokenizer_path = args.tokenizer_bin + elif args.llama_model == "llama3_2": + assert isinstance( + tokenizer, Tiktoken + ), f"Wrong tokenizer provided for llama3_2." + runtime_tokenizer_path = args.tokenizer_model + else: + raise RuntimeError(f"Unknown llama_model: {args.llama_model}.") if args.pre_gen_pte: quant_attrs = json.load( open(f"{args.pre_gen_pte}/{pte_filename}_quant_attrs.txt") ) - inference(args, quant_attrs, pte_filename, args.pre_gen_pte) + inference( + args, quant_attrs, pte_filename, runtime_tokenizer_path, args.pre_gen_pte + ) exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") if args.compile_only: - quant_attrs = compile(args, pte_filename) + quant_attrs = compile(args, pte_filename, tokenizer) if quant_attrs: json.dump( { @@ -900,7 +965,7 @@ def main(): exit(f"Finish compile_only and save to {args.artifact}") try: - quant_attrs = compile(args, pte_filename) + quant_attrs = compile(args, pte_filename, tokenizer) if quant_attrs: logging.info( f"Logit scale: {quant_attrs['scale']}; Logit offset: {quant_attrs['zero_point']}" @@ -914,7 +979,7 @@ def main(): ) else: logging.warning("Quant attributes of the logit is None.") - inference(args, quant_attrs, pte_filename) + inference(args, quant_attrs, pte_filename, runtime_tokenizer_path) except Exception as e: if args.ip and args.port != -1: with Client((args.ip, args.port)) as conn: diff --git a/examples/qualcomm/oss_scripts/llama2/model/static_llama.py b/examples/qualcomm/oss_scripts/llama/model/static_llama.py similarity index 100% rename from examples/qualcomm/oss_scripts/llama2/model/static_llama.py rename to examples/qualcomm/oss_scripts/llama/model/static_llama.py diff --git a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp similarity index 85% rename from examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp rename to examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp index 2af882580e1..7660952ef0c 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp @@ -9,12 +9,13 @@ /** * @file * - * This tool can run Llama3.2 1B/3B with Qualcomm AI Engine Direct. 
+ * This tool can run Llama2 110M, Llama3.2 1B / 3B(WIP) with Qualcomm AI Engine + * Direct. * */ #include -#include +#include #include #include #include @@ -22,7 +23,7 @@ DEFINE_string( model_path, - "qnn_llama2.pte", + "kv_llama_qnn.pte", "Model serialized in flatbuffer format."); DEFINE_string( @@ -42,11 +43,11 @@ DEFINE_double( DEFINE_int32( seq_len, 128, - "Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens."); + "Total number of tokens to generate (prompt + output)."); DEFINE_int32( eval_mode, - 0, + 1, "0: PromptProcessor(prefill) / 1: TokenGenerator(kv) / 2: HybridMode (prefill+kv)"); DEFINE_double(logits_scale, 0.0, "Logits scale"); DEFINE_int32(logits_offset, 0, "Logits offset"); diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_memory.cpp similarity index 99% rename from examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp rename to examples/qualcomm/oss_scripts/llama/runner/io_memory.cpp index 941ff97685b..22efd5a3344 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/io_memory.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include #include @@ -403,7 +403,7 @@ void HybridMemory::update_prefill_to_kv_io( // If prompt len is 30, prefill will handle to pos = 30. // At this point, pos should be 31. for (int i = 0; i < pos + 1; i++) { - ptr->kv_attention_mask[kv_cache_len_ - i] = 0; + ptr->kv_attention_mask[kv_cache_len_ - i] = 65535; } // update v_cache diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h b/examples/qualcomm/oss_scripts/llama/runner/io_memory.h similarity index 100% rename from examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h rename to examples/qualcomm/oss_scripts/llama/runner/io_memory.h diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp similarity index 94% rename from examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp rename to examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 02a53861b89..e06d52fbb37 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -10,7 +10,7 @@ // logic. The module takes in a string as input and emits a string as output. #include -#include +#include #include #include #include @@ -57,7 +57,7 @@ Runner::Runner( ET_LOG(Info, "creating module: model_path=%s", models_path[i].c_str()); } ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); - ET_LOG(Info, "eval mode=%d", eval_mode); + ET_LOG(Info, "eval mode=%d", eval_mode_); } bool Runner::is_loaded() const { @@ -168,12 +168,14 @@ Error Runner::load() { // llama2 tokenizer tokenizer_ = std::make_unique(); err = tokenizer_->load(tokenizer_path_); + llama_version_ = LlamaVersion::kLlama2; ET_CHECK_MSG( err == Error::Ok, "failed to load tokenizer %s", tokenizer_path_.c_str()); } else { eos_id_.insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); + llama_version_ = LlamaVersion::kLlama3; } bos_id_ = tokenizer_->bos_tok(); eos_id_.insert(tokenizer_->eos_tok()); @@ -217,8 +219,7 @@ int32_t Runner::logitsToToken(const Tensor& logits_tensor, int64_t pos) { // offset to the meaningful logit we want. 
if (logits_tensor.sizes().data()[1] > 1) { - auto vocab_size = logits_tensor.size(2); - logits_last += pos * vocab_size; + logits_last += pos * vocab_size_; } // dequantize @@ -277,17 +278,27 @@ Error Runner::generate( ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); - if (!system_prompt.empty()) { - prompt_.append("<|start_header_id|>system<|end_header_id|>\n\n"); - prompt_.append(system_prompt); - prompt_.append("<|eot_id|>"); - } - prompt_.append("<|start_header_id|>user<|end_header_id|>\n\n"); - prompt_.append(prompt); - prompt_.append("<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); - - if (token_callback) { - token_callback("<|begin_of_text|>"); + switch (llama_version_) { + case LlamaVersion::kLlama2: + prompt_.append(prompt); + break; + case LlamaVersion::kLlama3: + if (!system_prompt.empty()) { + prompt_.append("<|start_header_id|>system<|end_header_id|>\n\n"); + prompt_.append(system_prompt); + prompt_.append("<|eot_id|>"); + } + prompt_.append("<|start_header_id|>user<|end_header_id|>\n\n"); + prompt_.append(prompt); + prompt_.append( + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); + if (token_callback) { + token_callback("<|begin_of_text|>"); + } + break; + default: + ET_CHECK_MSG(false, "unsupported llama version"); + break; } int max_seq_len = std::max(prefill_cache_len_, kv_cache_len_) + 1; @@ -318,14 +329,14 @@ Error Runner::generate( int64_t pos = 0, prev_token, cur_token = prompt_tokens[0]; HybridMemory::IO* ptr = static_cast(io_mem_->get_mutable_ptr()); + if (token_callback) { + token_callback(prompt_); + } auto prefill_execute = [&](const std::string& method_name) { for (int i = 0; i < num_prompt_tokens; i++) { ptr->prefill_input_toks[i] = static_cast(prompt_tokens[i]); } - if (token_callback) { - token_callback(prompt_); - } pos = num_prompt_tokens - 1; cur_token = prompt_tokens[pos]; @@ -389,7 +400,7 @@ Error Runner::generate( auto piece_res = tokenizer_->decode(prev_token, cur_token); ET_CHECK(piece_res.ok()); - if (token_callback) { + if (token_callback && pos >= num_prompt_tokens) { token_callback(piece_res.get().c_str()); } diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h similarity index 95% rename from examples/qualcomm/oss_scripts/llama3_2/runner/runner.h rename to examples/qualcomm/oss_scripts/llama/runner/runner.h index 75ad6402199..aaf79360bdb 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -73,6 +73,10 @@ class Runner { get_methods_meta(std::string& method_name); private: + enum LlamaVersion { + kLlama2 = 0, + kLlama3, + }; template T getMetadataHelper(std::string method_name, T default_val); int32_t logitsToToken( @@ -104,6 +108,7 @@ class Runner { std::string prefill_forward_name_; std::string kv_forward_name_; std::vector method_names_; + LlamaVersion llama_version_; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/targets.bzl b/examples/qualcomm/oss_scripts/llama/targets.bzl new file mode 100644 index 00000000000..9780da0369d --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/targets.bzl @@ -0,0 +1,54 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_oss_build_kwargs", "runtime") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") + +def define_common_targets(): + runtime.cxx_library( 
+ name = "runner_lib", + srcs = glob( + [ + "runner/*.cpp", + ], + ), + exported_headers = glob([ + "runner/*.h", + ]), + compiler_flags = [ + "-Wno-global-constructors", + "-Wunused-command-line-argument", + ], + deps = [ + "//executorch/extension/llm/runner:stats", + "//executorch/extension/tensor:tensor", + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + ], + exported_deps = [ + "//executorch/extension/module:module", + "//executorch/extension/llm/sampler:sampler", + "//executorch/examples/models/llama/tokenizer:tiktoken", + "//executorch/extension/llm/tokenizer:bpe_tokenizer", + "//executorch/extension/evalue_util:print_evalue", + "//executorch/backends/qualcomm/runtime:runtime", + ], + external_deps = [ + "gflags", + ], + **get_oss_build_kwargs() + ) + + runtime.cxx_binary( + name = "qnn_llama_runner", + srcs = [ + "qnn_llama_runner.cpp", + ], + compiler_flags = [ + "-Wno-global-constructors", + ], + deps = [ + ":runner_lib", + "//executorch/extension/threadpool:threadpool", # this depeneency shouldn't be needed. But it fails to build.. + ], + external_deps = [ + "gflags", + ], + **get_oss_build_kwargs() + ) diff --git a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt deleted file mode 100644 index 61a2ecda56b..00000000000 --- a/examples/qualcomm/oss_scripts/llama2/CMakeLists.txt +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set(_qnn_llama_runner__srcs ${_llama_runner__srcs}) - -# preprocess qnn llama runner src files -list(TRANSFORM _qnn_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") -list(FILTER _qnn_llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*") -list( - PREPEND - _qnn_llama_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/runner.h -) - -# build qnn llama runner -add_executable(qnn_llama_runner ${_qnn_llama_runner__srcs}) -target_include_directories( - qnn_llama_runner PUBLIC ${_common_include_directories} -) -target_link_libraries( - qnn_llama_runner - qnn_executorch_backend - full_portable_ops_lib - extension_data_loader - extension_module - extension_tensor - gflags - re2::re2 -) -target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options}) -set_target_properties( - qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" -) diff --git a/examples/qualcomm/oss_scripts/llama2/README.md b/examples/qualcomm/oss_scripts/llama2/README.md deleted file mode 100644 index d83902a6de8..00000000000 --- a/examples/qualcomm/oss_scripts/llama2/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Summary - -## Overview -This file provides you the instructions to run LLAMA2 with different parameters via Qualcomm HTP backend. Following settings support for Stories 110M - -Please check corresponding section for more information. - -## Stories 110M -This example demonstrates how to run a smaller LLAMA2, stories110M on mobile via Qualcomm HTP backend. Model architecture is fine-tuned specifically for HTP to accelerate the performance. Weight is quantized via PTQ quantization to fit the model on a phone. - -### Instructions -#### Step 1: Setup -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. 
Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. - -#### Step2: Prepare Model -Download and preapre stories110M model - -```bash -# tokenizer.model & stories110M.pt: -wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" -wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" - -# tokenizer.bin: -python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin - -# params.json: -echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json -``` - -#### Step3: Run default examples -Default example generates the story based on the given prompt, "Once". -```bash -# 16a4w quant: -python examples/qualcomm/oss_scripts/llama2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once" -``` - -#### (Note) Customized PTQ data set -User prompts are used for PTQ calibration data. Take the examples above, the word "Once" is the only word for PTQ. If you want to observe more data during the calibration time. Please add more prompts to the args `--prompt`. \ No newline at end of file diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py deleted file mode 100755 index 2a2968362ac..00000000000 --- a/examples/qualcomm/oss_scripts/llama2/llama.py +++ /dev/null @@ -1,690 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# TODO: reenable pyre after fixing the issues -# pyre-ignore-all-errors - -import codecs -import getpass -import json -import os -import time -from multiprocessing.connection import Client - -import torch -from executorch.backends.qualcomm._passes.build_quant_io import BuildQuantIo - -from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner - -from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype -from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset -from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO -from executorch.backends.qualcomm.utils.utils import ( - capture_program, - convert_linear_to_conv2d, - generate_htp_compiler_spec, - generate_qnn_executorch_compiler_spec, - get_soc_to_chipset_map, -) -from executorch.examples.qualcomm.oss_scripts.llama2.model.static_llama import ( - LlamaModel, - ModelArgs, -) -from executorch.examples.qualcomm.utils import ( - make_output_dir, - make_quantizer, - setup_common_args_and_variables, - SimpleADB, -) -from executorch.exir import EdgeCompileConfig, EdgeProgramManager -from executorch.exir.capture._config import ExecutorchBackendConfig -from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.extension.llm.export.builder import DType - -from sentencepiece import SentencePieceProcessor -from torch.ao.quantization.observer import MinMaxObserver -from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e - - -pte_filename = "llama2_qnn" - - -def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None: - """ - This function is specific for matmul op 16a8w. 
- """ - - from executorch.backends.qualcomm.quantizer.annotators import QUANT_ANNOTATION_KEY - from executorch.backends.qualcomm.quantizer.quantizer import ( - get_16a8w_qnn_ptq_config, - get_8a8w_qnn_ptq_config, - QuantizationConfig, - ) - from torch.ao.quantization.quantizer import ( - QuantizationAnnotation, - SharedQuantizationSpec, - ) - from torch.fx import Node - - def annotate_matmul(node: Node, quantization_config: QuantizationConfig): - input_qspec_map = {} - input_act = node.args[0] - input_spec = quantization_config.input_activation - input_qspec_map[input_act] = input_spec - - input_act1 = node.args[1] - input_spec1 = quantization_config.weight - input_qspec_map[input_act1] = input_spec1 - - node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=quantization_config.output_activation, - _annotated=True, - ) - - def annotate_cat(node: Node, quantization_config: QuantizationConfig): - input_nodes = node.args[0] - - first_input_node = input_nodes[0] - input_qspec_map = {} - input_qspec_map[first_input_node] = quantization_config.input_activation - share_qparams_with_input_act0_qspec = SharedQuantizationSpec( - (first_input_node, node) - ) - - for input_node in input_nodes[1:]: - if input_node not in input_qspec_map: - input_qspec_map[input_node] = share_qparams_with_input_act0_qspec - - node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=share_qparams_with_input_act0_qspec, - _annotated=True, - ) - - def annotate_single_in_single_out( - node: Node, quantization_config: QuantizationConfig - ) -> None: - input_qspec_map = {} - input_act = node.args[0] - input_qspec_map[input_act] = quantization_config.input_activation - - node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=quantization_config.output_activation, - _annotated=True, - ) - - def annotate_matmul_input1(node: Node): - quantization_config_8a8w = get_8a8w_qnn_ptq_config(act_symmetric=True) - while isinstance(node, Node) and node.op == "call_function": - if node.target in [ - torch.ops.aten.permute.default, - torch.ops.aten.transpose.int, - ]: - annotate_single_in_single_out(node, quantization_config_8a8w) - node = node.args[0] - elif node.target == torch.ops.aten.cat.default: - annotate_cat(node, quantization_config_8a8w) - node = node.args[0][0] - else: - node = node.args[0] - - quantization_config_16a8w = get_16a8w_qnn_ptq_config() - - for node in gm.graph.nodes: - if node.op == "call_function" and node.target == torch.ops.aten.matmul.default: - annotate_matmul(node, quantization_config_16a8w) - annotate_matmul_input1(node.args[1]) - - -def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None: - from executorch.backends.qualcomm.quantizer.annotators import QUANT_ANNOTATION_KEY - from executorch.backends.qualcomm.quantizer.quantizer import ( - get_ptq_per_channel_quant_config, - QuantizationConfig, - ) - from torch.ao.quantization.quantizer import QuantizationAnnotation - from torch.fx import Node - - def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None: - input_qspec_map = {} - input_act = node.args[0] - input_spec = quantization_config.input_activation - input_qspec_map[input_act] = input_spec - - weight = node.args[1] - input_qspec_map[weight] = quantization_config.weight - - node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - 
output_qspec=quantization_config.output_activation, - _annotated=True, - ) - - quantization_config_16a8w_per_channel = get_ptq_per_channel_quant_config( - torch.uint16, weight_dtype=torch.int8 - ) - for node in gm.graph.nodes: - if node.op == "call_function" and node.target == torch.ops.aten.conv2d.default: - if "nn_module_stack" in node.meta: - module_values_list = list(node.meta["nn_module_stack"].values()) - full_qualified_name = module_values_list[0][0] - if full_qualified_name == "L['self'].llama.output": - annotate_conv2d( - node, quantization_config=quantization_config_16a8w_per_channel - ) - - -def _kv_calibrate( - example_inputs, - user_prompts, - module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", - max_seq_len=512, -): - sp_model = SentencePieceProcessor(model_file=tokenizer_model_path) - _, atten_mask, _, k_caches, v_caches = example_inputs - - # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int32) - token_list = [sp_model.bos_id()] - for prompt in user_prompts.split(): - token_list += sp_model.encode(prompt) - - def sample_top_p(probs: torch.Tensor, top_p: float) -> torch.Tensor: - probs_sort, probs_indices = torch.sort(probs, dim=-1, descending=True) - probs_sum = torch.cumsum(probs_sort, dim=-1) - mask = probs_sum - probs_sort > top_p - probs_sort[mask] = 0 - probs_sort /= probs_sort.sum(dim=-1, keepdim=True) - next_token = torch.multinomial(probs_sort, num_samples=1) - return probs_indices.gather(dim=-1, index=next_token) - - with torch.no_grad(): - while token_list[-1] != sp_model.eos_id() and pos < max_seq_len - 1: - logits, new_k_caches, new_v_caches = module( - torch.full((1, 1), token_list[pos]), - atten_mask, - torch.full((1, 1), pos), - *k_caches, - *v_caches, - ) - k_caches = [ - torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1) - for i, k_cache in enumerate(k_caches) - ] - v_caches = [ - torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1) - for i, v_cache in enumerate(v_caches) - ] - - pos += 1 - atten_mask[0][-pos - 1] = 0 - if pos >= len(token_list): - probs = torch.softmax(logits[:, -1] / 0.8, dim=-1) - token_list.append(sample_top_p(probs, 0.9).item()) - - print(f"calibration data:\n{sp_model.decode(token_list)}") - - -def _batch_prefill_calibrate( - example_inputs, - user_prompts, - module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", - max_seq_len=512, -): - sp_model = SentencePieceProcessor(model_file=tokenizer_model_path) - _, atten_mask = example_inputs - max_cache_len = max_seq_len - 1 - - # TODO: change criteria & support batch inputs if necessary - token_list = sp_model.encode(user_prompts, bos=True, eos=False) - token_list = torch.tensor(token_list)[:max_cache_len].reshape(1, -1) - last_prompt_pos = token_list.numel() - if last_prompt_pos < max_cache_len: - token_list = torch.cat( - [ - token_list, - torch.zeros((1, max_cache_len - last_prompt_pos), dtype=torch.int32), - ], - dim=1, - ) - else: - token_list = token_list[:, :max_cache_len] - - with torch.no_grad(): - logits, new_k_caches, new_v_caches = module( - token_list, - atten_mask, - ) - predict = [torch.argmax(logits[:, last_prompt_pos - 1], dim=-1).item()] - - print(f"calibration data:\n{sp_model.decode(predict)}") - - -def calibrate( - example_inputs, - user_prompts, - module: torch.fx.GraphModule, - tokenizer_model_path="tokenizer.model", - max_seq_len=512, -): - if len(example_inputs) == 2: - _batch_prefill_calibrate( - example_inputs, - user_prompts, - module, - tokenizer_model_path, - 
max_seq_len, - ) - elif len(example_inputs) == 5: - _kv_calibrate( - example_inputs, - user_prompts, - module, - tokenizer_model_path, - max_seq_len, - ) - else: - raise RuntimeError("Get wrong inputs") - - -class SingleLlama: - def __init__(self, llama_model) -> None: - super().__init__() - self.llama_model = llama_model - self.quant_dtype = None - self.llama_meta = self.llama_model.get_metadata() - self.has_quant_io = False - if self.llama_meta["get_use_kv_cache"]: - tokens, atten_mask, pos_ids, k_caches, v_caches = self.get_example_inputs( - use_kv_cache=True - ) - self.inputs = (tokens, atten_mask, pos_ids, *k_caches, *v_caches) - else: - tokens, atten_mask = self.get_example_inputs(use_kv_cache=False) - self.inputs = (tokens, atten_mask) - - def _tag_kv_ios(self, gm: torch.fx.GraphModule, kv_type): - if not self.has_quant_io: - return - - # shape of k caches and v caches - input_cache_shape = { - (self.llama_meta["get_head_dim"], self.llama_meta["get_max_seq_len"]), - (self.llama_meta["get_max_seq_len"], self.llama_meta["get_head_dim"]), - } - for n in gm.graph.nodes: - if ( - n.op == "placeholder" - and len(users := list(n.users)) == 1 - and users[0].meta["val"].size()[-2:] in input_cache_shape - ): - n.meta[QCOM_QUANTIZED_IO] = kv_type - elif n.op == "output": - for a in n.args[0]: - # single head, kv mode - if ( - a.meta["val"].flatten().size()[0] - == self.llama_meta["get_head_dim"] - ): - a.meta[QCOM_QUANTIZED_IO] = kv_type - # single head, batch_prefill mode - elif a.meta["val"].flatten().size()[0] == self.llama_meta[ - "get_head_dim" - ] * (self.llama_meta["get_max_seq_len"] - 1): - a.meta[QCOM_QUANTIZED_IO] = kv_type - - def quantize(self, quant_dtype, args, custom_annotations=()): - self.quant_dtype = quant_dtype - quantizer = make_quantizer( - quant_dtype=quant_dtype, - per_channel_conv=True, - per_channel_linear=True, - act_observer=MinMaxObserver, - ) - quantizer.add_custom_quant_annotations(custom_annotations) - - self.has_quant_io = True - fx_graph_module = None - - with torch.no_grad(): - fx_graph_module = torch.export.export( - self.llama_model, self.inputs, strict=True - ).module() - fx_graph_module = prepare_pt2e(fx_graph_module, quantizer) - print("Quantizing the model...") - - calibrate( - self.get_example_inputs(self.llama_meta["get_use_kv_cache"]), - args.prompt, - fx_graph_module, - tokenizer_model_path=args.tokenizer_model, - max_seq_len=args.seq_len, - ) - - self.llama_model = convert_pt2e(fx_graph_module) - - def lowering_modules( - self, work_space, kv_type=torch.uint8, soc_model=QcomChipset.SM8650 - ): - executorch_config = ExecutorchBackendConfig( - passes=[ - BuildQuantIo(), - ], - # For shared buffer, user must pass the memory address - # which is allocated by RPC memory to executor runner. - # Therefore, won't want to pre-allocate - # by memory manager in runtime. 
- memory_planning_pass=MemoryPlanningPass( - alloc_graph_input=False, - alloc_graph_output=False, - ), - extract_delegate_segments=True, - ) - with torch.no_grad(): - # backend option - backend_options = generate_htp_compiler_spec(use_fp16=False) - compiler_specs = generate_qnn_executorch_compiler_spec( - soc_model=soc_model, - backend_options=backend_options, - shared_buffer=True, - ) - partitioner = QnnPartitioner(compiler_specs) - edge_prog = capture_program(self.llama_model, self.inputs) - self._tag_kv_ios(edge_prog.exported_program.graph_module, kv_type=kv_type) - edge_prog_mgr = EdgeProgramManager( - edge_programs={"forward": edge_prog.exported_program}, - constant_methods=self.llama_meta, - compile_config=EdgeCompileConfig(_check_ir_validity=False), - ) - edge_prog_mgr = edge_prog_mgr.to_backend(partitioner) - exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config) - with open(f"{work_space}/{pte_filename}.pte", "wb") as file: - exec_prog_mgr.write_to_file(file) - - def get_example_inputs(self, use_kv_cache=True): - return self.llama_model.get_example_inputs(use_kv_cache) - - -def compile(args): - os.makedirs(args.artifact, exist_ok=True) - start_ts = time.time() - - if args.model_mode == "kv": - use_kv_cache = output_new_cache_only = True - elif args.model_mode == "batch_prefill" or args.model_mode == "hybrid": - raise NotImplementedError( - f"model_mode {args.model_mode} is not implemented yet." - ) - else: - raise RuntimeError(f"No such model_mode {args.model_mode}.") - - with open(args.params) as f: - config = ModelArgs(**json.load(f)) - # TODO: support batch inputs if necessary - config.max_batch_size = 1 - config.max_seq_len = args.seq_len - config.use_kv_cache = use_kv_cache - state_dict = torch.load( - args.checkpoint, weights_only=True, map_location="cpu", mmap=True - ) - end_load_ts = time.time() - print("torch.load checkpoint", end_load_ts - start_ts) - - llama_instance = None - with torch.device("meta"): - llama_instance = LlamaModel(config, output_new_cache_only=output_new_cache_only) - if "model" in state_dict: - state_dict = state_dict["model"] - llama_instance.load_state_dict( - state_dict, - strict=False, - assign=True, - ) - end_load_state_dict_ts = time.time() - print("instance.load_state_dict", end_load_state_dict_ts - end_load_ts) - - for layer in llama_instance.layers: - if getattr(layer.attention, "prepare_sha", None): - layer.attention.prepare_sha() - - kv_type = torch.uint8 - assert args.ptq in [ - "8a8w", - "16a4w", - ], f"No support for quant type {args.ptq}. Support 8a8w and 16a4w." 
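The removed `SingleLlama.quantize()` above follows the standard PT2E recipe: export the eager model, attach the QNN quantizer (plus any custom annotations), run calibration prompts through the prepared module, then convert. A minimal sketch of that flow is shown below; the import paths and the `model` / `example_inputs` / `calibration_inputs` names are assumptions for illustration, not part of this change.

```python
# Sketch of the PT2E quantization flow used by the removed llama2 script.
# Import locations are assumptions; `model`, `example_inputs`, and
# `calibration_inputs` are placeholders supplied by the caller.
import torch
from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype  # assumed path
from executorch.examples.qualcomm.utils import make_quantizer            # assumed path
from torch.ao.quantization.observer import MinMaxObserver
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

quantizer = make_quantizer(
    quant_dtype=QuantDtype.use_16a4w,  # 16-bit activations, 4-bit weights
    per_channel_conv=True,
    per_channel_linear=True,
    act_observer=MinMaxObserver,
)

with torch.no_grad():
    # Export to an FX module, insert observers, calibrate, then convert.
    fx_module = torch.export.export(model, example_inputs, strict=True).module()
    fx_module = prepare_pt2e(fx_module, quantizer)
    fx_module(*calibration_inputs)                  # calibration forward pass(es)
    quantized_module = convert_pt2e(fx_module)      # fold observers into q/dq ops
```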
- quant_dtype = getattr(QuantDtype, f"use_{args.ptq}") - assert args.tokenizer_model is not None, "Need tokenizer model for calibration" - - if args.dtype_override is not None: - dtype_override = DType[args.dtype_override] - llama_instance = llama_instance.to(dtype_override.to_torch_dtype()) - - llama_instance = convert_linear_to_conv2d(llama_instance) - single_llama = SingleLlama(llama_instance.eval()) - - start_quantize_ts = time.time() - single_llama.quantize( - quant_dtype, - args=args, - custom_annotations=( - annotate_matmul_16a8w, - annotate_linear_16a8w_in_affine_layer, - ), - ) - end_quantize_ts = time.time() - print("single_llama.quantize(quant_dtype)", end_quantize_ts - start_quantize_ts) - single_llama.lowering_modules( - args.artifact, kv_type=kv_type, soc_model=get_soc_to_chipset_map()[args.model] - ) - end_lowering_ts = time.time() - print("Complete Compile", end_lowering_ts - end_quantize_ts) - - -def inference(args, pre_gen_pte=""): - workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" - - if args.model_mode != "kv": - raise NotImplementedError( - f"model_mode {args.model_mode} is not implemented yet." - ) - - assert args.tokenizer_bin is not None, "Need tokenizer model for interence" - runner_args = " ".join( - [ - f"--model_path {pte_filename}.pte", - "--output_folder_path outputs", - f"--tokenizer_path {os.path.basename(args.tokenizer_bin)}", - f'--prompt "{args.prompt}"', - f"--seq_len {args.seq_len}", - f"--temperature {args.temperature}", - ] - ) - runner_cmd = " ".join( - [ - f"cd {workspace} &&", - f"./qnn_llama_runner {runner_args}", - ] - ) - - pte_path = ( - f"{pre_gen_pte}/{pte_filename}.pte" - if pre_gen_pte - else f"{args.artifact}/{pte_filename}.pte" - ) - adb = SimpleADB( - qnn_sdk=os.getenv("QNN_SDK_ROOT"), - build_path=f"{args.build_folder}", - pte_path=pte_path, - workspace=workspace, - device_id=args.device, - host_id=args.host, - soc_model=args.model, - shared_buffer=args.shared_buffer, - runner="examples/qualcomm/oss_scripts/llama2/qnn_llama_runner", - ) - # No pregen inputs, input_list is not required - adb.push(inputs=[], input_list="", files=[args.tokenizer_bin]) - adb.execute(custom_runner_cmd=runner_cmd) - - # collect output data - output_data_folder = f"{args.artifact}/outputs" - make_output_dir(output_data_folder) - outputs = [] - - def post_process(): - for f in sorted( - os.listdir(output_data_folder), key=lambda f: int(f.split("_")[1]) - ): - with codecs.open( - os.path.join(output_data_folder, f), - "r", - encoding="utf-8", - errors="replace", - ) as fdata: - outputs.append(fdata.read()) - - adb.pull(output_path=args.artifact, callback=post_process) - - if args.ip and args.port != -1: - with Client((args.ip, args.port)) as conn: - conn.send( - json.dumps( - { - "result": outputs, - } - ) - ) - else: - for idx, output in enumerate(outputs): - print(f"Results[{idx}]:\n{output}") - - -def main(): - parser = setup_common_args_and_variables() - parser.add_argument( - "-a", - "--artifact", - help="path for storing generated artifacts and output by this example. Default ./llama2_qnn", - default="./llama2_qnn", - type=str, - ) - - parser.add_argument( - "-P", - "--ptq", - help="If specified, will do PTQ quantization. default is 16bits activation and 4bits weight. 
Support 8a8w and 16a4w.", - default="16a4w", - ) - - parser.add_argument( - "--checkpoint", - help="Pass llama2 checkpoint.", - required=True, - type=str, - ) - - parser.add_argument( - "--params", - help="Pass llama2 params json file.", - required=True, - type=str, - ) - - parser.add_argument( - "--tokenizer_bin", - help="Pass llama2 tokenizer binary.", - required=False, - type=str, - ) - - parser.add_argument( - "--tokenizer_model", - help="Pass llama2 tokenizer model.", - type=str, - default=None, - ) - - parser.add_argument( - "--prompt", - help="User prompts for llama2.", - required=True, - type=str, - ) - - parser.add_argument( - "--seq_len", - help="Ouput sequence length for llama2.", - default=128, - type=int, - ) - - parser.add_argument( - "--temperature", - help="Sampling temperature for llama2.", - default=0.8, - type=float, - ) - - parser.add_argument( - "-d", - "--dtype-override", - default="fp32", - type=str, - choices=["fp32", "fp16"], - help="Override the dtype of the model (default is the checkpoint dtype). Options: fp32", - ) - - parser.add_argument( - "--pre_gen_pte", - help="Run the Pre-generated llama2 in the given directory", - type=str, - ) - - parser.add_argument( - "--num_sharding", - type=int, - default=0, - help="Specify the number of splits by inserting the fallback custom op. The graph will be split evenly by layers.", - ) - - parser.add_argument( - "--model_mode", - help="Export and inference batch_prefill mode, kv mode or hybrid(TBD) mode", - default="kv", - choices=["batch_prefill", "kv", "hybrid"], - type=str, - ) - - args = parser.parse_args() - if args.compile_only and args.pre_gen_pte: - exit("Cannot set both compile_only and pre_gen_pte as true") - - if args.pre_gen_pte: - inference(args, args.pre_gen_pte) - exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") - - if args.compile_only: - compile(args) - exit(f"Finish compile_only and save to {args.artifact}") - - try: - compile(args) - inference(args) - except Exception as e: - if args.ip and args.port != -1: - with Client((args.ip, args.port)) as conn: - conn.send(json.dumps({"Error": str(e)})) - else: - raise Exception(e) - - -# flake8: noqa: C901 -if __name__ == "__main__": - main() diff --git a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp deleted file mode 100644 index 1e46f919dca..00000000000 --- a/examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/** - * @file - * - * This tool can run ExecuTorch model files with Qualcomm AI Engine Direct. - * - * User could specify arguments like desired prompt, temperature, etc. - */ - -#include -#include -#include - -#include - -#include -#include - -DEFINE_string( - model_path, - "qnn_llama2.pte", - "Model serialized in flatbuffer format."); - -DEFINE_string( - output_folder_path, - "outputs", - "Executorch inference data output path."); - -DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff."); - -DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt."); - -DEFINE_double( - temperature, - 0.8f, - "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). 
Lower temperature = more deterministic"); - -DEFINE_int32( - seq_len, - 128, - "Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens."); - -using executorch::runtime::Error; -using executorch::runtime::MemoryAllocator; -using executorch::runtime::MethodMeta; -using executorch::runtime::Result; - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - - const char* tokenizer_path = FLAGS_tokenizer_path.c_str(); - const char* prompt = FLAGS_prompt.c_str(); - double temperature = FLAGS_temperature; - int32_t seq_len = FLAGS_seq_len; - - // create llama runner - example::Runner runner(FLAGS_model_path, tokenizer_path, temperature); - ET_CHECK_MSG(runner.load() == Error::Ok, "Runner failed to load method"); - - // MethodMeta describes the memory requirements of the method. - Result method_meta = runner.get_method_meta(); - ET_CHECK_MSG( - method_meta.ok(), - "Failed to get method_meta 0x%x", - (unsigned int)method_meta.error()); - ET_CHECK_MSG( - runner.mem_alloc(MemoryAllocator::kDefaultAlignment, seq_len) == - Error::Ok, - "Runner failed to allocate memory"); - - // generate tokens - std::string inference_output; - // prompt are determined by command line arguments - // pos_ids, atten_mask are infered inside runner - runner.generate(prompt, seq_len, [&](const std::string& piece) { - inference_output += piece; - }); - - size_t inference_index = 0; - auto output_file_name = FLAGS_output_folder_path + "/output_" + - std::to_string(inference_index++) + "_0.raw"; - std::ofstream fout(output_file_name.c_str()); - fout << inference_output; - fout.close(); - - return 0; -} diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp deleted file mode 100644 index 3f055127324..00000000000 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.cpp +++ /dev/null @@ -1,671 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// A simple llama2 runner that includes preprocessing and post processing logic. -// The module takes in a string as input and emits a string as output. - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -using executorch::aten::ScalarType; -using executorch::aten::SizesType; -using executorch::aten::Tensor; -using executorch::extension::from_blob; -using executorch::extension::Module; -using executorch::extension::TensorPtr; -using executorch::extension::llm::BPETokenizer; -using executorch::extension::llm::Sampler; -using executorch::extension::llm::time_in_ms; -using executorch::runtime::Error; -using executorch::runtime::EValue; -using executorch::runtime::MethodMeta; -using executorch::runtime::Result; -using executorch::runtime::TensorInfo; - -// TODO: Remove this usage of an internal-only function. 
-using executorch::runtime::internal::set_tensor_data; - -namespace example { - -namespace { -static constexpr auto kTopp = 0.9f; -void printReport(const Runner::Stats& stats); -std::string statsToJsonString(const Runner::Stats& stats); -} // namespace - -Runner::Runner( - const std::string& model_path, - const std::string& tokenizer_path, - const float temperature) - : module_(std::make_unique( - model_path, - Module::LoadMode::MmapUseMlockIgnoreErrors)), - tokenizer_path_(tokenizer_path), - model_path_(model_path), - temperature_(temperature) { - ET_LOG( - Info, - "Creating LLaMa runner: model_path=%s, tokenizer_path=%s", - model_path.c_str(), - tokenizer_path.c_str()); -} - -bool Runner::is_loaded() const { - return module_->is_loaded() && tokenizer_ && sampler_; -} - -Error Runner::load() { - if (is_loaded()) { - return Error::Ok; - } - stats_.model_load_start_ms = time_in_ms(); - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); - - // Read out metadata from the model - ET_LOG(Info, "Reading metadata from model"); - const auto method_names = module_->method_names(); - ET_CHECK_MSG(method_names.ok(), "Failed to read method names from model"); - model_methods_ = method_names.get(); - vocab_size_ = getMetadataHelper("get_vocab_size", 32000); - bos_id_ = getMetadataHelper("get_bos_id", 1); - eos_id_ = getMetadataHelper("get_eos_id", 2); - n_bos_ = getMetadataHelper("get_n_bos", 1); - n_eos_ = getMetadataHelper("get_n_eos", 1); - max_seq_len_ = getMetadataHelper("get_max_seq_len", 128); - head_dim_ = getMetadataHelper("get_head_dim", 32); - dim_ = getMetadataHelper("get_dim", 4096); - - // Load tokenizer - tokenizer_ = std::make_unique(); - tokenizer_->load(tokenizer_path_); - if (tokenizer_->bos_tok() != bos_id_) { - ET_LOG( - Error, - "Tokenizer's BOS id %lu does not match model's BOS id %ld, will override tokenizer's BOS.", - tokenizer_->bos_tok(), - bos_id_); - } - if (tokenizer_->eos_tok() != eos_id_) { - ET_LOG( - Error, - "Tokenizer's EOS id %lu does not match model's EOS id %ld, will override tokenizer's EOS.", - tokenizer_->eos_tok(), - eos_id_); - } - // Create sampler - sampler_ = std::make_unique( - vocab_size_, - temperature_, - kTopp, - static_cast(std::time(nullptr))); - stats_.model_load_end_ms = time_in_ms(); - - return Error::Ok; -} - -template -T Runner::getMetadataHelper(std::string method_name, T default_val) { - T res = default_val; - if (model_methods_.count(method_name)) { - Result> outputs = module_->execute(method_name); - if (outputs.ok()) { - std::vector outs = outputs.get(); - if (outs.size() > 0) { - res = outs[0].to(); - } - } - } else { - ET_LOG( - Info, - "The model does not contain %s method, using default value %lld", - method_name.c_str(), - (long long)default_val); - } - ET_LOG(Info, "%s: %lld", method_name.c_str(), (long long)res); - return res; -} - -template -int32_t Runner::logitsToToken(const Tensor& logits_tensor) { - T* logits = logits_tensor.mutable_data_ptr(); - - // Since the logits are for all tokens, get the last token probabilities - T* logits_last = logits; - return sampler_->sample(logits_last); -} - -// Given an input token. Set up the inputs for the model and execute a single -// step. Returning the logits tensor. 
-Result Runner::run_model_step( - int64_t input_token, - TensorPtr& token, - TensorPtr& atten_mask, - TensorPtr& start_pos, - std::vector& kv_tensors, - std::vector& kv_outputs) { - token->mutable_data_ptr()[0] = input_token; - - // inputs:[tokens, start_pos, atten_mask, k_cache, v_cache] - std::vector inputs = { - token, atten_mask, start_pos}; - inputs.insert(inputs.end(), kv_tensors.begin(), kv_tensors.end()); - auto outputs_res = module_->forward(inputs); - ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error()); - - // TODO: need to handle batch size != 1 - size_t v_offset = kv_outputs[0]->nbytes(); - size_t el_size = kv_outputs[0]->element_size(); - size_t k_input_step = (max_seq_len_ - 1) * el_size; - int k_tensors_end = kv_tensors.size() / 2; - // update k caches - for (int j = 0; j < k_tensors_end; ++j) { - uint8_t* input_addr = - static_cast(kv_tensors[j]->mutable_data_ptr()); - uint8_t* output_addr = - static_cast(kv_outputs[j]->mutable_data_ptr()); - // fill the output k values back - for (int src = 0, dst = k_input_step; src < kv_outputs[j]->nbytes(); - src += el_size, dst += k_input_step) { - input_addr[dst] = output_addr[src]; - } - char* new_inp_addr = io_mem_mgr_.update_k_caches_read(j, el_size); - // inputs - ET_CHECK_MSG( - set_tensor_data( - *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, - "Failed to set input tensor when updating k_cache"); - } - // update v caches - for (int j = k_tensors_end, v_idx = 0; j < kv_tensors.size(); ++j, ++v_idx) { - // inputs - char* new_inp_addr = io_mem_mgr_.update_v_caches_read(v_idx, v_offset); - - ET_CHECK_MSG( - set_tensor_data( - *kv_tensors[j], new_inp_addr, kv_tensors[j]->nbytes()) == Error::Ok, - "Failed to set input tensor when updating v_cache"); - // outputs - char* new_out_addr = io_mem_mgr_.update_v_caches_write(v_idx, v_offset); - ET_CHECK_MSG( - set_tensor_data( - *kv_outputs[j], new_out_addr, kv_outputs[j]->nbytes()) == Error::Ok, - "Failed to set output tensor when updating v_cache"); - ET_CHECK_MSG( - module_->set_output(*kv_outputs[j], j + 1) == Error::Ok, - "Failed to set llama output data pointer"); - } - - // Bump start_pos by 1 - start_pos->mutable_data_ptr()[0]++; - - // update atten_mask - atten_mask->mutable_data_ptr() - [atten_mask->numel() - 1 - start_pos->const_data_ptr()[0]] = 0; - return outputs_res.get()[0].toTensor(); -} -// TODO: add overloaded method for on-device tokenize -Error Runner::generate( - const std::string& prompt, - int32_t seq_len, - std::function token_callback, - std::function stats_callback) { - ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); - ET_CHECK_MSG(is_loaded(), "Please invoke load method first"); - - // First token time only measures the time it takes to encode the prompt and - // return a response token. - stats_.inference_start_ms = time_in_ms(); - shouldStop_ = false; - - // Set the sequence length to the max seq length if not provided - seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? 
seq_len : max_seq_len_; - - Result> encode_res = - tokenizer_->encode(prompt, n_bos_, 0); - - ET_CHECK_OK_OR_RETURN_ERROR( - encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); - - // encode the (string) prompt into tokens sequence - std::vector prompt_tokens = encode_res.get(); - int num_prompt_tokens = prompt_tokens.size(); - - ET_CHECK_MSG( - num_prompt_tokens < max_seq_len_, - "Max seq length exceeded - please increase max seq len value in static_llama.py"); - - ET_CHECK_MSG( - num_prompt_tokens < seq_len, - "Sequence length exceeded - please increase the seq_len value passed to generate()"); - - int32_t pos = 0, prev_token, cur_token = prompt_tokens[0]; - std::vector token_shape = {1, 1}; - - io_mem_mgr_.get_input_token_ptr()[0] = 0; - std::vector start_pos_shape = {1, 1}; - - float* atten_mask_ptr = - reinterpret_cast(io_mem_mgr_.get_atten_mask_ptr()); - std::fill(atten_mask_ptr, atten_mask_ptr + max_seq_len_, -255); - atten_mask_ptr[max_seq_len_ - 1] = 0; - - std::vector atten_mask_shape = {1, max_seq_len_}; - - std::vector logits_data_shape = {1, vocab_size_}; - - std::vector hidden_states_data_shape = {1, 1, dim_}; - - // initialize tensor wrappers - auto token = from_blob( - io_mem_mgr_.get_input_token_ptr(), token_shape, ScalarType::Int); - auto start_pos = from_blob( - io_mem_mgr_.get_pos_idx_ptr(), start_pos_shape, ScalarType::Int); - auto atten_mask = from_blob( - io_mem_mgr_.get_atten_mask_ptr(), atten_mask_shape, ScalarType::Float); - - std::vector kv_tensors, kv_outputs; - - Result method_meta = get_method_meta(); - size_t num_inputs = method_meta->num_inputs(); - int k_caches_num = (num_inputs - 3) / 2; - - // TODO: need to handle batch size != 1 - // k caches init - for (int input_index = 3, i = 0; input_index < k_caches_num + 3; - ++input_index, ++i) { - // inputs - Result tensor_meta = - method_meta->input_tensor_meta(input_index); - - auto tensor_shape = tensor_meta->sizes(); - std::vector sizes( - tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); - kv_tensors.emplace_back(from_blob( - io_mem_mgr_.get_k_caches_read_ptr(i), - sizes, - tensor_meta->scalar_type())); - - // outpus - Result out_tensor_meta = method_meta->output_tensor_meta(i + 1); - tensor_shape = out_tensor_meta->sizes(); - sizes = std::vector{ - tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; - kv_outputs.emplace_back(from_blob( - io_mem_mgr_.get_k_caches_write_ptr(i), - sizes, - kv_tensors.back()->scalar_type())); - ET_CHECK_MSG( - module_->set_output(kv_outputs.back(), i + 1) == Error::Ok, - "Failed to set output tensor for kv cache"); - } - - // v caches init - for (int i = 0, input_index = k_caches_num + 3; input_index < num_inputs; - ++input_index, ++i) { - int output_index = i + k_caches_num + 1; - // inputs - Result tensor_meta = - method_meta->input_tensor_meta(input_index); - auto tensor_shape = tensor_meta->sizes(); - std::vector sizes( - tensor_shape.data(), tensor_shape.data() + tensor_shape.size()); - - kv_tensors.emplace_back(from_blob( - io_mem_mgr_.get_v_caches_read_ptr(i), - sizes, - tensor_meta->scalar_type())); - - // outputs - Result out_tensor_meta = - method_meta->output_tensor_meta(output_index); - tensor_shape = out_tensor_meta->sizes(); - sizes = std::vector{ - tensor_shape.data(), tensor_shape.data() + tensor_shape.size()}; - - kv_outputs.push_back(from_blob( - io_mem_mgr_.get_v_caches_write_ptr(i), - sizes, - kv_tensors.back()->scalar_type())); - ET_CHECK_MSG( - module_->set_output(kv_outputs.back(), output_index) == Error::Ok, - 
"Failed to set output tensor for llama block"); - } - - auto affine_logits = from_blob( - reinterpret_cast(io_mem_mgr_.get_logit_ptr()), - logits_data_shape, - ScalarType::Float); - ET_CHECK_MSG( - module_->set_output(affine_logits) == Error::Ok, - "Failed to set output tensor for affine module - logits"); - - // Start consuming user's prompts and generating new tokens - std::string final_output; - while (pos < seq_len - 1) { - // Run the model - auto logits_res = run_model_step( - cur_token, token, atten_mask, start_pos, kv_tensors, kv_outputs); - if (pos == num_prompt_tokens) { - stats_.first_token_ms = time_in_ms(); - } else if (pos == num_prompt_tokens - 1) { - stats_.prompt_eval_end_ms = time_in_ms(); - } - - ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); - Tensor& logits_tensor = logits_res.get(); - prev_token = cur_token; - long sample_start_time_ms = time_in_ms(); - - cur_token = logitsToToken(logits_tensor); - stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; - - // advance the state machine - if (pos < num_prompt_tokens - 1) { - // prefill, force the next token to be the next prompt token - cur_token = prompt_tokens[pos + 1]; - } - pos++; - - // print the token as string, decode it with the Tokenizer object - auto piece_res = tokenizer_->decode(prev_token, cur_token); - ET_CHECK(piece_res.ok()); - - if (token_callback) { - token_callback(piece_res.get()); - } - - if (shouldStop_) { - break; - } - - // data-dependent terminating condition: we have n_eos_ number of EOS - if (pos >= num_prompt_tokens && cur_token == eos_id_) { - ET_LOG(Info, "Reached to the end of generation"); - break; - } - } - stats_.inference_end_ms = time_in_ms(); - - if (pos == seq_len) { - ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); - } - - stats_.num_prompt_tokens = num_prompt_tokens; - stats_.num_generated_tokens = pos - num_prompt_tokens; - printReport(stats_); - if (stats_callback) { - stats_callback(stats_); - } - - return Error::Ok; -} - -namespace { -void printReport(const Runner::Stats& stats) { - printf("PyTorchObserver %s\n", statsToJsonString(stats).c_str()); - - ET_LOG( - Info, - "\tPrompt Tokens: %" PRIu64 " Generated Tokens: %" PRIu64, - stats.num_prompt_tokens, - stats.num_generated_tokens); - - ET_LOG( - Info, - "\tModel Load Time:\t\t%f (seconds)", - ((double)(stats.model_load_end_ms - stats.model_load_start_ms) / - stats.SCALING_FACTOR_UNITS_PER_SECOND)); - double inference_time_ms = - (double)(stats.inference_end_ms - stats.inference_start_ms); - ET_LOG( - Info, - "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND, - - (stats.num_generated_tokens) / - (double)(stats.inference_end_ms - stats.inference_start_ms) * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - double prompt_eval_time = - (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); - ET_LOG( - Info, - "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, - (stats.num_prompt_tokens) / prompt_eval_time * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - - double eval_time = - (double)(stats.inference_end_ms - stats.prompt_eval_end_ms); - ET_LOG( - Info, - "\t\tGenerated %" PRIu64 - " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", - stats.num_generated_tokens, - eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, - stats.num_generated_tokens / eval_time * - stats.SCALING_FACTOR_UNITS_PER_SECOND); - - // Time to first token is 
measured from the start of inference, excluding - // model load time. - ET_LOG( - Info, - "\tTime to first generated token:\t%f (seconds)", - ((double)(stats.first_token_ms - stats.inference_start_ms) / - stats.SCALING_FACTOR_UNITS_PER_SECOND)); - - ET_LOG( - Info, - "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)", - stats.num_prompt_tokens + stats.num_generated_tokens, - (double)stats.aggregate_sampling_time_ms / - stats.SCALING_FACTOR_UNITS_PER_SECOND); -} - -std::string statsToJsonString(const Runner::Stats& stats) { - std::stringstream ss; - ss << "{\"prompt_tokens\":" << stats.num_prompt_tokens << "," - << "\"generated_tokens\":" << stats.num_generated_tokens << "," - << "\"model_load_start_ms\":" << stats.model_load_start_ms << "," - << "\"model_load_end_ms\":" << stats.model_load_end_ms << "," - << "\"inference_start_ms\":" << stats.inference_start_ms << "," - << "\"inference_end_ms\":" << stats.inference_end_ms << "," - << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << "," - << "\"first_token_ms\":" << stats.first_token_ms << "," - << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms - << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":" - << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}"; - return ss.str(); -} -} // namespace - -IoMemMgr::IoMemMgr(MethodMeta method_meta) { - method_meta_ = std::make_unique(method_meta); - init_io_info(); - compute_total_nbytes(); -} - -void IoMemMgr::init_io_info() { - set_tensor_meta(); - for (auto info : io_info_.tensor_info) { - info->size = info->tensor_meta->nbytes(); - info->rank = info->tensor_meta->sizes().size(); - info->shape.resize(info->rank); - for (int i = 0; i < info->rank; i++) { - info->shape[i] = - static_cast(info->tensor_meta->sizes().data()[i]); - } - info->dtype = info->tensor_meta->scalar_type(); - info->element_size = scalar_type_to_size[info->tensor_meta->scalar_type()]; - } -}; - -void IoMemMgr::set_tensor_meta() { - io_info_.input_token.tensor_meta = - std::make_unique(method_meta_->input_tensor_meta(0).get()); - io_info_.atten_mask.tensor_meta = - std::make_unique(method_meta_->input_tensor_meta(1).get()); - io_info_.pos_idx.tensor_meta = - std::make_unique(method_meta_->input_tensor_meta(2).get()); - - io_info_.k_caches_read.tensor_meta = - std::make_unique(method_meta_->input_tensor_meta(3).get()); - io_info_.k_caches_write.tensor_meta = - std::make_unique(method_meta_->output_tensor_meta(1).get()); - - io_info_.v_caches_read.tensor_meta = std::make_unique( - method_meta_->input_tensor_meta(method_meta_->num_inputs() - 1).get()); - io_info_.v_caches_write.tensor_meta = std::make_unique( - method_meta_->output_tensor_meta(method_meta_->num_outputs() - 1).get()); - - io_info_.logit.tensor_meta = - std::make_unique(method_meta_->output_tensor_meta(0).get()); -} - -void IoMemMgr::compute_total_nbytes() { - total_nbytes_ = io_info_.input_token.size + io_info_.pos_idx.size + - io_info_.atten_mask.size + io_info_.logit.size; - size_t num_heads = (method_meta_->num_inputs() - 3) / 2; - - // To update v cache via shifting pointer, v caches need a buffer with size - // of (max_seq_len_ - 1) * head_dim_. 
It is equivalent to one more cache - size_t num_v_cache = num_heads + 1; - // To update v cache via shifting pointer, k buffer need the size of - // max_seq_len - 1 - size_t k_buffer = io_info_.k_caches_read.size / io_info_.k_caches_write.size; - - // k_caches_read need a buffer with size of head_dim_ - total_nbytes_ += num_heads * io_info_.k_caches_read.size + k_buffer; - total_nbytes_ += num_heads * io_info_.k_caches_write.size; - total_nbytes_ += num_v_cache * io_info_.v_caches_read.size; - // Add a head dim size for the convinience of shifting ptr from the last - // non-used v cache write - total_nbytes_ += io_info_.v_caches_write.size; -} - -bool IoMemMgr::init_tensors() { - size_t cur_pos = input_token_pos_; - pos_idx_pos_ = cur_pos += io_info_.input_token.size; - atten_mask_pos_ = cur_pos += io_info_.pos_idx.size; - logit_pos_ = cur_pos += io_info_.atten_mask.size; - set_input_token_ptr(); - set_pos_idx_ptr(); - set_atten_mask_ptr(); - set_logit_ptr(); - - // set start point of kv caches - cur_pos += io_info_.logit.size; - - size_t num_heads = (method_meta_->num_inputs() - 3) / 2; - k_caches_read_pos_.resize(num_heads); - k_caches_write_pos_.resize(num_heads); - v_caches_read_pos_.resize(num_heads); - v_caches_write_pos_.resize(num_heads); - - for (int i = 0; i < num_heads; i++) { - set_k_caches_read(i, cur_pos); - cur_pos += io_info_.k_caches_read.size; - } - // add a size of k caches buffer - cur_pos += io_info_.k_caches_read.size / io_info_.k_caches_write.size; - for (int i = 0; i < num_heads; i++) { - set_k_caches_write(i, cur_pos); - cur_pos += io_info_.k_caches_write.size; - } - - for (int i = 0; i < num_heads; i++) { - set_v_caches_read(i, cur_pos); - set_v_caches_write(i, cur_pos + io_info_.v_caches_read.size); - cur_pos += io_info_.v_caches_read.size; - } - // add a caches as the b caches buffer - cur_pos += io_info_.v_caches_read.size; - return cur_pos <= total_nbytes_; -} - -void IoMemMgr::set_all_shifted_ptrs(size_t seq_len) { - auto iter_setter = [&](std::vector& cache, - size_t shift_size, - InfoAttrs& tensor_info) { - for (int i = 0; i < cache.size(); ++i) { - size_t pos = cache[i] + shift_size; - CustomMemTensorInfo info = { - ptr_, - ptr_ + pos, - pos, - tensor_info.size, - tensor_info.shape.data(), - tensor_info.rank, - tensor_info.dtype}; - QnnExecuTorchAddCustomMemTensorInfo(info); - } - }; - for (int i = 0; i < seq_len; ++i) { - iter_setter( - k_caches_read_pos_, - i * io_info_.k_caches_read.element_size, - io_info_.k_caches_read); - iter_setter( - v_caches_read_pos_, - i * io_info_.v_caches_write.size, - io_info_.v_caches_read); - iter_setter( - v_caches_write_pos_, - i * io_info_.v_caches_write.size, - io_info_.v_caches_write); - } -} - -void Runner::stop() { - shouldStop_ = true; -} - -Result Runner::get_method_meta() { - return module_->method_meta("forward"); -} - -Error Runner::mem_alloc(size_t alignment, size_t seq_len) { - Result method_meta_result = get_method_meta(); - io_mem_mgr_ = IoMemMgr(method_meta_result.get()); - ET_CHECK_MSG( - io_mem_mgr_.allocate(alignment), - "IoMemMgr failed to allocate custom memory"); - - ET_CHECK_MSG( - io_mem_mgr_.init_tensors(), - "IoMemMgr required more bytes than allocated bytes"); - - io_mem_mgr_.set_all_shifted_ptrs(seq_len); - // To register rpc_mem_handle from SharedBuffer - // Reset and re-init again to trigger registered function - module_.reset(); - module_ = std::make_unique( - model_path_, Module::LoadMode::MmapUseMlockIgnoreErrors); - ET_CHECK_MSG(load() == Error::Ok, "Runner failed to load method"); - - 
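The pointer arithmetic in `IoMemMgr::set_all_shifted_ptrs` and `Runner::run_model_step` above implements, by shifting read/write offsets inside one shared buffer, conceptually the same per-step cache update that the removed Python calibration helper expresses with `torch.cat`: drop the oldest position and append the K/V slice produced for the current token. A small PyTorch restatement, assuming the cache layouts used by the deleted `_kv_calibrate()` (K: `[batch, head_dim, seq]`, V: `[batch, seq, head_dim]`) and an illustrative function name:

```python
# Functional restatement of the per-step KV-cache update that the deleted C++
# runner performs by shifting read/write pointers inside the shared buffer.
import torch

def shift_and_append(k_caches, v_caches, new_k_caches, new_v_caches):
    # Drop the oldest cached position and append the newly produced K/V.
    k_caches = [
        torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1)
        for i, k_cache in enumerate(k_caches)
    ]
    v_caches = [
        torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1)
        for i, v_cache in enumerate(v_caches)
    ]
    return k_caches, v_caches
```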
return Error::Ok; -} - -// explicit instantiation of template methods -template int64_t Runner::getMetadataHelper( - std::string method_name, - int64_t default_val); -template bool Runner::getMetadataHelper( - std::string method_name, - bool default_val); - -} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama2/runner/runner.h b/examples/qualcomm/oss_scripts/llama2/runner/runner.h deleted file mode 100644 index aa0e5eb0ece..00000000000 --- a/examples/qualcomm/oss_scripts/llama2/runner/runner.h +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) Qualcomm Innovation Center, Inc. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// A simple llama2 runner that includes preprocessing and post processing logic. -// The module takes in a string as input and emits a string as output. - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -class RpcMemAllocator { - public: - RpcMemAllocator(QnnMemDescriptor shared_buffer_type) - : shared_buffer_type_(shared_buffer_type){}; - bool allocate(size_t bytes, size_t alignment) { - ptr_ = QnnExecuTorchAllocCustomMem(bytes, alignment); - if (ptr_ == nullptr) { - ET_LOG( - Info, - "Allocate Rpc mem falied, fallback to nromal ptr: bytes=%zu, alignment=%zu", - bytes, - alignment); - input_data_.resize(bytes); - ptr_ = input_data_.data(); - } - return ptr_ != nullptr; - } - - ~RpcMemAllocator() { - if (shared_buffer_type_ == QnnMemDescriptor::kIon || - shared_buffer_type_ == QnnMemDescriptor::kCustom) { - if (ptr_ != nullptr) { - QnnExecuTorchFreeCustomMem(ptr_); - } - } - } - - void* GetPtr() { - return ptr_; - } - - private: - QnnMemDescriptor shared_buffer_type_; - void* ptr_{nullptr}; - std::vector input_data_; - std::vector tensor_base_addrs_; -}; - -#define DEFINE_IOMEMMGR_ACCESSOR(name) \ - size_t get_##name##_pos() const { \ - return name##_pos_; \ - } \ - char* get_##name##_ptr() const { \ - return reinterpret_cast(ptr_) + name##_pos_; \ - } \ - char* set_##name##_ptr() { \ - CustomMemTensorInfo info = { \ - ptr_, \ - ptr_ + name##_pos_, \ - name##_pos_, \ - io_info_.name.size, \ - io_info_.name.shape.data(), \ - io_info_.name.rank, \ - io_info_.name.dtype}; \ - QnnExecuTorchAddCustomMemTensorInfo(info); \ - return reinterpret_cast(ptr_) + name##_pos_; \ - } - -#define DEFINE_IOMEMMGR_VEC_ACCESSOR(name) \ - const std::vector& get_##name##_pos_vec() const { \ - return name##_pos_; \ - } \ - char* get_##name##_ptr(int idx) { \ - return ptr_ + name##_pos_[idx]; \ - } \ - char* set_##name(int idx, size_t pos) { \ - name##_pos_[idx] = pos; \ - CustomMemTensorInfo info = { \ - ptr_, \ - ptr_ + name##_pos_[idx], \ - name##_pos_[idx], \ - io_info_.name.size, \ - io_info_.name.shape.data(), \ - io_info_.name.rank, \ - io_info_.name.dtype}; \ - QnnExecuTorchAddCustomMemTensorInfo(info); \ - return reinterpret_cast(ptr_) + pos; \ - } \ - char* update_##name(int idx, size_t shift_size) { \ - name##_pos_[idx] += shift_size; \ - return reinterpret_cast(ptr_) + name##_pos_[idx]; \ - } - -namespace example { -class IoMemMgr { - public: - // Allocate a big memory which is capable to contain all IO of all modules - IoMemMgr(){}; - IoMemMgr(executorch::runtime::MethodMeta method_meta); - - struct InfoAttrs { - std::unique_ptr tensor_meta; - size_t size = 0; - std::vector shape; - uint32_t rank; - size_t element_size; - executorch::aten::ScalarType dtype; - }; - - 
struct IoInfo { - InfoAttrs input_token; - InfoAttrs atten_mask; - InfoAttrs pos_idx; - InfoAttrs k_caches_read; - InfoAttrs k_caches_write; - InfoAttrs v_caches_read; - InfoAttrs v_caches_write; - InfoAttrs logit; - std::vector tensor_info{ - &input_token, - &atten_mask, - &pos_idx, - &k_caches_read, - &k_caches_write, - &v_caches_read, - &v_caches_write, - &logit, - }; - }; - - bool allocate(size_t alignment) { - bool ret = rpc_mem_allocator.allocate(total_nbytes_, alignment); - ptr_ = reinterpret_cast(rpc_mem_allocator.GetPtr()); - return ret; - } - bool init_tensors(); - - char* get_custom_mem_ptr() { - return ptr_; - } - - // Pointers of k cache read, v cache read and write are shifted every step. - // Set them first to register mem handle during qnn delegation init. - void set_all_shifted_ptrs(size_t max_seq_len); - - DEFINE_IOMEMMGR_ACCESSOR(atten_mask); - DEFINE_IOMEMMGR_ACCESSOR(input_token); - DEFINE_IOMEMMGR_ACCESSOR(pos_idx); - DEFINE_IOMEMMGR_ACCESSOR(logit); - - DEFINE_IOMEMMGR_VEC_ACCESSOR(k_caches_read); - DEFINE_IOMEMMGR_VEC_ACCESSOR(k_caches_write); - DEFINE_IOMEMMGR_VEC_ACCESSOR(v_caches_read); - DEFINE_IOMEMMGR_VEC_ACCESSOR(v_caches_write); - - private: - size_t total_nbytes_{0}; - char* ptr_{nullptr}; - void compute_total_nbytes(); - void set_tensor_meta(); - void init_io_info(); - - size_t atten_mask_pos_; - size_t input_token_pos_{0}; - size_t logit_pos_; - size_t pos_idx_pos_; - std::vector k_caches_read_pos_; - std::vector k_caches_write_pos_; - std::vector v_caches_read_pos_; - std::vector v_caches_write_pos_; - - IoInfo io_info_; - std::unique_ptr method_meta_; - RpcMemAllocator rpc_mem_allocator{QnnMemDescriptor::kCustom}; - std::unordered_map scalar_type_to_size = - { - {executorch::aten::ScalarType::Int, sizeof(int32_t)}, - {executorch::aten::ScalarType::Float, sizeof(float)}, - {executorch::aten::ScalarType::Char, sizeof(int8_t)}, - {executorch::aten::ScalarType::Short, sizeof(int16_t)}, - {executorch::aten::ScalarType::Byte, sizeof(uint8_t)}, - {executorch::aten::ScalarType::Bits16, sizeof(uint16_t)}, - }; -}; - -class Runner { - public: - explicit Runner( - const std::string& model_path, - const std::string& tokenizer_path, - const float temperature = 0.8f); - - struct Stats { - // Scaling factor for timestamps - in this case, we use ms. - const long SCALING_FACTOR_UNITS_PER_SECOND = 1000; - // Time stamps for the different stages of the execution - // model_load_start_ms: Start of model loading. - long model_load_start_ms; - // model_load_end_ms: End of model loading. - long model_load_end_ms; - // inference_start_ms: Immediately after the model is loaded (or we check - // for model load), measure the inference time. - long inference_start_ms; - // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right - // before the inference loop starts - long prompt_eval_end_ms; - // first_token: Timestamp when the first generated token is emitted - long first_token_ms; - // inference_end_ms: End of inference/generation. - long inference_end_ms; - // Keep a running total of the time spent in sampling. 
- long aggregate_sampling_time_ms; - // Token count from prompt - int64_t num_prompt_tokens; - // Token count from generated (total - prompt) - int64_t num_generated_tokens; - }; - - bool is_loaded() const; - executorch::runtime::Error load(); - executorch::runtime::Error mem_alloc(size_t alignment, size_t seq_len); - executorch::runtime::Error generate( - const std::string& prompt, - int32_t seq_len, - std::function token_callback = {}, - std::function stats_callback = {}); - void stop(); - executorch::runtime::Result - get_method_meta(); - - private: - // metadata - template - T getMetadataHelper(std::string method_name, T default_val); - template - int32_t logitsToToken(const executorch::aten::Tensor& logits_tensor); - executorch::runtime::Result run_model_step( - int64_t input_token, - ::executorch::extension::TensorPtr& token, - ::executorch::extension::TensorPtr& atten_mask, - ::executorch::extension::TensorPtr& start_pos, - std::vector<::executorch::extension::TensorPtr>& kv_tensors, - std::vector<::executorch::extension::TensorPtr>& kv_outputs); - // metadata - int32_t vocab_size_; - int64_t bos_id_; - int64_t eos_id_; - int32_t n_bos_; - int32_t n_eos_; - int32_t max_seq_len_; - int32_t head_dim_; - int32_t dim_; - std::unordered_set model_methods_; - std::unique_ptr module_; - std::string tokenizer_path_; - std::string model_path_; - float temperature_; - std::unique_ptr tokenizer_; - std::unique_ptr sampler_; - bool shouldStop_{false}; - Stats stats_; - IoMemMgr io_mem_mgr_; -}; - -} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama3_2/README.md b/examples/qualcomm/oss_scripts/llama3_2/README.md deleted file mode 100644 index 51de982b1b1..00000000000 --- a/examples/qualcomm/oss_scripts/llama3_2/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Summary - -## Overview -This file provides instructions to run LLAMA3.2 1B and 3B (WIP) with different parameters via the Qualcomm HTP backend. In LLAMA3.2, we offer the following modes to execute the model: - -Prefill Mode: This is also known as batch prefill mode, where the model takes in a list of tokens as input and generates the next token along with the key-value (KV) cache for all tokens. This mode is efficient for generating the initial sequence of tokens (usually the user's prompt). - -KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt. - -Hybrid Mode: Hybrid mode leverages the strengths of both batch prefill and KV cache modes to optimize token generation speed. Initially, it uses prefill mode to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens. - -## Instructions -### Note -1. For hybrid mode, the export time will be longer and can take up to 2-4 hours to complete. -2. When exporting a hybrid mode model, please ensure the device has at least 80 GB of memory and swap space. - -### Step 1: Setup -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. - -### Step 2: Prepare Model -1. Follow the [instructions](https://www.llama.com/) to download models. 
-At the end of this step, users should have the following files ready: consolidated.00.pth, params.json, and tokenizer.model. - -### Step3: Run default examples using hybrid mode. -Default example using hybrid mode. -```bash -python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 -``` - -If you would like to compile the model only, we have provided the flag `--compile_only`. -```bash -python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --compile_only -``` - -On the other hand, if you already have a pre-compiled .pte model, you can perform inference by providing the flag `--pre_gen_pte` and specifying the folder that contains the .pte model. -```bash -python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} -``` \ No newline at end of file diff --git a/examples/qualcomm/oss_scripts/llama3_2/TARGETS b/examples/qualcomm/oss_scripts/llama3_2/TARGETS index cab2076f8d6..e69de29bb2d 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/TARGETS +++ b/examples/qualcomm/oss_scripts/llama3_2/TARGETS @@ -1,37 +0,0 @@ -load("@fbcode_macros//build_defs:python_library.bzl", "python_library") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") -load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - -python_binary( - name = "llama", - srcs = ["llama.py"], - main_function = "executorch.examples.qualcomm.oss_scripts.llama3_2.llama.main", - preload_deps = [ - "//executorch/extension/llm/custom_ops:model_sharding_py", - ], - deps = [ - "//executorch/examples/qualcomm/oss_scripts/llama2:static_llama", - "//caffe2:torch", - "//executorch/extension/pybindings:aten_lib", - "//executorch/backends/qualcomm/partition:partition", - "//executorch/backends/qualcomm/quantizer:quantizer", - "//executorch/devtools:lib", - "//executorch/examples/models:models", - "//executorch/examples/qualcomm:utils", - "//executorch/extension/export_util:export_util", - "//executorch/extension/llm/export:export_lib", - ], -) - -runtime.command_alias( - name = "llama_qnn", - env = { - "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), - # Place holder to pass the QNN_SDK_ROOT check in executorch/examples/qualcomm/utils.py - "QNN_SDK_ROOT": "", - }, - exe = ":llama", -) diff --git a/examples/qualcomm/oss_scripts/llama3_2/targets.bzl b/examples/qualcomm/oss_scripts/llama3_2/targets.bzl index 64adc7eca9a..811a50629e1 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/targets.bzl +++ b/examples/qualcomm/oss_scripts/llama3_2/targets.bzl @@ -1,54 +1,2 @@ 
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_oss_build_kwargs", "runtime") -load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") - def define_common_targets(): - runtime.cxx_library( - name = "runner_lib", - srcs = glob( - [ - "runner/*.cpp", - ], - ), - exported_headers = glob([ - "runner/*.h", - ]), - compiler_flags = [ - "-Wno-global-constructors", - "-Wunused-command-line-argument", - ], - deps = [ - "//executorch/extension/llm/runner:stats", - "//executorch/extension/tensor:tensor", - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), - ], - exported_deps = [ - "//executorch/extension/module:module", - "//executorch/extension/llm/sampler:sampler", - "//executorch/examples/models/llama/tokenizer:tiktoken", - "//executorch/extension/llm/tokenizer:bpe_tokenizer", - "//executorch/extension/evalue_util:print_evalue", - "//executorch/backends/qualcomm/runtime:runtime", - ], - external_deps = [ - "gflags", - ], - **get_oss_build_kwargs() - ) - - runtime.cxx_binary( - name = "qnn_llama3_2_runner", - srcs = [ - "qnn_llama3_2_runner.cpp", - ], - compiler_flags = [ - "-Wno-global-constructors", - ], - deps = [ - ":runner_lib", - "//executorch/extension/threadpool:threadpool", # this depeneency shouldn't be needed. But it fails to build.. - ], - external_deps = [ - "gflags", - ], - **get_oss_build_kwargs() - ) + return None