Skip to content

Qualcomm AI Engine Direct - The performance issue about mutable buffer #6493

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backends/qualcomm/_passes/layout_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class LayoutTransform(ExportPass):
exir_ops.edge.aten.prelu.default,
exir_ops.edge.aten.relu.default,
exir_ops.edge.aten._softmax.default, # TODO: Need to find a new solution to do "axis_order" to transform axis.
exir_ops.edge.aten.sigmoid.default,
exir_ops.edge.aten.sqrt.default,
exir_ops.edge.aten.sub.Tensor,
exir_ops.edge.aten.sum.dim_IntList,
Expand Down
2 changes: 2 additions & 0 deletions backends/qualcomm/builders/__init__.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
op_ceil,
op_clamp,
op_conv2d,
op_copy,
op_depth_to_space,
op_dequantize,
op_div,
Expand Down Expand Up @@ -70,6 +71,7 @@
op_ceil,
op_clamp,
op_conv2d,
op_copy,
op_depth_to_space,
op_dequantize,
op_div,
Expand Down
35 changes: 28 additions & 7 deletions backends/qualcomm/builders/node_visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
get_parameter,
is_graph_input,
is_graph_output,
is_mutable_buffer_input,
is_mutable_buffer_output,
is_parameter,
)

Expand Down Expand Up @@ -214,7 +216,7 @@ def get_tensor_type(
node: torch.fx.Node,
tensor_type: PyQnnWrapper.Qnn_TensorType_t,
) -> PyQnnWrapper.Qnn_TensorType_t:
is_input = is_graph_input(node, self.edge_program)
is_input = is_graph_input(node, self.edge_program) or is_mutable_buffer_input(node, self.edge_program)
is_output = is_graph_output(node)
# handle logic for input/output tensors
if is_input or is_output:
Expand Down Expand Up @@ -245,6 +247,29 @@ def get_data_type(

return QNN_TENSOR_TYPE_MAP[tensor.dtype]

def get_tensor_name(
    self,
    node: torch.fx.Node,
    wrapper_idx: int = 0,
):
    """Build the QNN tensor name for ``node``, encoding runtime metadata.

    The prefixes are consumed at runtime:
      - ``input_{id}`` is utilized for sorting, because after the multiple
        passes in qnn_preprocess the input order between QNN and the original
        graph's forward function may differ.
      - ``mutbuf_{id}`` maps the input/output pair of a mutable buffer.
      - ``output_`` marks a graph output so it is not confused with
        per_tensor_dump.
    """
    base_name = f"{node.name}_{wrapper_idx}"
    signature = self.edge_program.graph_signature

    if is_mutable_buffer_input(node, self.edge_program):
        fqn = signature.inputs_to_buffers[node.target]
        buffer_idx = list(signature.buffers_to_mutate.values()).index(fqn)
        return f"input_{str(self.external_ids[node])}_mutbuf_{str(buffer_idx)}_{base_name}"
    if is_graph_input(node, self.edge_program):
        return f"input_{str(self.external_ids[node])}_{base_name}"
    if is_mutable_buffer_output(node, self.edge_program):
        buffer_idx = list(signature.buffers_to_mutate.keys()).index(node.name)
        return f"output_mutbuf_{buffer_idx}_{base_name}"
    if is_graph_output(node):
        return f"output_{base_name}"
    return base_name

def define_custom_tensor_wrapper(
self,
node_name: str,
Expand Down Expand Up @@ -305,11 +330,7 @@ def define_tensor(
if cached := nodes_to_wrappers[node_name].get(wrapper_idx, None):
return cached

tensor_name = f"{node.name}_{wrapper_idx}"
if is_graph_input(node, self.edge_program):
tensor_name = "input_" + str(self.external_ids[node]) + "_" + tensor_name
if is_graph_output(node):
tensor_name = "output_" + tensor_name
tensor_name = self.get_tensor_name(node, wrapper_idx)
dims = [1] if len(tensor.size()) == 0 else tensor.size()
tensor_type = self.get_tensor_type(node, tensor_type)
quant_encoding, quant_configs = self.get_quant_encoding_conf(
Expand Down Expand Up @@ -381,7 +402,7 @@ def generate_node_to_external_map(
# The order in which we visit the placeholder node is same as the *args
# order for the forward(*args) signature for this gm. Using the order of
# the nodes as external_id to extract the right arg from *args at runtime
if is_graph_input(node, edge_program):
if is_graph_input(node, edge_program) or is_mutable_buffer_input(node, edge_program):
node_to_external_map[node] = len(node_to_external_map)
for node in edge_program.graph_module.graph.nodes:
if is_graph_output(node):
Expand Down
68 changes: 68 additions & 0 deletions backends/qualcomm/builders/op_copy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from typing import Dict

import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper

import torch
from executorch.backends.qualcomm.utils.constants import (
QCOM_QUANT_ATTRS,
QCOM_SCALE,
QCOM_ZERO_POINT,
)
from executorch.exir.dialects._ops import ops as exir_ops

from .node_visitor import NodeVisitor, register_node_visitor
from .qnn_constants import OpReshape, QNN_OP_PACKAGE_NAME_QTI_AISW


@register_node_visitor
class Copy(NodeVisitor):
    # Visitor that lowers aten.copy.default onto QNN. The copy is emitted as a
    # QNN Reshape op (see OpReshape below), which forwards the source data to
    # the destination tensor unchanged.
    target = ["aten.copy.default"]

    def __init__(self, *args) -> None:
        super().__init__(*args)

    def define_node(
        self,
        node: torch.fx.Node,
        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
    ) -> PyQnnWrapper.PyQnnOpWrapper:
        # aten.copy(self, src): args[1] is the source operand whose data is
        # written into the destination buffer.
        input_node = node.args[1]
        input_tensor = self.get_tensor(input_node, node)
        copy_inp_tensor_wrapper = self.define_tensor(
            input_node,
            input_tensor,
            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
            nodes_to_wrappers,
            is_input_tensor=True,
        )

        copy_input_tensors = [copy_inp_tensor_wrapper]

        if quant_attrs := input_node.meta.get(QCOM_QUANT_ATTRS):
            quant_attrs = quant_attrs.copy()
            # Because there is no output after convert_pt2e, the
            # QCOM_QUANT_ATTRS of node is none; propagate the input's
            # quantization attributes onto this node so the output tensor
            # is quantized consistently.
            node.meta[QCOM_QUANT_ATTRS] = quant_attrs
        output_tensor = self.get_tensor(node, node)
        output_tensor_wrapper = self.define_tensor(
            node,
            output_tensor,
            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
            nodes_to_wrappers,
            is_input_tensor=False,
        )
        copy_output_tensors = [output_tensor_wrapper]

        # Emit the copy as a QNN Reshape: same element count, data passed
        # through to the (possibly mutable-buffer) destination.
        copy_op = PyQnnWrapper.PyQnnOpWrapper(
            node.name,
            QNN_OP_PACKAGE_NAME_QTI_AISW,
            OpReshape.op_name,
        )
        copy_op.AddInputTensors(copy_input_tensors)
        copy_op.AddOutputTensors(copy_output_tensors)

        return copy_op
37 changes: 37 additions & 0 deletions backends/qualcomm/builders/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,23 @@ def is_graph_input(
return tensor.op == "placeholder" and not is_parameter(tensor, edge_program)


def is_mutable_buffer_input(
tensor: torch.fx.Node, edge_program: torch.export.ExportedProgram
) -> bool:
"""
Check if the given tensor is a mutable buffer input

Args:
tensor: EdgeIR Tensor that is being checked for mutable buffer input
"""
if tensor.op == "placeholder" and is_buffer(edge_program, tensor):
fqn = edge_program.graph_signature.inputs_to_buffers[tensor.target]
# if the buffer is mutated then record that
if fqn in edge_program.graph_signature.buffers_to_mutate.values():
return True
return False


def is_graph_output(tensor: torch.fx.Node) -> bool:
"""
Check if the given tensor is used as a graph output
Expand All @@ -91,6 +108,26 @@ def is_graph_output(tensor: torch.fx.Node) -> bool:
return False


def is_mutable_buffer_output(
tensor: torch.fx.Node, edge_program: torch.export.ExportedProgram
) -> bool:
"""
Check if the given tensor is a mutable buffer output

Args:
tensor: EdgeIR Tensor that is being checked for mutable buffer output
"""
for user in tensor.users.keys():
# getitem node is skiped, check the op_skip_ops.py
if user.op == "output" or (
user.target.__name__ == "getitem" and is_graph_output(user)
):
# if the buffer is mutated then record that
if tensor.name in edge_program.graph_signature.buffers_to_mutate.keys():
return True
return False


def is_constant(
tensor: torch.fx.Node, edge_program: torch.export.ExportedProgram
) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion backends/qualcomm/partition/common_defs.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
exir_ops.edge.aten.clone.default,
exir_ops.edge.aten.full.default,
exir_ops.edge.aten.slice_scatter.default,
exir_ops.edge.aten.copy.default,
exir_ops.edge.quantized_decomposed.embedding_4bit.dtype,
]

to_be_implemented_operator = [
Expand Down
18 changes: 2 additions & 16 deletions backends/qualcomm/partition/qnn_partitioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
Partitioner,
PartitionResult,
)
from executorch.exir.backend.utils import tag_constant_data
from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
from torch.fx.passes.infra.partitioner import Partition
from torch.fx.passes.operator_support import OperatorSupportBase

Expand Down Expand Up @@ -136,27 +136,13 @@ def tag_nodes(
node.meta["delegation_tag"] = delegation_tag
self.partition_tags[delegation_tag] = self.delegation_spec

# need to take care of consumed constants
consumed_constants = (
*edge_program.graph_signature.inputs_to_buffers,
*edge_program.graph_signature.inputs_to_parameters,
)
for node in edge_program.graph_module.graph.nodes:
# find placeholders as lifted_constants
if node.op != "placeholder" or len(node.users) != 0:
continue

if node.name in consumed_constants:
# does no harm to merge them into last partition,
# since they will all be removed in following stage
node.meta["delegation_tag"] = delegation_tag

# override
def partition(self, edge_program: torch.export.ExportedProgram) -> PartitionResult:
partitions = self.generate_partitions(edge_program)
if len(partitions) != 0:
self.tag_nodes(partitions, edge_program)
tag_constant_data(edge_program)
tag_mutated_buffer(edge_program)
for node in edge_program.graph_module.graph.nodes:
if hasattr(node, "meta"):
# pop certain keys in meta for not affecting the passes in compilation
Expand Down
33 changes: 33 additions & 0 deletions backends/qualcomm/quantizer/custom_annotation.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
QuantizationConfig,
)
from executorch.backends.qualcomm.quantizer.utils import QUANT_ANNOTATION_KEY
from executorch.exir.dialects._ops import ops as exir_ops
from torch.ao.quantization.quantizer import (
QuantizationAnnotation,
SharedQuantizationSpec,
Expand Down Expand Up @@ -144,3 +145,35 @@ def annotate_matmul(node: Node, quantization_config: QuantizationConfig):
for node in gm.graph.nodes:
if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
annotate_matmul(node, quantization_config_16a8w)


def get_custom_quant_ios_dtype(
cache_shape: torch.Size,
node: torch.fx.Node,
kv_dtype=torch.uint8,
sharding_dtype=torch.uint16,
):
"""
This function is specific for llama inputs and outputs
"""
if node.op == "placeholder" and "attention_sdpa_kv_cache_past_" in node.name:
return kv_dtype

# Tag index put node before copy node, because copy is a skipped node in qnn
if (
exir_ops.edge.aten.copy.default == node.target
and node.meta["val"].shape == cache_shape
):
return kv_dtype

# Tag sharding io
if exir_ops.edge.llama.fallback.default in [
u.target for u in list(node.users.keys())
] + [node.target]:
return sharding_dtype

# Tag index op as quantized tensors. It is caused by sharding
if exir_ops.edge.aten.index.Tensor in [
u.target for u in list(node.users.keys())
] + [node.target]:
return sharding_dtype
37 changes: 25 additions & 12 deletions backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <executorch/backends/qualcomm/runtime/QnnExecuTorchBackend.h>
#include <executorch/backends/qualcomm/runtime/QnnManager.h>
#include <executorch/backends/qualcomm/schema_generated.h>
#include <chrono>

namespace executorch {
namespace backends {
namespace qnn {
Expand Down Expand Up @@ -185,6 +187,7 @@ Error QnnExecuTorchBackend::execute(
BackendExecutionContext& context,
DelegateHandle* handle,
EValue** args) const {
auto begin = std::chrono::high_resolution_clock::now();
QnnManager* qnn_manager = static_cast<QnnManager*>(handle);

std::vector<std::shared_ptr<TensorWrapper>> input_tensors =
Expand All @@ -194,29 +197,34 @@ Error QnnExecuTorchBackend::execute(
std::vector<Qnn_Tensor_t> input_tensor_structs;
std::vector<Qnn_Tensor_t> output_tensor_structs;

int args_index = 0;
input_tensor_structs.reserve(input_tensors.size());
for (int i = 0; i < input_tensors.size(); ++i) {
if (qnn_manager->RegisterMem(
args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) !=
Error::Ok) {
// update data ptr only should be fine
input_tensors[i]->FillDataBuffer(
args[i]->toTensor().const_data_ptr(), false /* copy_data */);
for (const auto& input_tensor : input_tensors){
if (input_tensor->GetName().find("mutbuf_") == std::string::npos){
if (qnn_manager->RegisterMem(
args[args_index]->toTensor().mutable_data_ptr(), input_tensor) !=
Error::Ok) {
// update data ptr only should be fine
input_tensor->FillDataBuffer(
args[args_index]->toTensor().const_data_ptr(), false /* copy_data */);
}
args_index++;
}
input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct());

input_tensor_structs.push_back(input_tensor->CloneTensorStruct());
}

int output_index = input_tensors.size();

for (const auto& output_tensor : output_tensors) {
// pos=0 limits the search to the prefix
if (output_tensor->GetName().rfind("output_", 0) == 0) {
if (output_tensor->GetName().rfind("output_", 0) == 0 && output_tensor->GetName().find("mutbuf_") == std::string::npos) {
void* mutable_data_ptr =
args[output_index]->toTensor().mutable_data_ptr();
args[args_index]->toTensor().mutable_data_ptr();
if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) !=
Error::Ok) {
output_tensor->FillDataBuffer(mutable_data_ptr, false /* copy_data */);
}
output_index++;
args_index++;
}
output_tensor_structs.push_back(output_tensor->CloneTensorStruct());
}
Expand All @@ -232,7 +240,12 @@ Error QnnExecuTorchBackend::execute(
qnn_manager->ProfileExecuteData(context.event_tracer()) == Error::Ok,
Internal,
"Fail to profile graph");
auto end = std::chrono::high_resolution_clock::now();

auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end -
begin);
QNN_EXECUTORCH_LOG_INFO(
"QNN Graph Execute Time in QnnExecuTorchBackend: %ld us", elapsed.count());
return Error::Ok;
}

Expand Down
Loading