
Commit 3b6af64

resolve uint16 type and reorder input in runtime

1 parent 74adfc1 commit 3b6af64

File tree

10 files changed (+36, -32 lines changed)


backends/qualcomm/builders/node_visitor.py

Lines changed: 9 additions & 15 deletions
@@ -14,8 +14,6 @@
 
 from executorch.exir.dialects._ops import ops as exir_ops
 
-from .qnn_constants import QNN_uint16
-
 from .utils import get_parameter, is_graph_input, is_graph_output, is_parameter
 
 
@@ -26,7 +24,7 @@
     # Note that there is no int64 tensor data type in Qnn.
     torch.int64: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UNDEFINED,
     torch.uint8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_8,
-    QNN_uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_16,
+    torch.uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_16,
 }
 QNN_TENSOR_TYPE_MAP = {
     torch.float32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
@@ -35,7 +33,7 @@
     torch.int32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32,
     torch.int64: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_64,
     torch.uint8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_8,
-    QNN_uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_16,
+    torch.uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_16,
     float: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
 }
 
@@ -169,7 +167,7 @@ def get_quant_encoding_conf(
             return self.make_qnn_per_tensor_config(quant_attrs)
 
     def get_quant_tensor_value(
-        self, tensor: torch.Tensor, quant_attrs: Dict, dtype, bitwidth
+        self, tensor: torch.Tensor, quant_attrs: Dict, quant_configs: Dict
     ) -> torch.Tensor:
         if quant_attrs["encoding"] in PER_TENSOR_ENCODING:
             scale = quant_attrs["scale"]
@@ -178,16 +176,11 @@ def get_quant_tensor_value(
             scale = quant_attrs["scales"]
             zero_point = quant_attrs["zero_points"]
 
-        # To bypass torch.uint16 quantization is not supported
-        dtype = (
-            torch.int32
-            if dtype == PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_16
-            else quant_attrs["dtype"]
-        )
+        dtype = quant_configs["dtype"]
 
         tensor = tensor.div(scale).add(zero_point).round().to(dtype)
         # Make the backends access data correctly
-        if bitwidth == 4:
+        if quant_configs.get("bitwidth") == 4:
             mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8)
             tensor = torch.bitwise_and(mask, tensor)
         return tensor
@@ -236,7 +229,7 @@ def get_data_type(
             <= torch.iinfo(torch.int16).max - torch.iinfo(torch.int16).min
         ):
             if unsigned:
-                quant_config["dtype"] = QNN_uint16
+                quant_config["dtype"] = torch.uint16
             else:
                 quant_config["dtype"] = torch.int16
         return QNN_QUANT_TYPE_MAP[quant_config["dtype"]]
@@ -304,6 +297,8 @@ def define_tensor(
             return cached
 
         tensor_name = node.name
+        if is_graph_input(node, self.edge_program):
+            tensor_name = "QnnInput_"+str(self.external_ids[node])+"_"+ tensor_name
        if is_graph_output(node):
             tensor_name = "output_" + tensor_name
         dims = [1] if len(tensor.size()) == 0 else tensor.size()
@@ -329,8 +324,7 @@ def define_tensor(
             tensor = self.get_quant_tensor_value(
                 tensor,
                 node.meta["quant_attrs"],
-                dtype,
-                quant_configs.get("bitwidth"),
+                quant_configs,
             )
             tensor_wrapper = PyQnnWrapper.TensorWrapper(
                 tensor_name,
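
For reference, a minimal, self-contained sketch (not the repository's code) of what the reworked get_quant_tensor_value does on the per-tensor path: the quantization math is unchanged, but the storage dtype and the optional 4-bit packing are now read from a single quant_configs dict instead of separate dtype/bitwidth arguments. The helper name and the literal values below are illustrative only.

```python
import torch

def quantize_tensor(tensor: torch.Tensor, quant_attrs: dict, quant_configs: dict) -> torch.Tensor:
    # scale/zero_point describe the affine mapping; quant_configs supplies the
    # storage dtype (torch.uint16 after this commit) and an optional bitwidth.
    scale = quant_attrs["scale"]
    zero_point = quant_attrs["zero_point"]
    dtype = quant_configs["dtype"]

    tensor = tensor.div(scale).add(zero_point).round().to(dtype)
    if quant_configs.get("bitwidth") == 4:
        # Keep only the low nibble so the backend reads 4-bit data correctly.
        mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8)
        tensor = torch.bitwise_and(mask, tensor)
    return tensor

# torch.uint16 requires a recent PyTorch (2.3+); the old QNN_uint16 string
# constant is removed by this commit.
q = quantize_tensor(
    torch.randn(2, 3),
    {"scale": 0.004, "zero_point": 32768},
    {"dtype": torch.uint16},
)
```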

backends/qualcomm/builders/op_split.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 
 
 @register_node_visitor
-class Softmax(NodeVisitor):
+class Split(NodeVisitor):
     target = ["aten.split_with_sizes.default"]
 
     def __init__(self, *args) -> None:

backends/qualcomm/builders/qnn_constants.py

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@
 from enum import IntEnum, unique
 
 QNN_OP_PACKAGE_NAME_QTI_AISW = "qti.aisw"
-QNN_uint16 = "uint16"
 
 # Below constants should be same as those in QNN headers.
 # Maybe someday we should expose these constants by pybind

backends/qualcomm/quantizer/utils.py

Lines changed: 4 additions & 4 deletions
@@ -113,14 +113,14 @@ def get_default_8bit_qnn_ptq_config() -> QuantizationConfig:
 
 
 # 4 bits quantization only supports specific ops.
-def get_16a4w_qnn_ptq_config() -> QuantizationConfig:
+def get_16a4w_qnn_ptq_config(act_observer=MovingAverageMinMaxObserver) -> QuantizationConfig:
     extra_args: Dict[str, Any] = {"eps": 2**-20}
     act_quantization_spec = QuantizationSpec(
         dtype=torch.int32,
         quant_min=torch.iinfo(torch.uint16).min,
         quant_max=torch.iinfo(torch.uint16).max,
         qscheme=torch.per_tensor_affine,
-        observer_or_fake_quant_ctr=MovingAverageMinMaxObserver.with_args(**extra_args),
+        observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
     )
 
     weight_quantization_spec = QuantizationSpec(
@@ -150,14 +150,14 @@ def get_16a4w_qnn_ptq_config() -> QuantizationConfig:
     return quantization_config
 
 
-def get_default_16bit_qnn_ptq_config() -> QuantizationConfig:
+def get_default_16bit_qnn_ptq_config(act_observer=MovingAverageMinMaxObserver) -> QuantizationConfig:
     extra_args: Dict[str, Any] = {"eps": 2**-20}
     act_quantization_spec = QuantizationSpec(
         dtype=torch.int32,
         quant_min=torch.iinfo(torch.uint16).min,
         quant_max=torch.iinfo(torch.uint16).max,
         qscheme=torch.per_tensor_affine,
-        observer_or_fake_quant_ctr=MovingAverageMinMaxObserver.with_args(**extra_args),
+        observer_or_fake_quant_ctr=act_observer.with_args(**extra_args),
     )
 
     weight_quantization_spec = QuantizationSpec(
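
The act_observer keyword added above defaults to the previous behaviour, so existing callers are unaffected. A short usage sketch (import path taken from the quantizer module used elsewhere in this commit; variable names are illustrative):

```python
from torch.ao.quantization.observer import MinMaxObserver

from executorch.backends.qualcomm.quantizer.quantizer import (
    get_16a4w_qnn_ptq_config,
    get_default_16bit_qnn_ptq_config,
)

# Default: activations observed with MovingAverageMinMaxObserver, as before.
config_16a16w = get_default_16bit_qnn_ptq_config()

# Opt in to a different activation observer, e.g. the plain MinMaxObserver
# that the llama2 example passes in below.
config_16a4w = get_16a4w_qnn_ptq_config(act_observer=MinMaxObserver)
```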

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 10 additions & 1 deletion
@@ -11,7 +11,7 @@
 #include <executorch/backends/qualcomm/runtime/QnnExecuTorchBackend.h>
 #include <executorch/backends/qualcomm/runtime/QnnManager.h>
 #include <executorch/backends/qualcomm/schema_generated.h>
-
+#include <algorithm>
 #include <string>
 namespace torch {
 namespace executor {
@@ -20,6 +20,12 @@ using namespace qnn;
 using namespace qnn_delegate;
 constexpr const char* QNN_COMPILE_SPEC = "qnn_compile_spec";
 
+bool CompareQnnInput(const std::shared_ptr<TensorWrapper>& a, const std::shared_ptr<TensorWrapper>& b) {
+  int numA = std::stoi(a->GetName().substr(a->GetName().find('_') + 1));
+  int numB = std::stoi(b->GetName().substr(b->GetName().find('_') + 1));
+  return numA < numB;
+}
+
 Result<DelegateHandle*> QnnExecuTorchBackend::init(
     BackendInitContext& context,
     FreeableBuffer* processed,
@@ -187,6 +193,9 @@ Error QnnExecuTorchBackend::execute(
       qnn_manager->GetGraphOutputs();
   std::vector<Qnn_Tensor_t> input_tensor_structs;
   std::vector<Qnn_Tensor_t> output_tensor_structs;
+  // Using the order of the nodes as external_id in AOT
+  // to extract the right arg from *args at runtime
+  std::sort(input_tensors.begin(), input_tensors.end(), CompareQnnInput);
 
   input_tensor_structs.reserve(input_tensors.size());
   for (int i = 0; i < input_tensors.size(); ++i) {
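
The sort relies on the naming convention added in node_visitor.py above: graph inputs are named "QnnInput_<external_id>_<node name>", so the integer after the first underscore recovers the argument's position in *args. Below is a small Python sketch of the same ordering logic that the CompareQnnInput comparator implements; the tensor names are made up for illustration.

```python
# Names produced at AOT time, in whatever order the backend reports them.
input_names = ["QnnInput_2_kv_mask", "QnnInput_0_tokens", "QnnInput_1_pos_ids"]

def external_id(name: str) -> int:
    # Same parsing as CompareQnnInput: take the text after the first '_'
    # and read the leading integer (std::stoi ignores the trailing name).
    return int(name.split("_")[1])

# Restore the original argument order so arg i feeds the i-th graph input.
ordered = sorted(input_names, key=external_id)
print(ordered)  # ['QnnInput_0_tokens', 'QnnInput_1_pos_ids', 'QnnInput_2_kv_mask']
```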

backends/qualcomm/runtime/backends/QnnBackendCache.cpp

Lines changed: 3 additions & 2 deletions
@@ -87,7 +87,8 @@ QnnBackendCache::QnnBackendCache(
     state_ = SERIALIZE;
     QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in SAVE MODE.");
     return;
-  } else {
+  }
+  /*else {
     // TODO: need fix on this since qnn context binary could somehow
     // pass the check of flatbuffer verifier
     // check if context binary came from flatbuffer
@@ -100,7 +101,7 @@ QnnBackendCache::QnnBackendCache(
       state_ = ONLINE_PREPARE;
       return;
     }
-  }
+  }*/
 
   if (qnn_sys_impl_.Load() != Error::Ok) {
     QNN_EXECUTORCH_LOG_ERROR(

examples/qualcomm/executor_runner/qnn_llama_runner.cpp

Lines changed: 1 addition & 5 deletions
@@ -100,7 +100,7 @@ int main(int argc, char** argv) {
     if (input_files.size() == 0) {
       break;
     }
-    // inputs: [tokens, pos_ids, atten_mask, kv_mask, k_cache, v_cache]
+    // inputs: [tokens, pos_ids, kv_mask, *k_cache, *v_cache]
     // tokens are determined by command line arguments
     // pos_ids are infered inside runner
     std::vector<ManagedTensor> managed_inputs;
@@ -120,10 +120,6 @@
           tensor_meta->nbytes());
 
       inputs[input_index].resize(tensor_meta->nbytes());
-      if (input_index <= 2) {
-        fin.seekg(0, fin.beg);
-        fin.read(inputs[input_index].data(), file_size);
-      }
       fin.close();
 
       auto tensor_shape = tensor_meta->sizes();

examples/qualcomm/llama2/llama.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@
 from functools import partial
 
 import torch
+from torch.ao.quantization.observer import MinMaxObserver
 
 from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
 from executorch.backends.qualcomm.utils.utils import convert_linear_to_conv2d
@@ -206,6 +207,7 @@ def sample_top_p(probs: torch.Tensor, top_p: float) -> torch.Tensor:
         shared_buffer=args.shared_buffer,
         metadata=instance.get_metadata(),
         direct_io=True,
+        act_observer=MinMaxObserver
     )
 
     if args.compile_only:

examples/qualcomm/llama2/runner/runner.cpp

Lines changed: 2 additions & 1 deletion
@@ -139,7 +139,8 @@ Result<torch::executor::Tensor> Runner::run_model_step(
     Tensor& start_pos,
     std::vector<Tensor>& input_tensors) {
   token.mutable_data_ptr<int32_t>()[0] = input_token;
-  // inputs:[tokens, start_pos, atten_mask, kv_mask, k_cache, v_cache]
+  // inputs:[tokens, start_pos, kv_mask, k_cache, v_cache]
+  // input_tensors:[kv_mask, k_cache, v_cache]
   std::vector<EValue> inputs = {token, start_pos};
   inputs.insert(inputs.end(), input_tensors.begin(), input_tensors.end());

examples/qualcomm/scripts/utils.py

Lines changed: 4 additions & 2 deletions
@@ -15,6 +15,7 @@
 import numpy as np
 
 import torch
+from torch.ao.quantization.observer import MovingAverageMinMaxObserver
 from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
 from executorch.backends.qualcomm.quantizer.quantizer import (
     get_16a4w_qnn_ptq_config,
@@ -184,6 +185,7 @@ def build_executorch_binary(
     direct_io=False, # TODO: temporal workaround for llama
     shared_buffer=False,
     metadata=None,
+    act_observer=MovingAverageMinMaxObserver
 ):
     if quant_dtype is not None:
         quantizer = QnnQuantizer()
@@ -194,10 +196,10 @@
             pass # default setting
         elif quant_dtype == QuantDtype.use_16a16w:
             quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS)
-            quantizer.set_bit16_op_quant_config(get_default_16bit_qnn_ptq_config())
+            quantizer.set_bit16_op_quant_config(get_default_16bit_qnn_ptq_config(act_observer=act_observer))
         elif quant_dtype == QuantDtype.use_16a4w:
             quantizer.add_16bit_quant_ops(quantizer.SUPPORTED_OPS)
-            quantizer.set_bit16_op_quant_config(get_16a4w_qnn_ptq_config())
+            quantizer.set_bit16_op_quant_config(get_16a4w_qnn_ptq_config(act_observer=act_observer))
             quantizer.set_per_channel_weight_dtype(weight_dtype_for_16bit_act="int4")
         else:
             raise AssertionError(f"No support for QuantDtype {quant_dtype}.")
