pytorch
diff --git a/‎backends/cadence/aot/fuse_ops.py
Lines changed: 70 additions & 0 deletions b/‎backends/cadence/aot/fuse_ops.py
Lines changed: 70 additions & 0 deletions
diff --git a/‎backends/cadence/aot/tests/test_fusion_ops_passes.py
Lines changed: 43 additions & 0 deletions b/‎backends/cadence/aot/tests/test_fusion_ops_passes.py
Lines changed: 43 additions & 0 deletions
diff --git a/‎backends/qualcomm/CMakeLists.txt
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/CMakeLists.txt
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/builders/README.md
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/builders/README.md
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/builders/node_visitor.py
Lines changed: 2 additions & 48 deletions b/‎backends/qualcomm/builders/node_visitor.py
Lines changed: 2 additions & 48 deletions
diff --git a/‎backends/qualcomm/builders/node_visitor_manager.py
Lines changed: 77 additions & 0 deletions b/‎backends/qualcomm/builders/node_visitor_manager.py
Lines changed: 77 additions & 0 deletions
diff --git a/‎backends/qualcomm/builders/op_abs.py
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/builders/op_abs.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/builders/op_adaptive_avg_pool2d.py
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/builders/op_adaptive_avg_pool2d.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/builders/op_add.py
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/builders/op_add.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/builders/op_amax.py
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/builders/op_amax.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/builders/op_and.py
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/builders/op_and.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/builders/op_arange.py
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/builders/op_arange.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/builders/op_argmin.py
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/builders/op_argmin.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/builders/op_avg_pool2d.py
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/builders/op_avg_pool2d.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/builders/op_batch_norm.py
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/builders/op_batch_norm.py
Lines changed: 2 additions & 1 deletion
@@ -863,6 +863,76 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         return result
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=1))
+class FuseMulTensorIntoQuantPass(ExportPass):
+    """
+    Looks for the pattern where aten.mul.Tensor (by a constant aten.full) is
+    followed by a quant node. If found, folds the constant into the quant scale
+    and removes the mul node. Quantize divides by the scale, so
+    q = zp + (x * c) / scale == zp + x / (scale / c), i.e. new_scale = scale / c.
+    """
+
+    def attempt_fusion(
+        self, graph_module: torch.fx.GraphModule, mul_node: torch.fx.Node
+    ) -> None:
+        full_nodes = [
+            arg
+            for arg in mul_node.args
+            if isinstance(arg, torch.fx.Node)
+            and arg.target == exir_ops.edge.aten.full.default
+        ]
+
+        if len(full_nodes) != 1 or len(mul_node.users) != 1:
+            return
+
+        full_node = full_nodes[0]
+        quant_node = next(iter(mul_node.users))
+
+        if quant_node.target not in {
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            exir_ops.edge.cadence.quantize_per_tensor.default,
+        }:
+            return
+
+        # mul is commutative: the tensor operand is whichever arg is not the full.
+        data_nodes = [arg for arg in mul_node.args if arg is not full_node]
+        if len(data_nodes) != 1 or not isinstance(data_nodes[0], torch.fx.Node):
+            return
+
+        # Calculate the new scale value (see the class docstring for why we divide).
+        old_scale = quant_node.args[1]
+        assert isinstance(old_scale, (int, float))
+        mul_scalar = full_node.args[1]
+        assert isinstance(mul_scalar, (int, float))
+        if mul_scalar == 0:
+            # x * 0 cannot be represented by any finite scale; leave the graph as is.
+            return
+        new_scale = float(old_scale) / float(mul_scalar)
+
+        logging.debug(
+            f"Fused {mul_node} and {full_node} into {quant_node}. Updated scale from {old_scale} to {new_scale}"
+        )
+
+        # Rewire the quant node to the tensor operand and install the new scale.
+        new_quant_args = list(quant_node.args)
+        new_quant_args[0] = data_nodes[0]
+        new_quant_args[1] = new_scale
+        quant_node.args = tuple(new_quant_args)
+
+        # The full node may feed other ops; erase it only if it is now unused.
+        graph_module.graph.erase_node(mul_node)
+        if not full_node.users:
+            graph_module.graph.erase_node(full_node)
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        for node in graph_module.graph.find_nodes(
+            op="call_function", target=exir_ops.edge.aten.mul.Tensor
+        ):
+            self.attempt_fusion(graph_module, node)
+        graph_module.graph.eliminate_dead_code()
+        return super().call(graph_module)
+
+
 @register_cadence_pass(CadencePassAttribute(opt_level=1))
 class FuseMulTensorIntoDequantPass(ExportPass):
     """
 
@@ -20,6 +20,7 @@
     FuseMMWithAdd,
     FuseMulScalarIntoDequantPass,
     FuseMulTensorIntoDequantPass,
+    FuseMulTensorIntoQuantPass,
     FuseQuantDequantToRequantizePass,
     FuseTransposeOrPermuteOpPairsPass,
 )
@@ -587,6 +588,48 @@ def test_fuse_mul_scalar_into_dequant(self):
                 deq_scale = node.args[1]
         self.assertEqual(deq_scale, dequant_scale * mul_value)
 
+    def test_fuse_mul_into_quant(self):
+        quant_scale = 1.5
+        mul_value = 10
+
+        builder = GraphBuilder()
+        x = builder.placeholder("x", torch.randn(4, 32, dtype=torch.float32))
+        full = builder.call_operator(
+            op=exir_ops.edge.aten.full.default,
+            args=([1], mul_value),
+        )
+        mul = builder.call_operator(
+            op=exir_ops.edge.aten.mul.Tensor,
+            args=(x, full),
+        )
+        quant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(mul, quant_scale, 0, 0, 255, torch.uint8),
+        )
+        builder.output(quant)
+        graph_module = FuseMulTensorIntoQuantPass()(
+            builder.get_graph_module()
+        ).graph_module
+
+        # verify that the mul and full ops were removed
+        self.check_op_counts(
+            graph_module,
+            expected_op_counts={
+                exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
+                exir_ops.edge.aten.full.default: 0,
+                exir_ops.edge.aten.mul.Tensor: 0,
+            },
+        )
+
+        # verify that the quant node now reads x directly and that its scale was
+        # divided by the mul constant: zp + (x * c) / s == zp + x / (s / c)
+        quant_nodes = graph_module.graph.find_nodes(
+            op="call_function",
+            target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+        )
+        self.assertEqual(quant_nodes[0].args[0].op, "placeholder")
+        self.assertAlmostEqual(quant_nodes[0].args[1], quant_scale / mul_value)
+
     def test_fuse_then_transpose_pass(self):
         # Create a graph with full -> transpose.
         builder = GraphBuilder()
 
@@ -130,6 +130,7 @@ add_library(qnn_implementation STATIC)
 add_library(qnn_logger STATIC)
 add_library(qnn_manager STATIC)
 add_library(qnn_mem_manager STATIC)
+add_library(qnn_op_package_manager STATIC)
 add_library(qnn_profiler STATIC)
 add_library(qnn_schema INTERFACE ${_qnn_schema__outputs})
 add_library(qnn_sys_function_interface INTERFACE)
@@ -152,7 +153,7 @@ target_link_libraries(
 target_link_libraries(qnn_executorch_logging PRIVATE qnn_schema)
 target_link_libraries(qnn_profiler PRIVATE qnn_executorch_logging)
 target_link_libraries(qnn_logger PRIVATE qnn_implementation ${android_log})
-target_link_libraries(qnn_backend PRIVATE qnn_implementation qnn_logger)
+target_link_libraries(qnn_backend PRIVATE qnn_implementation qnn_logger qnn_op_package_manager)
 target_link_libraries(qnn_custom_protocol PRIVATE qnn_logger)
 target_link_libraries(
   qnn_device PRIVATE qnn_executorch_logging qnn_implementation qnn_logger
 
@@ -176,7 +176,8 @@ import torch
 from executorch.backends.qualcomm.utils.constants import QCOM_DATA
 # op builder will inherit NodeVisitor and have its own implementation
 # register_node_visitor for book-keeping the dictionary of target name v.s. callback
-from .node_visitor import NodeVisitor, register_node_visitor
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
 # the definitions required to build operator in QNN
 from .qnn_constants import OpLayerNorm, QNN_OP_PACKAGE_NAME_QTI_AISW
 # utility to get parameter value when creating tensor in QNN
 
@@ -63,7 +63,9 @@
     torch.int64: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_64,
     torch.uint8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_8,
     torch.uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_16,
+    torch.uint32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
     float: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
+    int: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
 }
 
 PER_CHANNEL_ENCODING = {
@@ -470,51 +472,3 @@ def define_node(
     ) -> PyQnnWrapper.PyQnnOpWrapper:
         """Convert torch.fx.Node to OpWrapper"""
         raise NotImplementedError("NodeVisitor must be extended!")
-
-
-# This will hold mapping of all node names to the visitor class
-_node_visitor_dict = {}
-
-
-def register_node_visitor(visitor):
-    """Register node visitor into _node_visitor_dict"""
-    assert (
-        isinstance(visitor, type)
-        and issubclass(visitor, NodeVisitor)
-        and hasattr(visitor, "target")
-    ), f"Illformed NodeVisitor subclass, can't register!, got: {visitor}"
-    for target in visitor.target:
-        _node_visitor_dict[target] = visitor
-
-
-def generate_node_to_external_map(
-    edge_program: torch.export.ExportedProgram,
-) -> Dict[torch.fx.Node, int]:
-    node_to_external_map = {}
-    for node in edge_program.graph_module.graph.nodes:
-        # The order in which we visit the placeholder node is same as the *args
-        # order for the forward(*args) signature for this gm. Using the order of
-        # the nodes as external_id to extract the right arg from *args at runtime
-        if is_graph_input(node, edge_program):
-            node_to_external_map[node] = len(node_to_external_map)
-    for node in edge_program.graph_module.graph.nodes:
-        if is_graph_output(node):
-            node_to_external_map[node] = len(node_to_external_map)
-    return node_to_external_map
-
-
-def get_node_visitors(
-    edge_program: torch.export.ExportedProgram,
-    enable_tensor_dump=False,
-) -> Dict[str, NodeVisitor]:
-    """Create a new class instance at runtime, and put them in a dict"""
-    node_to_external_map = generate_node_to_external_map(edge_program)
-    node_visitors = {}
-    for target, visitor in _node_visitor_dict.items():
-        assert callable(
-            visitor
-        ), f"Expeting a callable class, but got {visitor} of type {type(visitor)}"
-        node_visitors[target] = visitor(
-            node_to_external_map, edge_program, enable_tensor_dump
-        )
-    return node_visitors
@@ -0,0 +1,77 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional
+
+import torch
+from executorch.backends.qualcomm.serialization.qc_schema import (
+    QnnExecuTorchOpPackageInfo,
+)
+
+from .node_visitor import NodeVisitor
+from .op_custom_op import CustomOp
+from .utils import is_graph_input, is_graph_output
+
+
+# Registry filled by register_node_visitor: op target -> NodeVisitor subclass
+_node_visitor_dict = {}
+
+
+def register_node_visitor(visitor):
+    """Register a NodeVisitor subclass under each of its targets and return it."""
+    assert (
+        isinstance(visitor, type)
+        and issubclass(visitor, NodeVisitor)
+        and hasattr(visitor, "target")
+    ), f"Ill-formed NodeVisitor subclass, can't register!, got: {visitor}"
+    _node_visitor_dict.update(dict.fromkeys(visitor.target, visitor))
+    return visitor  # keeps the class bound to its name under @register_node_visitor
+
+
+def generate_node_to_external_map(
+    edge_program: torch.export.ExportedProgram,
+) -> Dict[torch.fx.Node, int]:
+    """Assign consecutive external ids to graph inputs first, then graph outputs."""
+    graph_nodes = edge_program.graph_module.graph.nodes
+    external_ids = {}
+    # Placeholder order matches the forward(*args) order, so an input's position
+    # here is the index used to pick its argument out of *args at runtime.
+    for candidate in graph_nodes:
+        if is_graph_input(candidate, edge_program):
+            external_ids[candidate] = len(external_ids)
+    for candidate in filter(is_graph_output, graph_nodes):
+        external_ids[candidate] = len(external_ids)
+    return external_ids
+
+
+def get_node_visitors(
+    edge_program: torch.export.ExportedProgram,
+    enable_tensor_dump=False,
+    op_package_infos: Optional[List[QnnExecuTorchOpPackageInfo]] = None,
+) -> Dict[str, NodeVisitor]:
+    """Instantiate all registered visitors plus one CustomOp builder per custom op."""
+    node_to_external_map = generate_node_to_external_map(edge_program)
+    node_visitors = {}
+    for target, visitor in _node_visitor_dict.items():
+        assert callable(
+            visitor
+        ), f"Expecting a callable class, but got {visitor} of type {type(visitor)}"
+        node_visitors[target] = visitor(
+            node_to_external_map, edge_program, enable_tensor_dump
+        )
+    # op_package_infos may be None or empty, in which case no custom ops are added.
+    # A custom op builder replaces a registered visitor stored under the same key;
+    # only the first package info seen for each custom op name is used.
+    seen_custom_ops = set()
+    for op_package_info in op_package_infos or ():
+        custom_op_name = op_package_info.custom_op_name
+        if custom_op_name in seen_custom_ops:
+            continue
+        seen_custom_ops.add(custom_op_name)
+        node_visitors[custom_op_name] = CustomOp(
+            op_package_info, node_to_external_map, edge_program, enable_tensor_dump
+        )
+    return node_visitors
@@ -9,7 +9,8 @@
 
 import torch
 
-from .node_visitor import NodeVisitor, register_node_visitor
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
 from .qnn_constants import OpElementWiseAbs, QNN_OP_PACKAGE_NAME_QTI_AISW
 
 
 
@@ -11,7 +11,8 @@
 
 import torch
 
-from .node_visitor import NodeVisitor, register_node_visitor
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
 from .qnn_constants import OpPoolAvg2d, QNN_OP_PACKAGE_NAME_QTI_AISW
 
 
 
@@ -9,7 +9,8 @@
 
 import torch
 
-from .node_visitor import NodeVisitor, register_node_visitor
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
 from .qnn_constants import OpElementWiseAdd, QNN_OP_PACKAGE_NAME_QTI_AISW
 
 
 
@@ -12,7 +12,8 @@
 import torch
 from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA
 
-from .node_visitor import NodeVisitor, register_node_visitor
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
 from .qnn_constants import OpReduceMax, QNN_OP_PACKAGE_NAME_QTI_AISW
 
 
 
@@ -9,7 +9,8 @@
 
 import torch
 
-from .node_visitor import NodeVisitor, register_node_visitor
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
 from .qnn_constants import OpElementWiseAnd, QNN_OP_PACKAGE_NAME_QTI_AISW
 
 
 
@@ -9,7 +9,8 @@
 
 import torch
 
-from .node_visitor import NodeVisitor, register_node_visitor
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
 
 
 @register_node_visitor
 
@@ -10,7 +10,8 @@
 import torch
 from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA
 
-from .node_visitor import NodeVisitor, register_node_visitor
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
 from .qnn_constants import OpArgmin, QNN_OP_PACKAGE_NAME_QTI_AISW
 
 
 
@@ -12,7 +12,8 @@
 import torch
 from executorch.backends.qualcomm.utils.constants import QCOM_DATA
 
-from .node_visitor import NodeVisitor, register_node_visitor
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
 from .qnn_constants import OpPoolAvg2d, QNN_OP_PACKAGE_NAME_QTI_AISW
 
 
 
@@ -18,7 +18,8 @@
 )
 from executorch.exir.dialects._ops import ops as exir_ops
 
-from .node_visitor import NodeVisitor, register_node_visitor
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
 from .qnn_constants import OpBatchnorm, QNN_OP_PACKAGE_NAME_QTI_AISW
 from .utils import get_parameter
Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,8 @@`
`18`	`18`	`)`
`19`	`19`	`from executorch.exir.dialects._ops import ops as exir_ops`
`20`	`20`
`21`		`-from .node_visitor import NodeVisitor, register_node_visitor`
	`21`	`+from .node_visitor import NodeVisitor`
	`22`	`+from .node_visitor_manager import register_node_visitor`
`22`	`23`	`from .qnn_constants import OpBatchnorm, QNN_OP_PACKAGE_NAME_QTI_AISW`
`23`	`24`	`from .utils import get_parameter`
`24`	`25`