Commit b41626f

Remove redundancy pass & add dequant const param check
1 parent 6cc2264

4 files changed: +13 -6 lines


backends/qualcomm/passes/annotate_quant_attrs.py

Lines changed: 5 additions & 2 deletions
@@ -94,9 +94,11 @@ def _dequant_fold_params(self, n, quant_attrs, param):
     def _annotate_quant_attrs(
         self, graph_module: torch.fx.GraphModule
     ) -> torch.fx.GraphModule:
+        # Keep track of const params that have been dequantized, so a param
+        # with more than one user does not get dequantized multiple times
+        visited_const_param = set()
         for n in graph_module.graph.nodes:
             self._annotate_requant(n)
-
             # With fold_quant enabled, check if the input of dq op is quantized param.
             param = None
             if n.target in dq_ops:
@@ -106,7 +108,8 @@ def _annotate_quant_attrs(
             quant_attrs = get_quant_attrs(self.edge_program, n)
             self._annotate_source_nodes(n, quant_attrs)
 
-            if param is not None:
+            if param is not None and n.args[0] not in visited_const_param:
+                visited_const_param.add(n.args[0])
                 self._dequant_fold_params(n, quant_attrs, param)
 
         return graph_module
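
To make the dedup idea concrete, here is a minimal, self-contained sketch of the pattern (hypothetical dict-based nodes and a stand-in fold function, not the ExecuTorch API): a const param shared by several dequant users is folded only on its first visit.

# Hypothetical illustration of the visited-set pattern; names are made up for the sketch.
def fold_shared_params(dq_nodes, fold_fn):
    visited_const_param = set()
    for n in dq_nodes:
        param = n["param"]  # assumed: each dq node carries a reference to its const param
        if param is not None and param not in visited_const_param:
            visited_const_param.add(param)
            fold_fn(n, param)  # runs once per param, even when the param has multiple users

folded = []
fold_shared_params(
    [{"param": "weight_a"}, {"param": "weight_a"}, {"param": "bias_b"}],
    lambda n, p: folded.append(p),
)
print(folded)  # ['weight_a', 'bias_b'] -- the shared param is folded only once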

backends/qualcomm/passes/recompose_pixel_unshuffle.py

Lines changed: 7 additions & 1 deletion
@@ -35,7 +35,13 @@ def call(self, graph_module: torch.fx.GraphModule):
         for node in graph.nodes:
             if node.op == "call_function" and node.target == self.reshape_target:
                 with graph.inserting_after(node):
-                    premute_node = node.args[0]
+
+                    # A clone op still exists between permute and reshape_target during
+                    # quantization, so check args[0].args[0] to get the permute node
+                    if self.quantization_capture:
+                        premute_node = node.args[0].args[0]
+                    else:
+                        premute_node = node.args[0]
                     if any(
                         [
                             len(node.args[1]) != 4,
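
To illustrate the graph shapes this branch distinguishes, here is a rough, self-contained sketch with hypothetical dict-based nodes (not real torch.fx nodes): during quantization capture a clone op sits between the permute and the reshape target, so the permute is reached via args[0].args[0].

# Hypothetical node structures; only the argument chain matters for the sketch.
reshape_float = {"target": "reshape", "args": ({"target": "permute"},)}
reshape_quant = {
    "target": "reshape",
    "args": ({"target": "clone", "args": ({"target": "permute"},)},),
}

def find_permute(node, quantization_capture):
    # Mirrors the logic above: hop over the clone only in quantization capture mode.
    if quantization_capture:
        return node["args"][0]["args"][0]
    return node["args"][0]

assert find_permute(reshape_float, False)["target"] == "permute"
assert find_permute(reshape_quant, True)["target"] == "permute"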

backends/qualcomm/quantizer/quantizer.py

Lines changed: 0 additions & 2 deletions
@@ -12,7 +12,6 @@
     RecomposePixelUnshuffle,
 )
 from executorch.backends.qualcomm.passes.reduce_dynamic_range import ReduceDynamicRange
-from executorch.backends.qualcomm.passes.remove_redundancy import RemoveRedundancy
 from executorch.backends.qualcomm.passes.replace_inf_buffer import ReplaceInfBuffer
 from executorch.backends.transforms.decompose_sdpa import (
     DecomposeScaledDotProductAttention,
@@ -182,7 +181,6 @@ def set_per_channel_linear_quant(self, enable: bool) -> None:
         self._update_per_channel_weight_quant_ops(linear_ops, enable)
 
     def transform_for_annotation(self, model: GraphModule) -> GraphModule:
-        model = RemoveRedundancy()(model).graph_module
         model = ReduceDynamicRange()(model).graph_module
         model = RecomposePixelUnshuffle(quantization_capture=True)(model).graph_module
         model = DecomposeScaledDotProductAttention()(model).graph_module

examples/qualcomm/scripts/utils.py

Lines changed: 1 addition & 1 deletion
@@ -187,6 +187,7 @@ def build_executorch_binary(
         quantizer = QnnQuantizer()
         quantizer.add_custom_quant_annotations(custom_annotations)
         quantizer.set_per_channel_linear_quant(per_channel_linear)
+        quantizer.set_per_channel_conv_quant(True)
 
         if quant_dtype == QuantDtype.use_8a8w:
             pass  # default setting
@@ -214,7 +215,6 @@ def build_executorch_binary(
         for data in dataset:
             annotated_model(*data)
         quantized_model = convert_pt2e(annotated_model)
-
         edge_prog = capture_program(quantized_model, inputs)
     else:
         edge_prog = capture_program(model, inputs)
