Qualcomm AI Engine Direct - oss model enablement (EfficientSAM)

DannyYuyang-quic · DannyYuyang-quic · commit 42412d7435a6 · 2025-04-10T14:29:34.000+08:00
- Add e2e script for https://github.com/yformer/EfficientSAM
- Fix FastViT breakage
- Add support for cum_sum
- Add bicubic interpolate transform pass (bicubic is converted to bilinear)
- Fix stack op (negative dim is now normalized against the output rank)
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
@@ -9,6 +9,7 @@
 from .annotate_unbind import AnnotateUnbind
 from .convert_bmm_to_matmul import ConvertBmmToMatmul
 from .convert_conv1d_to_conv2d import ConvertConv1dToConv2d
+from .convert_upsample_bicubic2d import ConvertUpsampleBicubicWithBilinear
 from .decompose_any import DecomposeAny
 from .decompose_einsum import DecomposeEinsum
 from .decompose_expm1 import DecomposeExpM1
@@ -40,6 +41,7 @@
     ConvertBmmToMatmul,
     ConvertConv1dToConv2d,
     DecomposeAny,
+    ConvertUpsampleBicubicWithBilinear,
     DecomposeEinsum,
     DecomposeExpM1,
     DecomposeLinalgVectorNorm,
diff --git a/backends/qualcomm/_passes/convert_upsample_bicubic2d.py b/backends/qualcomm/_passes/convert_upsample_bicubic2d.py
@@ -0,0 +1,27 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+class ConvertUpsampleBicubicWithBilinear(ExportPass):
+    """
+    Rewrite bicubic upsampling as bilinear upsampling.
+
+    Qnn does not support bicubic interpolation, so every operator listed in
+    `bicubic_op_targets` is re-emitted as `upsample_bilinear_op`. Any other
+    operator is forwarded unchanged.
+    """
+
+    # Edge-dialect overloads replaced by this pass.
+    bicubic_op_targets = {
+        exir_ops.edge.aten.upsample_bicubic2d.vec,
+    }
+    # Operator emitted in place of each bicubic target.
+    upsample_bilinear_op = exir_ops.edge.aten.upsample_bilinear2d.default
+
+    def __init__(self):
+        super().__init__()
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op in self.bicubic_op_targets:
+            # The trailing positional argument is dropped before re-emitting.
+            # NOTE(review): presumably this is `scale_factors` of the `.vec`
+            # overload, which the `.default` bilinear overload does not take
+            # in that form - confirm against the aten schema.
+            bilinear_args = args[:-1]
+            return super().call_operator(
+                self.upsample_bilinear_op, bilinear_args, kwargs, meta
+            )
+        return super().call_operator(op, args, kwargs, meta)
diff --git a/backends/qualcomm/_passes/layout_transform.py b/backends/qualcomm/_passes/layout_transform.py
@@ -55,6 +55,7 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.ceil.default,
         exir_ops.edge.aten.clamp.default,
         exir_ops.edge.aten.constant_pad_nd.default,
+        exir_ops.edge.aten.cumsum.default,
         exir_ops.edge.aten.div.Tensor,
         exir_ops.edge.aten.elu.default,
         exir_ops.edge.aten.eq.Tensor,
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -14,6 +14,7 @@
     AnnotateUnbind,
     ConvertBmmToMatmul,
     ConvertConv1dToConv2d,
+    ConvertUpsampleBicubicWithBilinear,
     DecomposeAny,
     DecomposeEinsum,
     DecomposeExpM1,
@@ -74,6 +75,7 @@ def get_capture_program_passes():
         (AnnotateUnbind, True),
         (ConvertBmmToMatmul, True),
         (ConvertConv1dToConv2d, True),
+        (ConvertUpsampleBicubicWithBilinear, False),
         (DecomposeAny, True),
         (ExpandBroadcastTensorShape, False),
         (FixedLinearKeepDim, True),
diff --git a/backends/qualcomm/_passes/recompose_pixel_unshuffle.py b/backends/qualcomm/_passes/recompose_pixel_unshuffle.py
@@ -45,13 +45,11 @@ def call(self, graph_module: torch.fx.GraphModule):
                         continue
 
                     view_node = premute_node.args[0]
-                    if any(
-                        [
-                            view_node.op != "call_function",
-                            view_node.target != self.view_target,
-                            len(view_node.args[1]) != 6,
-                            len(premute_node.args[1]) != 6,
-                        ]
+                    if (
+                        view_node.op != "call_function"
+                        or view_node.target != self.view_target
+                        or len(view_node.args[1]) != 6
+                        or len(premute_node.args[1]) != 6
                     ):
                         continue
 
diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py
@@ -78,6 +78,7 @@ def get_passes_dependency_for_capture_program():
         AnnotateUnbind,
         ConvertBmmToMatmul,
         ConvertConv1dToConv2d,
+        ConvertUpsampleBicubicWithBilinear,
         DecomposeAny,
         DecomposeLinalgVectorNorm,
         ExpandBroadcastTensorShape,
@@ -96,18 +97,20 @@ def get_passes_dependency_for_capture_program():
         AnnotateQuantAttrs: [
             RecomposePixelUnshuffle,
             ConvertBmmToMatmul,
+            ConvertUpsampleBicubicWithBilinear,
             RemoveRedundancy,
         ],
         AnnotateStack: [RemoveRedundancy],
         AnnotateUnbind: [RemoveRedundancy],
         ConvertBmmToMatmul: [RecomposePixelUnshuffle],
         ConvertConv1dToConv2d: [FoldQDQ],
+        ConvertUpsampleBicubicWithBilinear: [RemoveRedundancy],
         DecomposeAny: [RemoveRedundancy],
         DecomposeLinalgVectorNorm: [RemoveRedundancy],
         ExpandBroadcastTensorShape: [RemoveRedundancy],
         FixedLinearKeepDim: [FoldQDQ],
         FoldQDQ: [AnnotateQuantAttrs, AnnotateStack, AnnotateUnbind],
-        I64toI32: [RemoveRedundancy],
+        I64toI32: [ConvertUpsampleBicubicWithBilinear, RemoveRedundancy],
         LayoutTransform: [
             AnnotateQuantAttrs,
             ConvertConv1dToConv2d,
diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py
@@ -21,6 +21,7 @@
     op_clamp,
     op_conv2d,
     op_cos,
+    op_cum_sum,
     op_depth_to_space,
     op_dequantize,
     op_div,
@@ -108,6 +109,7 @@
     op_clamp,
     op_conv2d,
     op_cos,
+    op_cum_sum,
     op_depth_to_space,
     op_dequantize,
     op_div,
diff --git a/backends/qualcomm/builders/op_cos.py b/backends/qualcomm/builders/op_cos.py
@@ -3,7 +3,6 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-
 from typing import Dict
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
diff --git a/backends/qualcomm/builders/op_cum_sum.py b/backends/qualcomm/builders/op_cum_sum.py
@@ -0,0 +1,84 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import cast, Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA
+
+from .node_visitor import NodeVisitor, register_node_visitor
+from .qnn_constants import OpCumulativeSum, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class CumulativeSum(NodeVisitor):
+    """
+    Node visitor that builds a QNN `OpCumulativeSum` op for
+    `aten.cumsum.default` nodes.
+    """
+
+    target = ["aten.cumsum.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def get_param(self, node, input_tensor):
+        """
+        Return the cumsum axis of `node` as an `np.uint32`.
+
+        Args:
+            node: cumsum node; `node.args[1]` holds the requested dim.
+            input_tensor: tensor whose rank is used to wrap a negative dim.
+
+        Returns:
+            The non-negative axis, remapped through
+            `node.meta[QCOM_AXIS_ORDER]` when that entry is present.
+        """
+        dim = cast(int, node.args[1])
+
+        if dim < 0:
+            dim = dim % len(input_tensor.shape)
+        if QCOM_AXIS_ORDER in node.meta:
+            # Translate the axis to its position in the recorded axis order.
+            dim = node.meta[QCOM_AXIS_ORDER].index(dim)
+
+        # typing.cast() does nothing at runtime, so construct the np.uint32
+        # explicitly to match the QNN_DATATYPE_UINT_32 declared for the axis.
+        return np.uint32(dim)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        """
+        Build the QNN op wrapper for a cumsum node.
+
+        Args:
+            node: the cumsum node; `node.args[0]` is its input node.
+            nodes_to_wrappers: mapping handed to `define_tensor` for both the
+                input and the output tensor.
+
+        Returns:
+            The op wrapper with one input, one output and the `axis`,
+            `exclusive` and `reverse` scalar parameters attached.
+        """
+        input_node = node.args[0]
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        dim = self.get_param(node, input_tensor)
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        cumsum_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpCumulativeSum.op_name,
+        )
+        cumsum_op.AddInputTensors([input_tensor_wrapper])
+        cumsum_op.AddOutputTensors([output_tensor_wrapper])
+        cumsum_op.AddScalarParam(
+            OpCumulativeSum.param_axis,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            {QCOM_DATA: dim},
+        )
+        # Both flags are fixed to False.
+        # NOTE(review): presumably this selects the inclusive, front-to-back
+        # accumulation that torch.cumsum performs - confirm against QNN docs.
+        cumsum_op.AddScalarParam(
+            OpCumulativeSum.param_exclusive,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8,
+            {QCOM_DATA: False},
+        )
+        cumsum_op.AddScalarParam(
+            OpCumulativeSum.param_reverse,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8,
+            {QCOM_DATA: False},
+        )
+
+        return cumsum_op
diff --git a/backends/qualcomm/builders/op_sin.py b/backends/qualcomm/builders/op_sin.py
@@ -3,7 +3,6 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-
 from typing import Dict
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
diff --git a/backends/qualcomm/builders/op_stack.py b/backends/qualcomm/builders/op_stack.py
@@ -51,7 +51,7 @@ def define_node(
 
         dim = 0 if len(node.args) == 1 else cast(int, node.args[1])
         if dim < 0:
-            dim = dim % len(input_tensor.shape)
+            dim = dim % len(output_tensor.shape)
         if QCOM_AXIS_ORDER in node.meta:
             dim = node.meta[QCOM_AXIS_ORDER].index(dim)
         stack_op = PyQnnWrapper.PyQnnOpWrapper(
diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py
@@ -57,6 +57,14 @@ class OpConvert:
     op_name: str = "Convert"
 
 
+@dataclass(init=False, frozen=True)
+class OpCumulativeSum:
+    """Op name and scalar-parameter keys used when building CumulativeSum."""
+
+    # Annotated like the neighbouring Op* classes so each entry is a real
+    # dataclass field rather than a plain class attribute.
+    op_name: str = "CumulativeSum"
+    param_axis: str = "axis"
+    param_exclusive: str = "exclusive"
+    param_reverse: str = "reverse"
+
+
 @dataclass(init=False, frozen=True)
 class OpDepthToSpace:
     op_name: str = "DepthToSpace"
diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py
@@ -13,6 +13,7 @@
     exir_ops.edge.aten.clone.default,
     exir_ops.edge.aten.slice_scatter.default,
     exir_ops.edge.aten.copy.default,
+    exir_ops.edge.aten.upsample_bicubic2d.vec,
     exir_ops.edge.quantized_decomposed.embedding_4bit.dtype,
 ]
 
diff --git a/backends/qualcomm/partition/utils.py b/backends/qualcomm/partition/utils.py
@@ -39,6 +39,7 @@ def get_skip_decomp_table() -> List[torch._ops.OperatorBase]:
         torch.ops.aten.rms_norm.default,
         torch.ops.aten._safe_softmax.default,
         torch.ops.aten.stack.default,
+        torch.ops.aten.upsample_bicubic2d.vec,
         # This request is ignored because it is in a blocklist. Refer to exir/program/_program.py
         # torch.ops.aten.unbind.int,
         torch.ops.pt2e_quant.quantize_affine.default,
diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py
@@ -976,6 +976,11 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None
     )
 
 
+@register_annotator([torch.ops.aten.cumsum.default])
+def annotate_cumsum(node: Node, quantization_config: QuantizationConfig) -> None:
+    """
+    Annotate an `aten.cumsum.default` node for quantization.
+
+    The work is delegated to `annotate_single_in_single_out` with the given
+    `quantization_config`.
+    """
+    annotate_single_in_single_out(node, quantization_config)
+
+
 @register_annotator([torch.ops.aten.linear.default])
 def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None:
     act_node = node.args[0]
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
@@ -568,6 +568,14 @@ def forward(self, x):
         return torch.cos(x)
 
 
+class CumSum(torch.nn.Module):
+    """Test model that returns the cumulative sum of its input along dim 0."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return torch.cumsum(x, dim=0)
+
+
 class Div(torch.nn.Module):
     def __init__(self):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -233,6 +233,11 @@ def test_qnn_backend_cos(self):
         sample_input = (torch.randn(2, 5, 1, 3),)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_cumsum(self):
+        """Lower `CumSum` with a random 4-element input and check the output."""
+        sample_input = (torch.randn(4),)
+        self.lower_module_and_test_output(
+            CumSum(),  # noqa: F405
+            sample_input,
+        )
+
     def test_qnn_backend_einsum_outer_product(self):
         module = EinsumOuterProduct()  # noqa: F405
         x = torch.randn(5)
@@ -1297,6 +1302,12 @@ def test_qnn_backend_cos(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_cumsum(self):
+        """Lower the QDQ form of `CumSum` with a random 4-element input."""
+        sample_input = (torch.randn(4),)
+        qdq_module = self.get_qdq_module(
+            CumSum(),  # noqa: F405
+            sample_input,
+        )
+        self.lower_module_and_test_output(qdq_module, sample_input)
+
     def test_qnn_backend_einsum_outer_product(self):
         module = EinsumOuterProduct()  # noqa: F405
         x = torch.randn(5)
@@ -3537,7 +3548,6 @@ def test_conv_former(self):
                 self.assertGreaterEqual(msg["top_1"], 60)
                 self.assertGreaterEqual(msg["top_5"], 80)
 
-    @unittest.skip("bicubic resize is not supported")
     def test_dino_v2(self):
         if not self.required_envs([self.image_dataset]):
             self.skipTest("missing required envs")
@@ -3573,6 +3583,46 @@ def test_dino_v2(self):
                 self.assertGreaterEqual(msg["top_1"], 70)
                 self.assertGreaterEqual(msg["top_5"], 85)
 
+    def test_efficientSAM(self):
+        """
+        Run the EfficientSAM example script end to end and check its MIoU.
+
+        The script is started as a subprocess and reports back as a JSON
+        message over a `Listener` bound to `(self.ip, self.port)`. The test
+        is skipped when the image dataset, pretrained weight or OSS repo is
+        missing, fails when the message carries an "Error" entry, and
+        otherwise requires "MIoU" to be at least 0.55.
+        """
+        if not self.required_envs(
+            [self.image_dataset, self.pretrained_weight, self.oss_repo]
+        ):
+            self.skipTest("missing required envs")
+        cmds = [
+            "python",
+            # The script lives in its own `efficientSAM/` sub-directory.
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py",
+            "--dataset",
+            self.image_dataset,
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--device",
+            self.device,
+            "--model",
+            self.model,
+            "--oss_repo",
+            self.oss_repo,
+            "--pretrained_weight",
+            self.pretrained_weight,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+        ]
+        if self.host:
+            cmds.extend(["--host", self.host])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            # Wait for the script to finish before reading its result message.
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                self.assertGreaterEqual(msg["MIoU"], 0.55)
+
     def test_esrgan(self):
         if not self.required_envs():
             self.skipTest("missing required envs")
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
@@ -438,12 +438,14 @@ def lower_module_and_test_output(
         skip_node_id_set: set = None,
         skip_node_op_set: set = None,
         dynamic_shapes: Dict = None,
+        passes_job: collections.OrderedDict = None,
     ):
         delegated_program = to_edge_transform_and_lower_to_qnn(
             module,
             sample_inputs,
             self.compiler_specs,
             dynamic_shapes=dynamic_shapes,
+            passes_job=passes_job,
             skip_node_id_set=skip_node_id_set,
             skip_node_op_set=skip_node_op_set,
         )
diff --git a/examples/qualcomm/oss_scripts/dino_v2.py b/examples/qualcomm/oss_scripts/dino_v2.py
diff --git a/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py b/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py
diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/__init__.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/__init__.py
diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/mask_decoder.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/mask_decoder.py
diff --git a/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/pos_emb.py b/examples/qualcomm/oss_scripts/efficientSAM/source_transformation/pos_emb.py
diff --git a/examples/qualcomm/oss_scripts/fastvit.py b/examples/qualcomm/oss_scripts/fastvit.py
diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py

Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,6 @@`
`3`	`3`	`#`
`4`	`4`	`# This source code is licensed under the BSD-style license found in the`
`5`	`5`	`# LICENSE file in the root directory of this source tree.`
`6`		`-`
`7`	`6`	`from typing import Dict`
`8`	`7`
`9`	`8`	`import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper`
Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@`
`13`	`13`	`exir_ops.edge.aten.clone.default,`
`14`	`14`	`exir_ops.edge.aten.slice_scatter.default,`
`15`	`15`	`exir_ops.edge.aten.copy.default,`
	`16`	`+ exir_ops.edge.aten.upsample_bicubic2d.vec,`
`16`	`17`	`exir_ops.edge.quantized_decomposed.embedding_4bit.dtype,`
`17`	`18`	`]`
`18`	`19`