[ET-VK][int4] Wrap int4 linear calls with view_copy nodes to squeeze/unsqueeze inputs

Nathanael See · Nathanael See · commit 13da2d56c92e · 2025-02-05T16:56:03.000-08:00
Pull Request resolved: #8226 Squeezing the input and unsqueezing the output is done automatically for full-precision linear/mm nodes in the graph at torch.export graph tracing time, but it is not done for the int4 linear op. The new pass wraps each int4 linear call with view_copy nodes rather than squeeze/unsqueeze nodes directly, because subsequent passes can fuse redundant view_copy nodes and convert the remaining view_copy nodes to squeeze/unsqueeze nodes. ghstack-source-id: 264952606 @exported-using-ghexport Differential Revision: [D69065866](https://our.internmc.facebook.com/intern/diff/D69065866/)
diff --git a/backends/vulkan/_passes/TARGETS b/backends/vulkan/_passes/TARGETS
@@ -30,6 +30,21 @@ runtime.python_library(
     ]
 )
 
+runtime.python_library(
+    name = "squeeze_int4_linear_inputs",
+    srcs = [
+        "squeeze_int4_linear_inputs.py",
+    ],
+    visibility = [
+        "//executorch/backends/...",
+    ],
+    deps = [
+        "//executorch/backends/vulkan:custom_ops_lib",
+        "//executorch/exir:pass_base",
+        "//executorch/exir/dialects:lib",
+    ]
+)
+
 runtime.python_library(
     name = "remove_asserts",
     srcs = ["remove_asserts.py"],
@@ -99,6 +114,7 @@ runtime.python_library(
         ":remove_asserts",
         ":remove_local_scalar_dense",
         ":remove_redundant_ops",
-        ":tag_memory_meta_pass"
+        ":squeeze_int4_linear_inputs",
+        ":tag_memory_meta_pass",
     ]
 )
diff --git a/backends/vulkan/_passes/__init__.py b/backends/vulkan/_passes/__init__.py
@@ -1,3 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
 from executorch.backends.vulkan._passes.insert_prepack_nodes import insert_prepack_nodes
 from executorch.backends.vulkan._passes.int4_weight_only_quantizer import (
     VkInt4WeightOnlyQuantizer,
@@ -12,6 +20,9 @@
 from executorch.backends.vulkan._passes.remove_redundant_ops import (
     RemoveRedundantOpsTransform,
 )
+from executorch.backends.vulkan._passes.squeeze_int4_linear_inputs import (
+    SqueezeInt4LinearInputs,
+)
 from executorch.backends.vulkan._passes.tag_memory_meta_pass import TagMemoryMetaPass
 
 __all__ = [
@@ -21,5 +32,6 @@
     "RemoveAssertsTransform",
     "RemoveLocalScalarDenseOpsTransform",
     "RemoveRedundantOpsTransform",
+    "SqueezeInt4LinearInputs",
     "TagMemoryMetaPass",
 ]
diff --git a/backends/vulkan/_passes/squeeze_int4_linear_inputs.py b/backends/vulkan/_passes/squeeze_int4_linear_inputs.py
@@ -0,0 +1,87 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+from typing import Dict, List, Tuple
+
+import executorch.backends.vulkan.custom_ops_lib  # noqa: needed to access vk op
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
+
+from torch.fx.node import Argument
+
+
+def _squeeze_batch_dims(shape: List[int]) -> List[int]:
+    """
+    Return `shape` with size-1 dims dropped, front to back, until it is rank 2.
+
+    Only the dims before the last one are candidates, so the innermost dim
+    (presumably the feature dim the linear op contracts over) is never removed,
+    even when it happens to be 1. Shapes of rank 2 or lower are returned as-is.
+    """
+    surplus = len(shape) - 2
+    if surplus <= 0:
+        return list(shape)
+
+    *batch_dims, last_dim = shape
+    kept: List[int] = []
+    for dim in batch_dims:
+        if surplus > 0 and dim == 1:
+            surplus -= 1
+        else:
+            kept.append(dim)
+    kept.append(last_dim)
+    return kept
+
+
+class SqueezeInt4LinearInputs(ExportPass):
+    """
+    Wrap `et_vk.linear_weight_int4` calls whose input has rank > 2 and at least
+    one size-1 dim before the last dim with `view_copy` nodes: one that squeezes
+    the input ahead of the linear op and one that restores the output shape.
+    """
+
+    def call_operator(
+        self,
+        op,  # pyre-ignore
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        if op != exir_ops.edge.et_vk.linear_weight_int4.default:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # pyre-ignore[16]: `None` has no attribute `node`
+        input_shape = list(args[0].node.meta["val"].shape)
+        squeeze_shape = _squeeze_batch_dims(input_shape)
+        if len(squeeze_shape) == len(input_shape):
+            # Nothing to squeeze; leave the op untouched.
+            return super().call_operator(op, args, kwargs, meta)
+
+        # squeeze input tensor; the linear op's kwargs belong to the linear op
+        # only and must not be forwarded to view_copy
+        squeeze_out = super().call_operator(
+            exir_ops.edge.aten.view_copy.default,
+            (args[0], squeeze_shape),
+            {},
+            meta,
+        )
+        # call linear on squeezed output
+        linear_out = super().call_operator(
+            op,
+            (squeeze_out, *args[1:]),
+            kwargs,
+            meta,
+        )
+        # unsqueeze output back to the shape recorded for the original node
+        unsqueeze_shape = list(meta["val"].shape)
+        return super().call_operator(
+            exir_ops.edge.aten.view_copy.default,
+            (linear_out, unsqueeze_shape),
+            {},
+            meta,
+        )
diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl
@@ -328,6 +328,7 @@ def define_common_targets(is_fbcode = False):
                 "//executorch/backends/transforms:fuse_dequant_linear",
                 "//executorch/backends/transforms:fuse_view_copy",
                 "//executorch/backends/transforms:remove_clone_ops",
+                "//executorch/backends/transforms:view_copy_to_squeeze_unsqueeze",
                 "//executorch/backends/vulkan/_passes:vulkan_passes",
                 "//executorch/backends/vulkan/serialization:lib",
                 "//executorch/exir/backend:backend_details",
diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py
@@ -19,10 +19,14 @@
 from executorch.backends.transforms.fuse_conv_with_clamp import FuseClampPass
 from executorch.backends.transforms.fuse_dequant_linear import FuseDequantLinearPass
 from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
+from executorch.backends.transforms.view_copy_to_squeeze_unsqueeze import (
+    ViewCopyToSqueezeUnsqueezePass,
+)
 from executorch.backends.vulkan._passes import (
     insert_prepack_nodes,
     RemoveLocalScalarDenseOpsTransform,
     RemoveRedundantOpsTransform,
+    SqueezeInt4LinearInputs,
     TagMemoryMetaPass,
 )
 
@@ -149,7 +153,9 @@ def preprocess(  # noqa: C901
                 RemoveRedundantOpsTransform(),
                 AddmmToLinearTransform(),
                 FuseDequantLinearPass(),
+                SqueezeInt4LinearInputs(),
                 FuseViewCopyTransform(),
+                ViewCopyToSqueezeUnsqueezePass(),
                 FuseBatchNormWithConvPass(program),
                 FuseClampPass(),
             ],