pytorch
diff --git a/‎backends/vulkan/_passes/int4_weight_only_quantizer.py
Lines changed: 1 addition & 13 deletions b/‎backends/vulkan/_passes/int4_weight_only_quantizer.py
Lines changed: 1 addition & 13 deletions
diff --git a/‎backends/vulkan/_passes/squeeze_unsqueeze_inputs.py
Lines changed: 18 additions & 5 deletions b/‎backends/vulkan/_passes/squeeze_unsqueeze_inputs.py
Lines changed: 18 additions & 5 deletions
diff --git a/‎backends/vulkan/op_registry.py
Lines changed: 2 additions & 0 deletions b/‎backends/vulkan/op_registry.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/api/containers/Tensor.cpp
Lines changed: 1 addition & 3 deletions b/‎backends/vulkan/runtime/api/containers/Tensor.cpp
Lines changed: 1 addition & 3 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl
Lines changed: 136 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl
Lines changed: 136 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml
Lines changed: 13 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml
Lines changed: 13 additions & 0 deletions
@@ -118,9 +118,6 @@ def _vk_replace_linear_int4(
     # Use custom vulkan linear layer as default
     linear_class: Type[torch.nn.Module] = VkWeightOnlyInt4Linear,
     copy_weights: bool = False,
-    # Serves the same purpose as `tensor_dim_limit` in
-    # executorch.backends.vulkan.partitioner.VulkanSupportedOperators
-    feature_limit: int = 16384,
 ):
     for name, child in module.named_children():
         if isinstance(child, torch.nn.Linear) and (
@@ -131,8 +128,6 @@ def _vk_replace_linear_int4(
             if (
                 _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles)
                 or padding_allowed
-            ) and (
-                child.out_features < feature_limit and child.in_features < feature_limit
             ):
                 new_linear = linear_class(
                     child.in_features,
@@ -175,7 +170,6 @@ def __init__(
         inner_k_tiles: Optional[int] = 8,
         device: torch.device = torch.device("cpu"),  # noqa
         precision: torch.dtype = torch.float32,
-        feature_limit: int = 16384,
     ) -> None:
         super().__init__()
         assert inner_k_tiles in [2, 4, 8]
@@ -186,9 +180,6 @@ def __init__(
         self.padding_allowed: bool = padding_allowed
         self.device: torch.device = device
         self.precision: torch.dtype = precision
-        # Serves the same purpose as `tensor_dim_limit` in
-        # executorch.backends.vulkan.partitioner.VulkanSupportedOperators
-        self.feature_limit = feature_limit
 
     @torch.no_grad()
     def _create_quantized_state_dict(
@@ -197,10 +188,7 @@ def _create_quantized_state_dict(
         cur_state_dict = model.state_dict()
         for fqn, mod in model.named_modules():
             # Add additional check to make sure features do not exceed feature limit
-            if isinstance(mod, torch.nn.Linear) and (
-                mod.out_features < self.feature_limit
-                and mod.in_features < self.feature_limit
-            ):
+            if isinstance(mod, torch.nn.Linear):
                 out_features = mod.out_features
                 in_features = mod.in_features
                 logging.info(f"linear: {fqn}, in={in_features}, out={out_features}")
 
@@ -27,25 +27,38 @@ class SqueezeUnsqueezeInputs(ExportPass):
         exir_ops.edge.aten.gelu.default,
     }
 
+    def should_squeeze(self, op, shape: List[int]) -> bool:  # pyre-ignore
+        if len(shape) == 3:
+            return shape[1] == 1 and shape[0] > 1
+        if len(shape) == 4:
+            # No need to squeeze if all dims are 1 except the width dim
+            if all(dim == 1 for dim in shape[:-1]):
+                return False
+            # Otherwise, check for squeezable dim
+            return 1 in shape[:-1]
+
+        # Prefer not to introduce additional orchestration ops by default
+        return False
+
     def call_operator(
         self,
         op,  # pyre-ignore
         args: Tuple[Argument, ...],
         kwargs: Dict[str, Argument],
         meta: NodeMetadata,
     ) -> ProxyValue:
-        def _squeezable(shape: List[int]) -> bool:
-            return len(shape) > 2 and 1 in shape
-
         if op not in self._squeezable_ops:
             return super().call_operator(op, args, kwargs, meta)
-
         # pyre-ignore[16]: `None` has no attribute `node`
         input_shape = args[0].node.meta["val"].shape
         output_shape = meta["val"].shape
-        if not _squeezable(input_shape):
+
+        if not self.should_squeeze(op, input_shape):
             return super().call_operator(op, args, kwargs, meta)
 
+        def _squeezable(shape: List[int]) -> bool:
+            return len(shape) > 2 and 1 in shape
+
         # squeeze input tensor
         squeeze_shape = list(input_shape)
         while _squeezable(squeeze_shape):
 
@@ -393,6 +393,7 @@ def register_int8_mm_op(features: OpFeatures):
 
 @update_features(exir_ops.edge.et_vk.linear_weight_int4.default)
 def register_int4_mm_op(features: OpFeatures):
+    features.buffer_impl = True
     features.texture_impl = TextureImplFeatures(
         uses_axis_map=False,
         valid_packed_dims={PackedDim.WIDTH},
@@ -401,6 +402,7 @@ def register_int4_mm_op(features: OpFeatures):
     features.optimal_storage = VkStorageType.TEXTURE_3D
     features.optimal_layout = VkMemoryLayout.TENSOR_WIDTH_PACKED
     features.handles_own_prepacking = True
+    features.skip_limits_check = {1}
     return features
 
 
 
@@ -497,9 +497,7 @@ vTensor::vTensor(
   VK_CHECK_COND(
       dim_order_is_valid(dim_order_), "computed dim order is invalid");
 
-  if (storage_type != utils::kBuffer) {
-    set_logical_limits(storage_.image_extents_);
-  }
+  set_logical_limits(storage_.image_extents_);
 }
 
 // NOLINTNEXTLINE
 
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+${define_required_extensions("uint8")}
+${define_required_extensions("int8")}
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "w", "t_qmat2", "uint8", STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "nchw_4x2", "uint8", "buffer")}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 qmat2_sizes;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+uint8_t get_first(const uint8_t packed) {
+  return uint8_t((packed & 0xF0) >> 4);
+}
+
+uint8_t get_second(const uint8_t packed) {
+  return uint8_t(packed & 0x0F);
+}
+
+uint8_t combine(const uint8_t first, const uint8_t second) {
+  return uint8_t(first << 4 | second);
+}
+
+/*
+ * This shader packs the weight tensor into a texture (or into a buffer when
+ * STORAGE is "buffer").
+ *
+ * The original tensor has a (W, H) shape of (K / 2, N) and each scalar element
+ * is a uint8_t, which contains 2 packed 4 bit uint values.
+ *
+ * The transform performed by this shader is to first transpose the tensor, so
+ * the shape of the packed tensor becomes (N / 2, K). Then, the 4 bit integers
+ * are re-packed in groups of 8. For each 4 uint8_t values, the "left" 4-bits
+ * of each value contain the 0, 1, 2, 3 4-bit values, and the "right" 4-bits of
+ * each value contain the 4, 5, 6, 7 4-bit values.
+ *
+ * As a concrete example, consider the following weight tensor. The | demarks
+ * the packing boundary, so 1| 2 represents a single uint8_t value with 1 in the
+ * leftmost 4 bits and 2 in the rightmost 4 bits.
+ *
+ *  1| 2,  3| 4,  5| 6,  7| 8,
+ *  9|10, 11|12, 13|14, 15|16,
+ * 17|18, 19|20, 21|22, 23|24,
+ * 25|26, 27|28, 29|30, 31|32,
+ * 33|34, 35|36, 37|38, 39|40,
+ * 41|42, 43|44, 45|46, 47|48,
+ * 49|50, 51|52, 53|54, 55|56,
+ * 57|58, 59|60, 61|62, 63|64,
+ *
+ * After packing, the packed tensor would contain
+ *
+ *  1|33,  9|41, 17|49, 25|57,
+ *  2|34, 10|42, 18|50, 26|58,
+ *  3|35, 11|43, 19|51, 27|59,
+ *  4|36, 12|44, 20|52, 28|60,
+ *  5|37, 13|45, 21|53, 29|61,
+ *  6|38, 14|46, 22|54, 30|62,
+ *  7|39, 15|47, 23|55, 31|63,
+ *  8|40, 16|48, 24|56, 32|64,
+ *
+ * The purpose of interleaving is to make it easier to extract the unpacked
+ * values in order using the u8vec4 vectorized type. With the packing in place,
+ * the 4-bit values can be extracted via
+ *
+ * u8vec4 packed;
+ * u8vec4 vals_0123 = (packed & 0xF0) >> 4;
+ * u8vec4 vals_4567 = (packed & 0x0F);
+ */
+void main() {
+  // Each thread writes 2 output texels along the height axis
+  ivec2 packed_pos = ivec2(
+      gl_GlobalInvocationID.x,
+      gl_GlobalInvocationID.y << 1);
+
+  // The packed tensor is width packed
+  if ((packed_pos.x << 2) >= qmat2_sizes.x || packed_pos.y >= qmat2_sizes.y) {
+    return;
+  }
+
+  int out_col = packed_pos.x << 3;
+  int out_row = packed_pos.y;
+
+  int in_col = out_row;
+  int in_int8_col = in_col >> 1;
+  int in_row = out_col;
+
+  int in_numrows = qmat2_sizes.x << 1;
+  int in_numcols = qmat2_sizes.y;
+  int in_num_int8_cols = qmat2_sizes.y >> 1;
+
+  uint8_t in_vals[8][2];
+  for (int r = 0; r < 8; ++r) {
+    if (in_row + r < in_numrows) {
+      uint8_t in_val_packed = nchw_4x2[(in_row + r) * in_num_int8_cols + in_int8_col];
+      in_vals[r][0] = get_first(in_val_packed);
+      in_vals[r][1] = get_second(in_val_packed);
+    } else {
+      in_vals[r][0] = uint8_t(254);
+      in_vals[r][1] = uint8_t(254);
+    }
+  }
+
+  u8vec4 out_tex_1 = u8vec4(
+      combine(in_vals[0][0], in_vals[4][0]),
+      combine(in_vals[1][0], in_vals[5][0]),
+      combine(in_vals[2][0], in_vals[6][0]),
+      combine(in_vals[3][0], in_vals[7][0]));
+
+  u8vec4 out_tex_2 = u8vec4(
+      combine(in_vals[0][1], in_vals[4][1]),
+      combine(in_vals[1][1], in_vals[5][1]),
+      combine(in_vals[2][1], in_vals[6][1]),
+      combine(in_vals[3][1], in_vals[7][1]));
+
+  $if STORAGE == "buffer":
+    int stride = qmat2_sizes.x >> 2;
+    t_qmat2[packed_pos.y * stride + packed_pos.x] = out_tex_1;
+    t_qmat2[(packed_pos.y + 1) * stride + packed_pos.x] = out_tex_2;
+  $else:
+    imageStore(t_qmat2, ivec3(packed_pos.xy, 0), out_tex_1);
+    imageStore(t_qmat2, ivec3(packed_pos.x, packed_pos.y + 1, 0), out_tex_2);
+}
@@ -0,0 +1,13 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+pack_int4_linear_weight_transposed_interleaved:
+  parameter_names_with_default_values:
+    STORAGE: texture3d
+  shader_variants:
+    - NAME: pack_int4_linear_weight_transposed_interleaved_texture3d
+    - NAME: pack_int4_linear_weight_transposed_interleaved_buffer
+      STORAGE: buffer
Original file line number	Diff line number	Diff line change
`@@ -497,9 +497,7 @@ vTensor::vTensor(`
`497`	`497`	`VK_CHECK_COND(`
`498`	`498`	`dim_order_is_valid(dim_order_), "computed dim order is invalid");`
`499`	`499`
`500`		`- if (storage_type != utils::kBuffer) {`
`501`		`- set_logical_limits(storage_.image_extents_);`
`502`		`- }`
	`500`	`+ set_logical_limits(storage_.image_extents_);`
`503`	`501`	`}`
`504`	`502`
`505`	`503`	`// NOLINTNEXTLINE`