pytorch · SS-JIA · Mar 18, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py
@@ -8,7 +8,6 @@
 
 import executorch.backends.vulkan.patterns as vk_patterns
 import torch.library
-
 from torch._subclasses.fake_tensor import FakeTensor
 
 namespace = "et_vk"
@@ -259,7 +258,7 @@ def linear_q4gsw(
         weights, [1, group_size], weight_scales, weight_zeros, torch.int8, -8, 7
     )
 
-    out = torch.nn.functional.linear(x, weights)
+    out = torch.nn.functional.linear(x, weights, bias)
     return out
 
 
@@ -273,7 +272,7 @@ def linear_dq8ca_q4gsw(
     group_size: int,
     bias: Optional[torch.Tensor] = None,
 ):
-    return linear_q4gsw(x, weights, weight_scales, group_size)
+    return linear_q4gsw(x, weights, weight_scales, group_size, bias)
 
 
 name = "linear_q4gsw"

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
@@ -159,6 +159,8 @@ def update_features_impl(op: OpKey):
         torch.ops.aten.sym_size.int,
         operator.add,
         operator.sub,
+        operator.floordiv,
+        operator.mul,
         operator.lt,
         operator.gt,
         operator.ge,
@@ -279,6 +281,26 @@ def register_bitwise_and():
     )
 
 
+@update_features(exir_ops.edge.aten.bitwise_not.default)
+def register_bitwise_not():
+    return OpFeatures(
+        inputs_storage=utils.ANY_STORAGE,
+        inputs_dtypes=utils.BOOL_T,
+        supports_resize=True,
+        supports_highdim=True,
+    )
+
+
+@update_features(exir_ops.edge.aten.logical_and.default)
+def register_logical_and():
+    return OpFeatures(
+        inputs_storage=utils.ANY_STORAGE,
+        inputs_dtypes=utils.BOOL_T,
+        supports_resize=True,
+        supports_highdim=True,
+    )
+
+
 # =============================================================================
 # BinaryScalarOp.cpp
 # =============================================================================
@@ -301,16 +323,22 @@ def register_pow_tensor_scalar():
 
 @update_features(exir_ops.edge.aten._to_copy.default)
 def register_to_copy():
-    def check_to_copy_node(node: torch.fx.Node) -> bool:
-        # Only single-arg _to_copy is supported
-        return len(node.args) == 1
+    def pick_to_copy_storage(
+        node: torch.fx.Node,
+    ) -> Tuple[utils.TensorRepSet, utils.TensorRepSet]:
+        in_dtype = node.args[0].meta["val"].dtype  # type: ignore[union-attr]
+        out_dtype = node.meta["val"].dtype
+        fp_types = {torch.float16, torch.float32}
+        if in_dtype in fp_types and out_dtype in fp_types:
+            return utils.ANY_STORAGE, utils.ANY_STORAGE
+        return utils.CONTIGUOUS_BUFFER, utils.CONTIGUOUS_BUFFER
 
     return OpFeatures(
         inputs_storage=utils.ANY_STORAGE,
-        inputs_dtypes=utils.FP_INT_T,
-        outputs_dtypes=utils.FP_INT_T,
+        inputs_dtypes=utils.FP_INT_BOOL_T,
+        outputs_dtypes=utils.FP_INT_BOOL_T,
         supports_resize=True,
-        are_node_inputs_supported_fn=check_to_copy_node,
+        pick_io_storage_fn=pick_to_copy_storage,
     )
 
 
@@ -1336,6 +1364,7 @@ def register_scalar_tensor():
     return OpFeatures(
         inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
         inputs_dtypes=utils.FP_INT_T,
+        supports_resize=True,
     )
 
 

diff --git a/backends/vulkan/patterns/quantized_linear.py b/backends/vulkan/patterns/quantized_linear.py
@@ -5,28 +5,22 @@
 # LICENSE file in the root directory of this source tree.
 
 import operator
-
 from typing import Optional
 
 import executorch.backends.vulkan.utils as utils
-
 import torch
 import torch.nn.functional as F
-
 from executorch.backends.transforms.utils import (
     create_constant_placeholder,
     get_param_tensor,
 )
-
 from executorch.backends.vulkan.patterns.pattern_registry import (
     PatternMatch,
     register_pattern_detector,
     register_pattern_replacement,
 )
-
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
-
 from torch.export.graph_signature import InputKind
 
 
@@ -398,6 +392,12 @@ def make_linear_q4gsw_op(
         force_update=True,
     )
 
+    # Pad bias to multiple of 4 if present
+    if match.bias_node is not None:
+        bias_tensor = get_param_tensor(ep, match.bias_node)
+        if bias_tensor is not None:
+            utils.align_width_and_update_state_dict(ep, match.bias_node, bias_tensor)
+
     with graph_module.graph.inserting_before(match.output_node):
         linear_q4gsw_node = graph_module.graph.create_node(
             "call_function",
@@ -407,6 +407,7 @@ def make_linear_q4gsw_op(
                 match.weight_node,
                 match.weight_scales_node,
                 group_size,
+                match.bias_node,
             ),
         )
 
@@ -445,6 +446,12 @@ def make_linear_dq8ca_q4gsw_op(
         force_update=True,
     )
 
+    # Pad bias to multiple of 4 if present
+    if match.bias_node is not None:
+        bias_tensor = get_param_tensor(ep, match.bias_node)
+        if bias_tensor is not None:
+            utils.align_width_and_update_state_dict(ep, match.bias_node, bias_tensor)
+
     first_graph_node = list(graph_module.graph.nodes)[0]
     with graph_module.graph.inserting_before(first_graph_node):
         weight_tensor_name = utils.get_tensor_name(ep, match.weight_node)
@@ -474,6 +481,7 @@ def make_linear_dq8ca_q4gsw_op(
                 weight_sums_node,
                 match.weight_scales_node,
                 group_size,
+                match.bias_node,
             ),
         )
 
@@ -538,6 +546,7 @@ def make_linear_q8ta_q8csw_custom_op(
                 match.weight_node,
                 weight_sums_node,
                 match.weight_scales_node,
+                match.bias_node,
             ),
         )
 
@@ -637,7 +646,6 @@ def replace_quantized_linear_patterns(
     assert weight_zeros_tensor is not None
 
     # Route to appropriate custom op.
-    # q8ta_linear supports bias, so check it first before the bias guard.
     if (
         match.is_input_static_per_tensor_quantized()
         and match.is_weight_perchannel_quantized()
@@ -646,10 +654,6 @@ def replace_quantized_linear_patterns(
         make_q8ta_linear_custom_op(ep, graph_module, match, weight_tensor)
         return
 
-    # Remaining ops do not support bias
-    if match.bias_node is not None:
-        return
-
     if (
         match.is_weight_only_quantized()
         and match.is_weight_pergroup_quantized()

diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl
@@ -144,5 +144,11 @@ void main() {
         group_size);
   }
 
+  if (apply_bias > 0) {
+    FPPerOutChannelParams bias_tile;
+    load_bias_tile(bias_tile, n4);
+    add_bias_to_out_tile(out_tile, bias_tile);
+  }
+
   write_output_tile_with_checks(out_tile, n4, m, N4, M);
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh
@@ -73,6 +73,16 @@ void apply_weight_scales_and_biases(
   }
 }
 
+void add_bias_to_out_tile(
+    inout FPOutTile tile,
+    const FPPerOutChannelParams bias) {
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+      tile.data[m][n4] = tile.data[m][n4] + bias.data[n4];
+    }
+  }
+}
+
 void accumulate_out_tile_with_out_tile(
     inout FPOutTile accum,
     const FPOutTile other) {

diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl
@@ -142,6 +142,11 @@ void main() {
   // Only the first thread will write out result
   if (lid == 0) {
     out_tile = partial_sums[0];
+    if (apply_bias > 0) {
+      FPPerOutChannelParams bias_tile;
+      load_bias_tile(bias_tile, n4);
+      add_bias_to_out_tile(out_tile, bias_tile);
+    }
     write_output_tile_with_checks(out_tile, n4, 0, N4, 1);
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl
@@ -110,5 +110,11 @@ void main() {
     }
   }
 
+  if (apply_bias > 0) {
+    FPPerOutChannelParams bias_tile;
+    load_bias_tile(bias_tile, n4);
+    add_bias_to_out_tile(out_tile, bias_tile);
+  }
+
   write_output_tile_with_checks(out_tile, n4, m, N4, M);
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml
@@ -46,3 +46,6 @@ unary_op:
       OPERATOR: leaky_relu(X, A)
     - NAME: round
       OPERATOR: round(X)
+    - NAME: bitwise_not_uint8
+      OPERATOR: 1 - X
+      DTYPE: uint8
diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
@@ -214,6 +214,7 @@ REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.gt.Tensor, gt);
   VK_REGISTER_OP(aten.ge.Tensor, ge);
   VK_REGISTER_OP(aten.bitwise_and.Tensor, bitwise_and);
+  VK_REGISTER_OP(aten.logical_and.default, bitwise_and);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp
@@ -9,52 +9,13 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
 
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
-#include <executorch/backends/vulkan/runtime/utils/StorageUtils.h>
-
 namespace vkcompute {
 
-using utils::GPUMemoryLayout;
-using utils::StorageType;
-
-void resize_split_node(
-    ComputeGraph* graph,
-    const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& resize_args) {
-  (void)resize_args;
-  const ValueRef input = args.at(0).refs.at(0);
-  const ValueRef split_sizes_ref = args.at(1).refs.at(0);
-  const ValueRef dim_ref = args.at(2).refs.at(0);
-  const ValueRef out_list_ref = args.at(3).refs.at(0);
-
-  const ValueListPtr out_list = graph->get_value_list(out_list_ref);
-  const std::vector<int64_t> split_sizes =
-      *(graph->get_int_list(split_sizes_ref));
-  const int64_t dim = graph->extract_scalar<int64_t>(dim_ref);
-
-  const int64_t input_ndim = graph->dim_of(input);
-  const DimIndex dim_index = dim < 0 ? static_cast<DimIndex>(dim)
-                                     : static_cast<DimIndex>(dim - input_ndim);
-
-  std::vector<int64_t> input_sizes = graph->sizes_of(input);
-
-  for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) {
-    const int64_t split_size = split_sizes.at(split_idx);
-    const ValueRef out_ref = out_list->at(split_idx);
-
-    std::vector<int64_t> out_sizes = input_sizes;
-    out_sizes.at(dim_index) = split_size;
-
-    graph->virtual_resize(out_ref, out_sizes);
-  }
-}
-
 void add_split_node(
     ComputeGraph& graph,
     const ValueRef input,
@@ -125,7 +86,8 @@ void split_with_sizes_copy_default(
   ValueRef out_list_ref = args[3];
 
   int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
-  std::vector<int64_t> split_sizes = *(graph.get_int_list(split_sizes_ref));
+  std::vector<int64_t> split_sizes =
+      graph.extract_int_or_symint_list(split_sizes_ref);
 
   add_split_with_sizes_node(graph, input, split_sizes, dim, out_list_ref);
 }