Skip to content

Commit ce21031

Browse files
SS-JIA (ssjia)
authored and committed
[ET-VK][qlinear] Add bias support to q4gsw and dq8ca_q4gsw quantized linear ops
Pull Request resolved: #18061 Wire bias through the q4gsw and dq8ca_q4gsw quantized linear operators. Add add_bias_to_out_tile() helper in the output tile computation header and call it from all three shader variants (tiled, coop, dq8ca_tiled). Remove the bias guard in the pattern matcher to allow biased linear layers. ghstack-source-id: 353546681 exported-using-ghexport Differential Revision: [D95970172](https://our.internmc.facebook.com/intern/diff/D95970172/)
1 parent d996fb6 commit ce21031

File tree

7 files changed

+49
-18
lines changed

7 files changed

+49
-18
lines changed

backends/vulkan/custom_ops_lib.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
import executorch.backends.vulkan.patterns as vk_patterns
1010
import torch.library
11-
1211
from torch._subclasses.fake_tensor import FakeTensor
1312

1413
namespace = "et_vk"
@@ -259,7 +258,7 @@ def linear_q4gsw(
259258
weights, [1, group_size], weight_scales, weight_zeros, torch.int8, -8, 7
260259
)
261260

262-
out = torch.nn.functional.linear(x, weights)
261+
out = torch.nn.functional.linear(x, weights, bias)
263262
return out
264263

265264

@@ -273,7 +272,7 @@ def linear_dq8ca_q4gsw(
273272
group_size: int,
274273
bias: Optional[torch.Tensor] = None,
275274
):
276-
return linear_q4gsw(x, weights, weight_scales, group_size)
275+
return linear_q4gsw(x, weights, weight_scales, group_size, bias)
277276

278277

279278
name = "linear_q4gsw"

backends/vulkan/patterns/quantized_linear.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,28 +5,22 @@
55
# LICENSE file in the root directory of this source tree.
66

77
import operator
8-
98
from typing import Optional
109

1110
import executorch.backends.vulkan.utils as utils
12-
1311
import torch
1412
import torch.nn.functional as F
15-
1613
from executorch.backends.transforms.utils import (
1714
create_constant_placeholder,
1815
get_param_tensor,
1916
)
20-
2117
from executorch.backends.vulkan.patterns.pattern_registry import (
2218
PatternMatch,
2319
register_pattern_detector,
2420
register_pattern_replacement,
2521
)
26-
2722
from executorch.exir import ExportedProgram
2823
from executorch.exir.dialects._ops import ops as exir_ops
29-
3024
from torch.export.graph_signature import InputKind
3125

3226

@@ -398,6 +392,12 @@ def make_linear_q4gsw_op(
398392
force_update=True,
399393
)
400394

395+
# Pad bias to multiple of 4 if present
396+
if match.bias_node is not None:
397+
bias_tensor = get_param_tensor(ep, match.bias_node)
398+
if bias_tensor is not None:
399+
utils.align_width_and_update_state_dict(ep, match.bias_node, bias_tensor)
400+
401401
with graph_module.graph.inserting_before(match.output_node):
402402
linear_q4gsw_node = graph_module.graph.create_node(
403403
"call_function",
@@ -407,6 +407,7 @@ def make_linear_q4gsw_op(
407407
match.weight_node,
408408
match.weight_scales_node,
409409
group_size,
410+
match.bias_node,
410411
),
411412
)
412413

@@ -445,6 +446,12 @@ def make_linear_dq8ca_q4gsw_op(
445446
force_update=True,
446447
)
447448

449+
# Pad bias to multiple of 4 if present
450+
if match.bias_node is not None:
451+
bias_tensor = get_param_tensor(ep, match.bias_node)
452+
if bias_tensor is not None:
453+
utils.align_width_and_update_state_dict(ep, match.bias_node, bias_tensor)
454+
448455
first_graph_node = list(graph_module.graph.nodes)[0]
449456
with graph_module.graph.inserting_before(first_graph_node):
450457
weight_tensor_name = utils.get_tensor_name(ep, match.weight_node)
@@ -474,6 +481,7 @@ def make_linear_dq8ca_q4gsw_op(
474481
weight_sums_node,
475482
match.weight_scales_node,
476483
group_size,
484+
match.bias_node,
477485
),
478486
)
479487

@@ -538,6 +546,7 @@ def make_linear_q8ta_q8csw_custom_op(
538546
match.weight_node,
539547
weight_sums_node,
540548
match.weight_scales_node,
549+
match.bias_node,
541550
),
542551
)
543552

@@ -637,7 +646,6 @@ def replace_quantized_linear_patterns(
637646
assert weight_zeros_tensor is not None
638647

639648
# Route to appropriate custom op.
640-
# q8ta_linear supports bias, so check it first before the bias guard.
641649
if (
642650
match.is_input_static_per_tensor_quantized()
643651
and match.is_weight_perchannel_quantized()
@@ -646,10 +654,6 @@ def replace_quantized_linear_patterns(
646654
make_q8ta_linear_custom_op(ep, graph_module, match, weight_tensor)
647655
return
648656

649-
# Remaining ops do not support bias
650-
if match.bias_node is not None:
651-
return
652-
653657
if (
654658
match.is_weight_only_quantized()
655659
and match.is_weight_pergroup_quantized()

backends/vulkan/runtime/graph/ops/glsl/linear_dq8ca_q4gsw_tiled.glsl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,5 +144,11 @@ void main() {
144144
group_size);
145145
}
146146

147+
if (apply_bias > 0) {
148+
FPPerOutChannelParams bias_tile;
149+
load_bias_tile(bias_tile, n4);
150+
add_bias_to_out_tile(out_tile, bias_tile);
151+
}
152+
147153
write_output_tile_with_checks(out_tile, n4, m, N4, M);
148154
}

backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,16 @@ void apply_weight_scales_and_biases(
7373
}
7474
}
7575

76+
void add_bias_to_out_tile(
77+
inout FPOutTile tile,
78+
const FPPerOutChannelParams bias) {
79+
[[unroll]] for (int m = 0; m < TILE_M; ++m) {
80+
[[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
81+
tile.data[m][n4] = tile.data[m][n4] + bias.data[n4];
82+
}
83+
}
84+
}
85+
7686
void accumulate_out_tile_with_out_tile(
7787
inout FPOutTile accum,
7888
const FPOutTile other) {

backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,11 @@ void main() {
142142
// Only the first thread will write out result
143143
if (lid == 0) {
144144
out_tile = partial_sums[0];
145+
if (apply_bias > 0) {
146+
FPPerOutChannelParams bias_tile;
147+
load_bias_tile(bias_tile, n4);
148+
add_bias_to_out_tile(out_tile, bias_tile);
149+
}
145150
write_output_tile_with_checks(out_tile, n4, 0, N4, 1);
146151
}
147152
}

backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,5 +110,11 @@ void main() {
110110
}
111111
}
112112

113+
if (apply_bias > 0) {
114+
FPPerOutChannelParams bias_tile;
115+
load_bias_tile(bias_tile, n4);
116+
add_bias_to_out_tile(out_tile, bias_tile);
117+
}
118+
113119
write_output_tile_with_checks(out_tile, n4, m, N4, M);
114120
}

backends/vulkan/test/custom_ops/q4gsw_linear.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ TestCase create_test_case_from_config(
148148
input_dtype,
149149
storage_type,
150150
utils::kWidthPacked,
151-
DataGenType::ZEROS);
151+
config.has_bias ? DataGenType::RANDOM : DataGenType::ZEROS);
152152
bias.set_constant(true);
153153
if (!config.has_bias) {
154154
bias.set_none(true);
@@ -237,9 +237,10 @@ std::vector<TestCase> generate_quantized_linear_test_cases() {
237237
{32, 64, 32, 16},
238238
{32, 128, 64, 32},
239239
{32, 256, 128, 64},
240-
// No bias tests
241-
{32, 128, 64, 32, false},
242-
{32, 256, 128, 64, false},
240+
// With bias
241+
{4, 64, 32, 16, true},
242+
{4, 128, 64, 32, true},
243+
{32, 128, 64, 32, true},
243244
// Performance test cases
244245
{1, 2048, 2048, 128},
245246
{128, 2048, 2048, 128},

0 commit comments

Comments (0)