Commit 4febe74

[XNNPACK] resolve ambiguity around 2d affine quantized tensors
1 parent ee1d7c3 commit 4febe74

2 files changed (+33, -22 lines)

backends/xnnpack/test/ops/test_linear.py

Lines changed: 28 additions & 21 deletions
@@ -423,10 +423,16 @@ def test_qd8_fp32_per_token_weight_per_channel_group_int4(self):
                     use_bias=use_bias,
                 )
 
+                # rank 3
                 inputs = (torch.randn(1, M, K),)
                 self._test_groupwise_dq_linear(
                     lin_mod, inputs, group_size=bl, use_bias=use_bias
                 )
+                # rank 2
+                inputs = (torch.randn(1, K),)
+                self._test_groupwise_dq_linear(
+                    lin_mod, inputs, group_size=bl, use_bias=use_bias
+                )
 
     @unittest.skipIf(
         not torchao_installed, "Per Channel Group Quantization Required TorchAO"
@@ -437,28 +443,29 @@ def test_qd8_fp16_per_token_weight_per_channel_group_int4(self):
         bl_sizes = [32, 32, 32, 64]
         N_sizes = [2, 17, 92, 128]
 
-        for use_bias in [True, False]:
-            for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes):
-                lin_mod = BaseLinear(
-                    in_size=M,
-                    input_channels=K,
-                    output_channels=N,
-                    dtype=torch.float16,
-                    use_bias=use_bias,
-                )
+        for input_rank in range(2, 4):
+            for use_bias in [True, False]:
+                for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes):
+                    lin_mod = BaseLinear(
+                        in_size=M,
+                        input_channels=K,
+                        output_channels=N,
+                        dtype=torch.float16,
+                        use_bias=use_bias,
+                    )
 
-                inputs = lin_mod.get_inputs()
-                # This requires slightly higher atol, but if you look at error it is not that bad:
-                # Difference: max: 0.00140380859375, abs: 0.00140380859375, mean abs error: 0.00042724609375.
-                # -- Model vs. Reference --
-                # Numel: 4, 4
-                # Median: -0.05023193359375, -0.0516357421875
-                # Mean: 0.2373046875, 0.237060546875
-                # Max: 1.0078125, 1.0078125
-                # Min: -0.08465576171875, -0.08441162109375
-                self._test_groupwise_dq_linear(
-                    lin_mod, inputs, group_size=bl, use_bias=use_bias, atol=1e-2
-                )
+                    inputs = lin_mod.get_inputs(rank=input_rank)
+                    # This requires slightly higher atol, but if you look at error it is not that bad:
+                    # Difference: max: 0.00140380859375, abs: 0.00140380859375, mean abs error: 0.00042724609375.
+                    # -- Model vs. Reference --
+                    # Numel: 4, 4
+                    # Median: -0.05023193359375, -0.0516357421875
+                    # Mean: 0.2373046875, 0.237060546875
+                    # Max: 1.0078125, 1.0078125
+                    # Min: -0.08465576171875, -0.08441162109375
+                    self._test_groupwise_dq_linear(
+                        lin_mod, inputs, group_size=bl, use_bias=use_bias, atol=1e-2
+                    )
 
     @unittest.skipIf(
         not torchao_installed, "Per Channel Group Quantization Required TorchAO"
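
A note on the helper used above: the fp16 test now calls lin_mod.get_inputs(rank=input_rank), but the BaseLinear.get_inputs implementation is not part of this diff. Purely as a hypothetical sketch (the attribute names in_size, input_channels, and dtype are assumed from the constructor arguments above, not confirmed by this commit), a rank-aware helper might look like:

# Hypothetical sketch of a rank-aware input helper; not code from this commit.
# Assumes the module stores in_size (M), input_channels (K), and dtype.
def get_inputs(self, rank: int = 3):
    if rank == 2:
        # rank-2 activation of shape (1, K) -- the case this commit adds coverage for
        return (torch.randn(1, self.input_channels, dtype=self.dtype),)
    # default rank-3 activation of shape (1, M, K)
    return (torch.randn(1, self.in_size, self.input_channels, dtype=self.dtype),)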

backends/xnnpack/utils/quant_utils.py

Lines changed: 5 additions & 1 deletion
@@ -50,7 +50,7 @@ def is_dynamic_qdq(node: torch.fx.Node) -> bool:
     if node.op != "call_function":
         return False
     node_name = format_target_name(node.target.__name__)  # pyre-ignore
-    is_dynamic_affine = is_per_token(node) and not is_per_channel_group(node)
+    is_dynamic_affine = is_per_token(node)
 
     return node_name in _DYNAMIC_OPS or is_dynamic_affine

@@ -120,6 +120,9 @@ def is_per_token(node: torch.fx.Node):
 
         flag &= block_size[-1] == input_val.shape[-1]
         flag &= scale_val.numel() == scale_numel_expected
+        scale_node = node.all_input_nodes[1]
+        # per token must have dynamically chosen scale
+        flag &= scale_node.target == operator.getitem
         return flag
 
     return False
@@ -140,6 +143,7 @@ def is_per_channel_group(node: torch.fx.Node):
         scale_numel = list(accumulate(scale_val.shape, operator.mul))[-1]
         input_numel = list(accumulate(input_val.shape, operator.mul))[-1]
         flag &= input_numel == group_size * scale_numel
+        flag &= not is_per_token(node)
         return flag
 
     return False
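
For context on the ambiguity the commit title refers to: a dynamically per-token-quantized rank-2 activation of shape (1, K) has a block size of (1, K) and a single scale, so it can also satisfy a purely size-based per-channel-group check (input_numel == group_size * scale_numel when the group size is taken from the last block dimension). A standalone sketch of that arithmetic, with the shapes and the group-size assumption chosen for illustration rather than taken from the repository:

# Standalone illustration (assumed shapes, not repository code): a rank-2
# per-token activation passes both size-based heuristics.
K = 64
input_shape = (1, K)
block_size = (1, K)   # per-token: one block spanning the last dimension
scale_numel = 1       # product of the leading dims is 1, so a single scale

# per-token heuristic: block covers the last dim, scale count matches leading dims
looks_per_token = block_size[-1] == input_shape[-1] and scale_numel == 1

# per-channel-group size heuristic: numel == group_size * scale_numel
group_size = block_size[-1]
input_numel = input_shape[0] * input_shape[1]
looks_per_channel_group = input_numel == group_size * scale_numel

print(looks_per_token, looks_per_channel_group)  # True True -> ambiguous

The commit breaks the tie in two places: is_per_token additionally requires that the scale is produced dynamically (an operator.getitem, i.e. taken from the output of an upstream qparams computation), and is_per_channel_group explicitly excludes anything classified as per token, which in turn lets is_dynamic_qdq drop its own "and not is_per_channel_group(node)" exclusion.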
