Fix slice and padding for TensorCoreTiledLayout for int4 weight only quantization

jerryzh168 · jerryzh168 · commit 174cce6ffbb5 · 2025-04-03T19:47:01.000-07:00
Summary:
Previously, some of the code paths were not exercised, so the bug was not discovered.

However, there are some bugs related to the slice operation and padding: basically,
scale and zero_point were not padded before, which results in errors when padding is required.

Test Plan:
python test/dtypes/test_affine_quantized.py -k test_slice

Reviewers:

Subscribers:

Tasks:

Tags:
diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py
@@ -17,6 +17,7 @@
 from torchao.core.config import AOBaseConfig
 from torchao.dtypes import CutlassInt4PackedLayout, Int4CPULayout, SemiSparseLayout
 from torchao.quantization import (
+    Int4WeightOnlyConfig,
     Int8DynamicActivationInt8WeightConfig,
     float8_weight_only,
     int4_dynamic_activation_int4_weight,
@@ -307,6 +308,18 @@ def test_alias(self, device, dtype):
         quantize_(dummy, Int8DynamicActivationInt8WeightConfig())
         _ = dummy.weight[...]
 
+    @common_utils.parametrize("device", ["cuda"] if torch.cuda.is_available() else [])
+    @common_utils.parametrize("dtype", [torch.bfloat16])
+    def test_slice(self, device, dtype):
+        # in_feature not divisible by 1024
+        # out_feature not divisible by 8
+        # to test slice + padding for int4 weight only quantization
+        dummy = nn.Linear(256, 321, dtype=dtype, device=device)
+        quantize_(dummy, Int4WeightOnlyConfig())
+        # make sure these run without error
+        _ = dummy.weight.narrow(0, 0, 64)
+        _ = dummy.weight.narrow(1, 0, 128)
+
 
 common_utils.instantiate_parametrized_tests(TestAffineQuantized)
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedBasic)
diff --git a/torchao/dtypes/affine_quantized_tensor.py b/torchao/dtypes/affine_quantized_tensor.py
@@ -284,7 +284,9 @@ def from_hp_to_intx(
             )
             # Note: output will be uint8 tensor for sub byte tensors for now
 
-        data = _layout.post_process(data)
+        data, scale, zero_point = _layout.post_process(
+            data, scale, zero_point, block_size
+        )
         tensor_impl_ctr = get_tensor_impl_constructor(type(_layout))
         tensor_impl = tensor_impl_ctr(data, scale, zero_point, _layout)
         return cls(
diff --git a/torchao/dtypes/uintx/tensor_core_tiled_layout.py b/torchao/dtypes/uintx/tensor_core_tiled_layout.py
@@ -153,15 +153,30 @@ def pre_process_static(
         zero_point = torch.nn.functional.pad(zero_point, padding_changes)
         return input, scale, zero_point
 
-    def post_process(self, input: torch.Tensor) -> torch.Tensor:
+    def post_process(
+        self,
+        input: torch.Tensor,
+        scale: torch.Tensor,
+        zero_point: torch.Tensor,
+        block_size: Tuple[int, ...],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         orig_out_features, orig_in_features = input.shape
         in_features = find_multiple(orig_in_features, 1024)
         out_features = find_multiple(orig_out_features, 8)
         input = torch.nn.functional.pad(
             input,
             (0, in_features - orig_in_features, 0, out_features - orig_out_features),
         )
-        return input
+        assert (
+            len(block_size) == 2
+        ), f"TensorCoreTiledLayout only supports len(block_size) == 2, got: {block_size}"
+        scale_pad_dim_0 = (out_features - orig_out_features) // block_size[0]
+        scale_pad_dim_1 = (in_features - orig_in_features) // block_size[1]
+        scale = torch.nn.functional.pad(scale, (0, scale_pad_dim_1, 0, scale_pad_dim_0))
+        zero_point = torch.nn.functional.pad(
+            zero_point, (0, scale_pad_dim_1, 0, scale_pad_dim_0)
+        )
+        return input, scale, zero_point
 
     def extra_repr(self):
         return f"inner_k_tiles={self.inner_k_tiles}"
@@ -335,31 +350,25 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
 
         if func is aten.slice.Tensor:
             self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1])
-            if dim == 0:
-                int_data, scale, zero_point = self.get_plain()
-                int_data = aten.slice.Tensor(int_data, dim, start, end, step)
-                # this is to handle padding
-                int_data = self._layout.post_process(int_data)
-                sliced = self.from_plain(int_data, scale, zero_point, self._layout)
-                return return_and_correct_aliasing(func, args, kwargs, sliced)
-            elif dim == 1:
+            if dim in [0, 1]:
                 int_data, scale, zero_point = self.get_plain()
-                assert step == 1, "Only step == 1 is supported in slicing right now"
                 data_len = int_data.shape[dim]
                 scale_len = scale.shape[dim]
                 ratio = data_len / scale_len
                 start_scale = int(start / ratio)
                 end_scale = int(end / ratio)
 
                 int_data = aten.slice.Tensor(int_data, dim, start, end, step)
-                # this is to handle padding
-                int_data = self._layout.post_process(int_data)
                 scale = aten.slice.Tensor(scale, dim, start_scale, end_scale, step)
                 zero_point = aten.slice.Tensor(
                     zero_point, dim, start_scale, end_scale, step
                 )
+                # this is to handle padding
+                int_data, scale, zero_point = self._layout.post_process(
+                    int_data, scale, zero_point, self.block_size
+                )
                 sliced = self.from_plain(int_data, scale, zero_point, self._layout)
-                return sliced
+                return return_and_correct_aliasing(func, args, kwargs, sliced)
             else:
                 raise NotImplementedError(
                     f"TensorCoreTiledAQTTensorImpl dispatch: attempting to run {func}, with dim={dim}, that is not supported"
@@ -371,6 +380,18 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
 
     __torch_function__ = torch._C._disabled_torch_function_impl
 
+    @property
+    def block_size(self):
+        from torchao.quantization.utils import unpack_tinygemm_scales_and_zeros
+
+        scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
+        cur_shape = self.shape
+        assert len(cur_shape) == 4
+        inner_k_tiles = cur_shape[-1] * 2
+        original_shape = (cur_shape[0] * 8, cur_shape[1] * (inner_k_tiles * 16))
+        groupsize = int(original_shape[1] / scale.shape[-2])
+        return (1, groupsize)
+
     def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         from torchao.quantization.quant_primitives import (
             ZeroPointDomain,
diff --git a/torchao/dtypes/utils.py b/torchao/dtypes/utils.py
@@ -44,8 +44,14 @@ class Layout:
     def pre_process(self, input: torch.Tensor) -> torch.Tensor:
         return input
 
-    def post_process(self, input: torch.Tensor) -> torch.Tensor:
-        return input
+    def post_process(
+        self,
+        input: torch.Tensor,
+        scale: torch.Tensor,
+        zero_point: torch.Tensor,
+        block_size: Tuple[int, ...],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        return input, scale, zero_point
 
     def pre_process_static(
         self,