
Commit 522b8c7

encapsulate version check as helpers

remove zero_point_dtype assigning

Signed-off-by: Meng, Hengyu <[email protected]>

fix import lint
enable zp dtype: u8/s8/s16/s32/s64

Signed-off-by: Meng, Hengyu <[email protected]>
1 parent bf6c814 commit 522b8c7

File tree: 10 files changed (+57, -53 lines)

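In short: the commit replaces scattered `device == "cpu" and TORCH_VERSION_AT_LEAST_2_6`-style guards with two helpers, `check_cpu_version` and `check_xpu_version` (defined in torchao/utils.py at the bottom of this diff). A minimal sketch distilling the call-site pattern from the diffs below; `pick_int4_path` is a hypothetical illustration, not library code:

import torch
from torchao.utils import check_cpu_version, check_xpu_version

def pick_int4_path(device: str) -> str:
    # Before this commit, call sites spelled the guard out, e.g.:
    #   if device == "cpu" and TORCH_VERSION_AT_LEAST_2_6: ...
    #   elif device == "xpu" and TORCH_VERSION_AT_LEAST_2_8: ...
    # After, the device/version pairing lives in one helper per backend:
    if check_cpu_version(device):    # "cpu" and torch >= 2.6.0
        return "Int4CPULayout"
    elif check_xpu_version(device):  # "xpu" and torch >= 2.8.0
        return "Int4XPULayout"
    return "TensorCoreTiledLayout"   # fallback used by the CUDA paths

print(pick_int4_path("cpu"))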

test/dtypes/test_affine_quantized.py

Lines changed: 4 additions & 4 deletions
@@ -36,8 +36,8 @@
 from torchao.testing.utils import skip_if_no_cuda, skip_if_rocm
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
-    TORCH_VERSION_AT_LEAST_2_6,
-    TORCH_VERSION_AT_LEAST_2_8,
+    check_cpu_version,
+    check_xpu_version,
     is_fbcode,
     is_ROCM,
     is_sm_at_least_89,
@@ -58,11 +58,11 @@ def get_quantization_functions(
         int8_dynamic_activation_int8_weight(act_mapping_type=MappingType.ASYMMETRIC),
     ]
     if do_int4:
-        if device == "cpu" and TORCH_VERSION_AT_LEAST_2_6:
+        if check_cpu_version(device):
             base_functions.append(
                 int4_weight_only(group_size=32, layout=Int4CPULayout())
             )
-        elif device == "xpu" and TORCH_VERSION_AT_LEAST_2_8:
+        elif check_xpu_version(device):
             base_functions.append(
                 int4_weight_only(group_size=32, layout=Int4XPULayout())
             )

test/integration/test_integration.py

Lines changed: 6 additions & 12 deletions
@@ -20,7 +20,6 @@
 
 import torchao
 from torchao.dtypes import Int4CPULayout, Int4XPULayout, TensorCoreTiledLayout
-from torchao.dtypes.utils import is_device
 from torchao.quantization import safe_int_mm
 from torchao.quantization.autoquant import (
     AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight,
@@ -83,8 +82,9 @@
     TORCH_VERSION_AT_LEAST_2_5,
     TORCH_VERSION_AT_LEAST_2_6,
     TORCH_VERSION_AT_LEAST_2_7,
-    TORCH_VERSION_AT_LEAST_2_8,
     benchmark_model,
+    check_cpu_version,
+    check_xpu_version,
     is_fbcode,
     is_sm_at_least_90,
     unwrap_tensor_subclass,
@@ -147,21 +147,15 @@ def _int8da_int8w_api(
 
 
 def _int4wo_api(mod, use_hqq=False):
-    if (
-        is_device(next(mod.parameters()).device.type, "cpu")
-        and TORCH_VERSION_AT_LEAST_2_6
-    ):
+    if check_cpu_version(next(mod.parameters()).device):
         quantize_(
             mod,
             int4_weight_only(
                 layout=Int4CPULayout(), use_hqq=use_hqq, set_inductor_config=False
             ),
         )
         unwrap_tensor_subclass(mod)
-    elif (
-        is_device(next(mod.parameters()).device.type, "xpu")
-        and TORCH_VERSION_AT_LEAST_2_8
-    ):
+    elif check_xpu_version(next(mod.parameters()).device):
         quantize_(
             mod, int4_weight_only(layout=Int4XPULayout()), set_inductor_config=False
         )
@@ -1138,9 +1132,9 @@ def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
         layout_list = []
-        if device == "cpu" and TORCH_VERSION_AT_LEAST_2_6:
+        if check_cpu_version(device):
             layout_list.append(Int4CPULayout())
-        elif device == "xpu" and TORCH_VERSION_AT_LEAST_2_8:
+        elif check_xpu_version(device):
             layout_list.append(Int4XPULayout())
         else:
             for inner_k_tiles in [4, 2]:

test/quantization/test_quant_primitives.py

Lines changed: 5 additions & 15 deletions
@@ -11,7 +11,6 @@
 import torch
 from parameterized import parameterized
 
-from torchao.dtypes.utils import is_device
 from torchao.float8.float8_utils import EPS as float8_eps
 from torchao.quantization.quant_primitives import (
     MappingType,
@@ -38,7 +37,8 @@
     TORCH_VERSION_AT_LEAST_2_4,
     TORCH_VERSION_AT_LEAST_2_5,
     TORCH_VERSION_AT_LEAST_2_6,
-    TORCH_VERSION_AT_LEAST_2_8,
+    check_cpu_version,
+    check_xpu_version,
     is_fbcode,
 )
 
@@ -136,9 +136,7 @@ def _groupwise_affine_quantize_tensor_from_qparams(
     )
 
     if TORCH_VERSION_AT_LEAST_2_5:
-        if (not (is_device(w.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6)) and (
-            not (is_device(w.device.type, "xpu") and TORCH_VERSION_AT_LEAST_2_8)
-        ):
+        if (not (check_cpu_version(w.device))) and (not (check_xpu_version(w.device))):
             w_int4x8 = (w_int4x8[::, ::2] << 4 | w_int4x8[::, 1::2]).to(torch.uint8)
 
     return w_int4x8
@@ -747,16 +745,8 @@ def test_groupwise_affine_dequantize_tensor_from_qparams(self):
         zeros = torch.randint(0, 15, (10, 2), dtype=torch.int32)
         if TORCH_VERSION_AT_LEAST_2_5:
             input_tmp = input
-            if (
-                not (
-                    is_device(input.device.type, "cpu")
-                    and TORCH_VERSION_AT_LEAST_2_6
-                )
-            ) and (
-                not (
-                    is_device(input.device.type, "xpu")
-                    and TORCH_VERSION_AT_LEAST_2_8
-                )
+            if (not (check_cpu_version(input.device))) and (
+                not (check_xpu_version(input.device))
             ):
                 input_tmp = (input[::, ::2] << 4 | input[::, 1::2]).to(torch.uint8)
         w_bf16 = groupwise_affine_dequantize_tensor_from_qparams(

torchao/kernel/intmm.py

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@
 
 import torch
 
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_2, TORCH_VERSION_AT_LEAST_2_6
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_2, check_cpu_version
 
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
@@ -154,7 +154,7 @@ def int_scaled_matmul(
     scales1 = scales1.expand((M, N))
     assert scales1.dim() == 2
 
-    if scales1.device.type == "cpu" and TORCH_VERSION_AT_LEAST_2_6:
+    if check_cpu_version(scales1.device):
         # CPU prefers decomposed version of int_scaled_matmul
         # to leverage the fusion capability of Inductor
         c = torch._int_mm(a, b)

torchao/prototype/hqq/hqq_tinygemm_linear.py

Lines changed: 3 additions & 3 deletions
@@ -17,7 +17,7 @@
 from torch import Tensor, nn
 
 from torchao.dtypes.utils import is_device
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, TORCH_VERSION_AT_LEAST_2_6
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, check_cpu_version
 
 
 class HQQLinearTorchWeightOnlyInt4(torch.nn.Module):
@@ -167,7 +167,7 @@ def process_hqq_quants(self, W_q, meta):
         W_q_torch, scales_torch, zeros_torch = self.hqq_quants_to_torch_quants(
             W_q=W_q, scales=scales, zeros=zeros, shape=shape, nbits=self.nbits
         )
-        if is_device(W_q.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6:
+        if check_cpu_version(W_q.device):
             self.weight_int4pack = torch.ops.aten._convert_weight_to_int4pack_for_cpu(
                 W_q_torch, self.inner_k_tiles
             )
@@ -243,7 +243,7 @@ def pack_scales_and_zeros(self, scales, zeros):
     def matmul(self, x):
         origin_x_size = x.size()
         x = x.reshape(-1, origin_x_size[-1])
-        if is_device(x.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6:
+        if check_cpu_version(x.device):
            c = torch.ops.aten._weight_int4pack_mm_for_cpu(
                x, self.weight_int4pack, self.groupsize, self.scales_and_zeros
            )

torchao/quantization/quant_api.py

Lines changed: 3 additions & 0 deletions
@@ -1039,6 +1039,9 @@ def _int4_weight_only_transform(
         zero_point_domain in LAYOUT_TO_ZERO_POINT_DOMAIN[type(layout)]
     ), f"Layout only support {LAYOUT_TO_ZERO_POINT_DOMAIN[layout]}"
 
+    if zero_point_domain == ZeroPointDomain.INT and isinstance(layout, Int4XPULayout):
+        zero_point_dtype = torch.int32
+
     preserve_zero = (
         config.preserve_zero
         if config.preserve_zero is not None

torchao/quantization/quant_primitives.py

Lines changed: 10 additions & 1 deletion
@@ -958,7 +958,16 @@ def _choose_qparams_affine(
     elif zero_point_domain == ZeroPointDomain.INT.name:
         zero_point = quant_min - torch.round(min_val_neg / scale)
         zero_point = torch.clamp(zero_point, quant_min, quant_max)
-        zero_point_dtype = torch.int32
+        assert (
+            zero_point_dtype
+            in [
+                torch.int8,
+                torch.uint8,
+                torch.int16,
+                torch.int32,
+                torch.int64,
+            ]
+        ), "zero_point_dtype must be int8/uint8/int16/int32/int64 if ZeroPointDomain.INT"
     else:
         assert (
             zero_point_domain == ZeroPointDomain.FLOAT.name
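The behavioral point of this hunk: the old code silently overwrote the caller's zero_point_dtype with torch.int32 inside the ZeroPointDomain.INT branch; the new code validates it and leaves it alone, so a caller-supplied u8/s8/s16/s32/s64 dtype survives (per the commit message). A self-contained sketch of the difference; `old_zp_dtype`/`new_zp_dtype` are illustrative stand-ins, not the library API:

import torch

def old_zp_dtype(zero_point_dtype):
    # Pre-change behavior: the caller's dtype was silently replaced.
    return torch.int32

def new_zp_dtype(zero_point_dtype):
    # Post-change behavior: the dtype is validated and passed through.
    assert zero_point_dtype in (
        torch.int8,
        torch.uint8,
        torch.int16,
        torch.int32,
        torch.int64,
    ), "zero_point_dtype must be int8/uint8/int16/int32/int64 if ZeroPointDomain.INT"
    return zero_point_dtype

assert old_zp_dtype(torch.uint8) is torch.int32  # caller's choice was lost
assert new_zp_dtype(torch.uint8) is torch.uint8  # caller's choice survives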

torchao/quantization/subclass.py

Lines changed: 6 additions & 7 deletions
@@ -8,7 +8,6 @@
 import torch
 from torch.utils._python_dispatch import return_and_correct_aliasing
 
-from torchao.dtypes.utils import is_device
 from torchao.quantization.utils import (
     dequantize_per_channel,
     dynamically_quantize_per_channel,
@@ -17,8 +16,8 @@
     unpack_tinygemm_scales_and_zeros,
 )
 from torchao.utils import (
-    TORCH_VERSION_AT_LEAST_2_6,
-    TORCH_VERSION_AT_LEAST_2_8,
+    check_cpu_version,
+    check_xpu_version,
     find_multiple,
 )
 
@@ -473,14 +472,14 @@ def _quantized_op(act_mat, w_qtensor, bias):
             act_mat = torch.nn.functional.pad(act_mat, (0, pad_size - act_mat.shape[-1]))
 
         # matmul
-        if is_device(act_mat.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6:
+        if check_cpu_version(act_mat.device):
             y = aten._weight_int4pack_mm_for_cpu(
                 act_mat.contiguous(),
                 w_qtensor.int_data,
                 w_qtensor.groupsize,
                 w_qtensor.scales_and_zeros,
             )
-        elif is_device(act_mat.device.type, "xpu") and TORCH_VERSION_AT_LEAST_2_8:
+        elif check_xpu_version(act_mat.device):
             if not w_qtensor.zero_point_domain == ZeroPointDomain.INT:
                 y = aten._weight_int4pack_mm(
                     act_mat.contiguous(),
@@ -694,11 +693,11 @@ def to_qtensor_components(
             zero_point_domain=zero_point_domain,
             preserve_zero=preserve_zero,
         )
-        if is_device(input_float.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6:
+        if check_cpu_version(input_float.device):
             int_data = aten._convert_weight_to_int4pack_for_cpu(
                 input_int4x8, inner_k_tiles
             )
-        if is_device(input_float.device.type, "xpu") and TORCH_VERSION_AT_LEAST_2_8:
+        if check_xpu_version(input_float.device):
             from torchao.quantization.utils import convert_weight_to_int4pack_xpu
 
             int_data = convert_weight_to_int4pack_xpu(

torchao/quantization/utils.py

Lines changed: 6 additions & 9 deletions
@@ -9,7 +9,6 @@
 import torch
 from torch.utils._python_dispatch import TorchDispatchMode
 
-from torchao.dtypes.utils import is_device
 from torchao.kernel import (
     int_scaled_matmul,
 )
@@ -22,8 +21,8 @@
 )
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
-    TORCH_VERSION_AT_LEAST_2_6,
-    TORCH_VERSION_AT_LEAST_2_8,
+    check_cpu_version,
+    check_xpu_version,
 )
 
 __all__ = [
@@ -431,10 +430,8 @@ def groupwise_affine_quantize_tensor_from_qparams(
         zero_point_domain=zero_point_domain,
     )
     if TORCH_VERSION_AT_LEAST_2_5 and w.shape[-1] > 1:
-        if (
-            not (is_device(int_data.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6)
-        ) and (
-            not (is_device(int_data.device.type, "xpu") and TORCH_VERSION_AT_LEAST_2_8)
+        if (not (check_cpu_version(int_data.device))) and (
+            not (check_xpu_version(int_data.device))
         ):
             int_data = (int_data[::, ::2] << 4 | int_data[::, 1::2]).to(torch.uint8)
     return int_data
@@ -454,8 +451,8 @@ def groupwise_affine_dequantize_tensor_from_qparams(
     if (
         TORCH_VERSION_AT_LEAST_2_5
        and (w_int4x8.dtype == torch.uint8 or w_int4x8.shape[-1] > 1)
-        and not (is_device(w_int4x8.device.type, "cpu") and TORCH_VERSION_AT_LEAST_2_6)
-        and not (is_device(w_int4x8.device.type, "xpu") and TORCH_VERSION_AT_LEAST_2_8)
+        and not (check_cpu_version(w_int4x8.device))
+        and not (check_xpu_version(w_int4x8.device))
     ):
         data = w_int4x8.to(torch.int32)
         high_bits = data >> 4

torchao/utils.py

Lines changed: 12 additions & 0 deletions
@@ -676,6 +676,18 @@ def is_sm_at_least_100():
     )
 
 
+def check_cpu_version(device, version="2.6.0"):
+    if isinstance(device, torch.device):
+        device = device.type
+    return device == "cpu" and compare_versions(torch.__version__, version) >= 0
+
+
+def check_xpu_version(device, version="2.8.0"):
+    if isinstance(device, torch.device):
+        device = device.type
+    return device == "xpu" and compare_versions(torch.__version__, version) >= 0
+
+
 TORCH_VERSION_AFTER_2_5 = _torch_version_at_least("2.5.0.dev")
 TORCH_VERSION_AFTER_2_4 = _torch_version_at_least("2.4.0.dev")
 TORCH_VERSION_AFTER_2_3 = _torch_version_at_least("2.3.0.dev")
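A short usage sketch for the two new helpers, assuming a torchao build that includes this commit. Both accept either a torch.device or a plain device-type string, thanks to the isinstance normalization above, and the version floor is an ordinary parameter:

import torch
from torchao.utils import check_cpu_version, check_xpu_version

dev = torch.zeros(1).device              # torch.device("cpu")
print(check_cpu_version(dev))            # True iff running torch >= 2.6.0
print(check_cpu_version("cpu"))          # device-type strings also work
print(check_xpu_version(dev))            # False: device type is "cpu", not "xpu"
print(check_cpu_version(dev, "2.7.0"))   # per-call override of the version floor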
