
Fix numeric mismatches #2085


Merged · 13 commits · Apr 25, 2025
5 changes: 3 additions & 2 deletions test/quantization/test_qat.py
@@ -1474,7 +1474,6 @@ def test_fake_quantize_per_token_vs_convert(self, dtype: torch.dtype):
@unittest.skipIf(
not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
)
@unittest.skip("Currently failing on sqnr")
def test_qat_8da4w_prepare_vs_convert(self, dtype: torch.dtype):
"""
Test that the prepare and convert steps of Int8DynActInt4QATQuantizer produces
@@ -1493,7 +1492,9 @@ def test_qat_8da4w_prepare_vs_convert(self, dtype: torch.dtype):
torch.manual_seed(seed)
x = m.example_inputs()

quantizer = Int8DynActInt4WeightQATQuantizer(groupsize=group_size)
quantizer = Int8DynActInt4WeightQATQuantizer(
groupsize=group_size, precision=dtype, scales_precision=dtype
)
prepared = quantizer.prepare(m)
prepared_out = prepared(*x)
converted = quantizer.convert(prepared)
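The re-enabled test above compares the prepared (fake-quantized) model against the converted (actually quantized) model and now threads the model dtype through both precision and scales_precision. Below is a minimal sketch of that prepare/convert comparison pattern; it assumes Int8DynActInt4WeightQATQuantizer is importable from torchao.quantization.qat, uses a toy model rather than the test's example model, and the 30 dB threshold is illustrative, not the value used in the suite.

```python
import torch
from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer
from torchao.quantization.utils import compute_error

# Toy stand-in for the test's example model (hypothetical, for illustration only).
dtype = torch.bfloat16
model = torch.nn.Sequential(torch.nn.Linear(256, 128, bias=False)).to(dtype)
x = torch.randn(2, 256, dtype=dtype)

# Keep the model precision and the weight-scale precision consistent,
# mirroring the updated test above.
quantizer = Int8DynActInt4WeightQATQuantizer(
    groupsize=32, precision=dtype, scales_precision=dtype
)
prepared = quantizer.prepare(model)
prepared_out = prepared(x)

converted = quantizer.convert(prepared)
converted_out = converted(x)

# Prepare (fake quant) and convert (real quant) should agree closely.
sqnr = compute_error(prepared_out, converted_out).item()
assert sqnr > 30  # illustrative threshold
```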
2 changes: 1 addition & 1 deletion torchao/experimental/quant_passes.py
@@ -86,7 +86,7 @@ def _get_q_dq_linear_patterns_replacements_and_filters(
glbs["a_quant_min"] = None
glbs["a_quant_max"] = None
glbs["a_mapping_type"] = "ASYMMETRIC"
glbs["a_scale_dtype"] = torch.float64
glbs["a_scale_dtype"] = torch.float32
glbs["a_eps"] = None

lcls = {}
34 changes: 21 additions & 13 deletions torchao/experimental/tests/test_embedding_xbit_quantizer.py
@@ -32,6 +32,7 @@
MappingType,
quantize_,
)
from torchao.quantization.utils import compute_error


class TestEmbeddingQuantizer(unittest.TestCase):
@@ -254,7 +255,7 @@ def test_identical_to_IntxWeightOnlyConfig(
for granularity in [PerGroup(32), PerGroup(128), PerAxis(0)]
for mapping_type in [MappingType.SYMMETRIC, MappingType.ASYMMETRIC]
for scale_dtype in [torch.float32, torch.bfloat16, torch.float16]
for model_dtype in [torch.float32, torch.bfloat16]
for model_dtype in [torch.float32, torch.bfloat16, torch.float16]
],
name_func=lambda f, _, params: f.__name__ + f"_{params.kwargs}",
)
@@ -292,7 +293,7 @@ def test_identical_to_IntXQuantizationAwareTrainingConfig(
IntXQuantizationAwareTrainingConfig(weight_config=weight_config),
embedding_filter,
)
expected_out = model(indices)
prepared_out = model(indices)

quantize_(model, FromIntXQuantizationAwareTrainingConfig(), embedding_filter)
quantize_(
@@ -305,8 +306,14 @@ def test_identical_to_IntXQuantizationAwareTrainingConfig(
),
embedding_filter,
)
actual_out = model(indices)
self.assertTrue(torch.allclose(expected_out, actual_out))
converted_out = model(indices)
sqnr = compute_error(prepared_out, converted_out).item()

# For torch.int1, sometimes sqnr is nan because both tensors are all 0
# so we check torch.equal as well
self.assertTrue(
sqnr == float("inf") or torch.equal(prepared_out, converted_out)
)

@parameterized.expand(
[
@@ -317,7 +324,7 @@ def test_identical_to_IntXQuantizationAwareTrainingConfig(
)
for granularity in [PerGroup(32), PerGroup(128), PerAxis(0)]
for scale_dtype in [torch.float32, torch.bfloat16, torch.float16]
for model_dtype in [torch.float32, torch.bfloat16]
for model_dtype in [torch.float32, torch.bfloat16, torch.float16]
],
name_func=lambda f, _, params: f.__name__ + f"_{params.kwargs}",
)
@@ -346,7 +353,8 @@ def test_identical_to_Int4WeightOnlyEmbeddingQATQuantizer(
zero_point_precision=torch.int32,
)
model = qat_quantizer.prepare(model)
expected_out = model(indices)
prepared_model_copy = copy.deepcopy(model)
prepared_out = model(indices)

# Convert model method 1
quantize_(model, FromIntXQuantizationAwareTrainingConfig(), embedding_filter)
@@ -360,15 +368,15 @@ def test_identical_to_Int4WeightOnlyEmbeddingQATQuantizer(
),
embedding_filter,
)
actual_out1 = model(indices)
self.assertTrue(torch.allclose(expected_out, actual_out1))
converted_out1 = model(indices)
sqnr1 = compute_error(prepared_out, converted_out1).item()
self.assertTrue(sqnr1 == float("inf"))

# TODO: method 2 does not work because the converted embedding op
# incorrectly casts output of to indices.dtype
# Convert model method 2
# qat_quantizer.convert(prepared_model_copy)
# actual_out2 = prepared_model_copy(indices)
# self.assertTrue(torch.allclose(expected_out, actual_out2))
qat_quantizer.convert(prepared_model_copy)
converted_out2 = prepared_model_copy(indices)
sqnr2 = compute_error(prepared_out, converted_out2).item()
self.assertTrue(sqnr2 == float("inf"))


if __name__ == "__main__":
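For reference, the tests above replace torch.allclose with an SQNR check via compute_error. The sketch below is a standalone approximation of that metric (written from the standard SQNR definition, not copied from torchao), showing why identical tensors give inf and why two all-zero tensors give nan, which is the torch.int1 corner case the comment mentions.

```python
import torch

def sqnr_db(expected: torch.Tensor, actual: torch.Tensor) -> torch.Tensor:
    # Signal-to-quantization-noise ratio in dB: 20 * log10(||signal|| / ||error||)
    signal = torch.linalg.norm(expected.float())
    noise = torch.linalg.norm((expected - actual).float())
    return 20 * torch.log10(signal / noise)

a = torch.randn(4, 8)
print(sqnr_db(a, a))                            # inf: zero error
print(sqnr_db(torch.zeros(4), torch.zeros(4)))  # nan: 0 / 0, hence the torch.equal fallback
```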
@@ -26,6 +26,7 @@
MappingType,
quantize_,
)
from torchao.quantization.utils import compute_error


class TestInt8DynamicActivationIntxWeight(unittest.TestCase):
@@ -360,7 +361,7 @@ def test_export_QDQLayout(self):
self.assertTrue(torch.allclose(eager_results, exported_results))

expected_lines = [
"torch.ops.torchao.choose_qparams_affine.default(input_1, 'ASYMMETRIC', [1, 512], torch.int8, None, None, None, torch.float64, torch.int64)",
"torch.ops.torchao.choose_qparams_affine.default(input_1, 'ASYMMETRIC', [1, 512], torch.int8, None, None, None, torch.float32, torch.int8)",
"torch.ops.torchao.quantize_affine.default(input_1, [1, 512], getitem, getitem_1, torch.int8)",
"torch.ops.torchao.dequantize_affine.default(quantize_affine, [1, 512], getitem, getitem_1, torch.int8)",
"torch.ops.torchao.dequantize_affine.default",
@@ -475,7 +476,8 @@ def test_identical_to_Int8DynamicActivationInt4WeightConfig(
),
)
with torch.no_grad():
torch.allclose(model(activations), model_copy(activations))
sqnr = compute_error(model(activations), model_copy(activations)).item()
self.assertTrue(sqnr == float("inf"))

@parameterized.expand(
[
@@ -492,7 +494,7 @@ def test_identical_to_Int8DynamicActivationInt4WeightConfig(
for mapping_type in [MappingType.SYMMETRIC, MappingType.ASYMMETRIC]
for act_mapping_type in [MappingType.ASYMMETRIC, MappingType.SYMMETRIC]
for scale_dtype in [torch.float32, torch.bfloat16, torch.float16]
for model_dtype in [torch.float32, torch.bfloat16]
for model_dtype in [torch.float32, torch.bfloat16, torch.float16]
],
name_func=lambda f, _, params: f.__name__ + f"_{params.kwargs}",
)
@@ -510,11 +512,6 @@ def test_identical_to_IntXQuantizationAwareTrainingConfig(
if mapping_type == MappingType.ASYMMETRIC:
return

# TODO: QAT logic for non-float32 models does not match PTQ right now
# QAT's default scale-precision is float32, but PTQ's is None (which defaults to input's dtype)
if model_dtype != torch.float32:
return

assert mapping_type in [MappingType.SYMMETRIC, MappingType.ASYMMETRIC]
assert act_mapping_type in [MappingType.SYMMETRIC, MappingType.ASYMMETRIC]
is_symmetric = mapping_type == MappingType.SYMMETRIC
@@ -550,7 +547,7 @@ def test_identical_to_IntXQuantizationAwareTrainingConfig(
IntXQuantizationAwareTrainingConfig(activation_config, weight_config),
)
try:
expected_out = model(activations)
prepared_out = model(activations)
except NotImplementedError as e:
# QAT does not support act_mapping_type == MappingType.SYMMETRIC yet
if act_mapping_type == MappingType.SYMMETRIC:
@@ -568,8 +565,10 @@ def test_identical_to_IntXQuantizationAwareTrainingConfig(
act_mapping_type=act_mapping_type,
),
)
actual_out = model(activations)
self.assertTrue(torch.allclose(expected_out, actual_out))
converted_out = model(activations)

sqnr = compute_error(prepared_out, converted_out).item()
self.assertTrue(sqnr == float("inf"))

@parameterized.expand(
[
@@ -580,20 +579,13 @@ def test_identical_to_IntXQuantizationAwareTrainingConfig(
)
for group_size in [32, 64, 128]
for scale_dtype in [torch.float32, torch.bfloat16, torch.float16]
for model_dtype in [torch.float32, torch.bfloat16]
for model_dtype in [torch.float32, torch.bfloat16, torch.float16]
],
name_func=lambda f, _, params: f.__name__ + f"_{params.kwargs}",
)
def test_identical_to_Int8DynActInt4WeightQATQuantizer(
self, group_size, scale_dtype, model_dtype
):
# Currently this does not match
# TODO: investigat
if scale_dtype != torch.float32:
return
if model_dtype != torch.float32:
return

k0 = 512
k1 = 256
layers = [
@@ -611,10 +603,10 @@ def test_identical_to_Int8DynActInt4WeightQATQuantizer(
groupsize=group_size, precision=model_dtype, scales_precision=scale_dtype
)
model = qat_quantizer.prepare(model)
expected_out = model(activations)

prepared_model_copy = copy.deepcopy(model)

prepared_out = model(activations)

# Convert model method 1
quantize_(model, FromIntXQuantizationAwareTrainingConfig())
quantize_(
@@ -627,13 +619,15 @@ def test_identical_to_Int8DynActInt4WeightQATQuantizer(
act_mapping_type=MappingType.ASYMMETRIC,
),
)
actual_out1 = model(activations)
self.assertTrue(torch.allclose(expected_out, actual_out1))
converted_out1 = model(activations)
sqnr1 = compute_error(prepared_out, converted_out1).item()
self.assertTrue(sqnr1 == float("inf"))

# Convert model method 2
qat_quantizer.convert(prepared_model_copy)
actual_out2 = prepared_model_copy(activations)
self.assertTrue(torch.allclose(expected_out, actual_out2))
converted_out2 = prepared_model_copy(activations)
sqnr2 = compute_error(prepared_out, converted_out2).item()
self.assertTrue(sqnr2 == float("inf"))


if __name__ == "__main__":
13 changes: 10 additions & 3 deletions torchao/quantization/GPTQ.py
@@ -931,9 +931,16 @@ def linear_forward_8da4w(
zeros,
out_features,
groupsize,
precision,
output_precision,
):
x = per_token_dynamic_quant(x, scale_dtype=precision, zero_point_dtype=precision)
# uses fp32 to match torchao.quantization.quant_api._int8_asymm_per_token_quant
# and activation_scale_dtype in QAT configs
# TODO: in future add ability to specify activation_scale_dtype to PTQ configs
# and enable similar change here
x = per_token_dynamic_quant(
x, scale_dtype=torch.float32, zero_point_dtype=torch.float32
)

Review thread on this hunk:

Contributor: Looks like in _int8_asymm_per_token_quant you set the zero point to torch.int8, so this is different? A bit confused here.

Contributor Author: That difference doesn't actually matter, because the zero point domain is int, so the zero points will be int8 numbers (which can be represented in fp32). But maybe it makes sense to just use quant_api._int8_asymm_per_token_quant directly here, rather than per_token_dynamic_quant?

Contributor: Yeah, I would prefer us to be consistent in scale and zero point dtype. In fact, making the scale fp32 and the zero point int32 would be fine; the zero point should really have the same dtype as the quantized values.

Contributor: Agreed that we should just use torch.int8 here for zero points to be consistent, even if it doesn't change the numerics in practice. We probably shouldn't use _int8_asymm_per_token_quant here, though, because that would introduce subclasses into a module-swap-only flow.

Contributor Author: I meant using _int8_asymm_per_token_quant followed by dequantize, e.g., _int8_asymm_per_token_quant(x).dequantize().

Contributor: That would still introduce AQT into this path, right? I would prefer to keep these separate; just calling the same primitives, backed by unit tests, seems enough.

Contributor Author: It also creates a circular import. Changing back.

# TODO: verify and remove following reshape code
# origin_x_size = x.size()
# x = x.reshape(-1, origin_x_size[-1])
@@ -953,7 +960,7 @@ def linear_forward_8da4w(
torch.int8,
quant_min,
quant_max,
output_dtype=precision,
output_dtype=output_precision,
)

# x = x.to(torch.float16)
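The review thread above turns on the observation that zero points live in the integer domain, so storing them as float32 versus int8 does not change the dequantized values. The pure-torch sketch below (not torchao's per_token_dynamic_quant or _int8_asymm_per_token_quant, just an illustration of the same arithmetic) demonstrates this for asymmetric per-token int8 quantization.

```python
import torch

# Asymmetric per-token int8 quantization with an integer-valued zero point.
x = torch.randn(4, 16)
qmin, qmax = -128, 127

x_min = x.amin(dim=-1, keepdim=True).clamp(max=0.0)
x_max = x.amax(dim=-1, keepdim=True).clamp(min=0.0)
scale = (x_max - x_min) / (qmax - qmin)          # assumes a nonzero range per token
zero_point = torch.round(qmin - x_min / scale)   # integer-valued, stored in float32

q = torch.clamp(torch.round(x / scale) + zero_point, qmin, qmax).to(torch.int8)

# Dequantize with the zero point kept in float32 vs. cast to int8:
dq_fp32_zp = (q.float() - zero_point) * scale
dq_int8_zp = (q.float() - zero_point.to(torch.int8).float()) * scale
assert torch.equal(dq_fp32_zp, dq_int8_zp)  # identical: the zero point's storage dtype is cosmetic
```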
26 changes: 19 additions & 7 deletions torchao/quantization/qat/embedding.py
@@ -177,6 +177,7 @@ def replacement_fn(child: torch.nn.Module) -> torch.nn.Module:
scale_precision=self.scale_precision,
zero_point_precision=self.zero_point_precision,
device=child.weight.device,
dtype=child.weight.dtype,
)
# In distributed training, the model may be instantiated
# on the meta device, in which case there is no need to
@@ -227,14 +228,19 @@ def _convert_helper(self, module: torch.nn.Module):
scale_precision=scale_precision,
zero_point_precision=zero_point_precision,
device=child.weight.device,
output_dtype=child.weight.dtype,
)
setattr(module, name, quantized_embedding)

# Load weights and qparams into quantized embedding
(qmin, qmax) = _get_qmin_qmax(self.bit_width)
(s, zp) = get_group_qparams_symmetric(
child.weight, self.bit_width, group_size
child.weight,
self.bit_width,
group_size,
precision=scale_precision,
)

Review thread on this hunk:

Contributor: We probably need to set the zero point precision separately to be more correct? I think the function doesn't let you set them separately, so maybe we need an extra cast after calling it here, like zp.to(zero_point_precision)?

Contributor Author: For numeric purposes it doesn't matter, because the zero point is an int8 number, which can be represented as a floating point number. But I'll add the extra cast.

Contributor: Yeah, but it would be better to be consistent and have the zero point dtype separately specified. For example, if you were to quantize bias to int32, then you would want the zero point in int32, although usually it is just zero. Besides, if this is for affine quantization, maybe the function name should reflect that.
zp = zp.to(zero_point_precision)
q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
child.weight,
s,
@@ -324,6 +330,7 @@ def __init__(
scale_precision: torch.dtype = torch.float32,
zero_point_precision: torch.dtype = torch.int32,
device: torch.device = None,
output_dtype: torch.dtype = torch.float32,
):
super().__init__()

@@ -341,6 +348,7 @@ def __init__(
self.group_size = group_size
self.scale_precision = scale_precision
self.zero_point_precision = zero_point_precision
self.output_dtype = output_dtype

# currently storing unpacked int8 weights
self.register_buffer(
@@ -367,20 +375,24 @@ def __init__(
)

def forward(self, x):
from torchao._executorch_ops import (
_quantized_decomposed_dequantize_per_channel_group_wrapper,
from torchao.quantization.quant_primitives import (
dequantize_affine,
)

qmin, qmax = _get_qmin_qmax(self.bit_width)
w_dq = _quantized_decomposed_dequantize_per_channel_group_wrapper(

# dequantize_affine casts to output_dtype before scaling
# dequantize_per_channel_group scales and then casts to output_dtype
# The two do not agree when dtype != torch.float32
w_dq = dequantize_affine(
self.weight,
[1, self.group_size],
self.scale,
self.zero_point,
torch.int8,
qmin,
qmax,
torch.int8,
self.group_size,
x.dtype,
output_dtype=self.output_dtype,
)

Review thread on this hunk:

Contributor: Should output dtype be the same as x.dtype? Then we wouldn't need a separate arg.

Contributor Author: No, this is an embedding op, so x.dtype is an int (the index), and the weights should not be cast to integers. This was one of the existing numerical issues in the embedding convert step.

Contributor: Oh I see, thanks for catching that!
return F.embedding(
x,
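The forward() change above notes that dequantize_affine casts to output_dtype before scaling, while the old per-channel-group dequant scales first and then casts, and that the two disagree when the dtype is not float32. Below is a pure-torch illustration of that ordering effect (not the torchao kernels themselves).

```python
import torch

torch.manual_seed(0)
q = torch.randint(-8, 8, (1, 32), dtype=torch.int8)   # 4-bit values stored in int8
scale = torch.rand(1, 1, dtype=torch.float32) * 0.01

# Order A: cast to the output dtype first, then scale in that dtype.
cast_then_scale = q.to(torch.bfloat16) * scale.to(torch.bfloat16)

# Order B: scale in float32, then cast the result to the output dtype.
scale_then_cast = (q.to(torch.float32) * scale).to(torch.bfloat16)

# The two are close but generally not bit-identical in bfloat16, which is
# enough to break exact prepare-vs-convert comparisons.
print(torch.equal(cast_then_scale, scale_then_cast))
print((cast_then_scale.float() - scale_then_cast.float()).abs().max())
```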
14 changes: 12 additions & 2 deletions torchao/quantization/qat/linear.py
@@ -219,8 +219,12 @@ def _convert_qat_linear_8da4w(self, module: torch.nn.Module):
n_bit = 4
(qmin, qmax) = _get_qmin_qmax(n_bit)
(s, zp) = get_group_qparams_symmetric(
child.weight, n_bit, config.group_size
child.weight,
n_bit,
config.group_size,
precision=config.scale_precision,
)
zp = zp.to(config.zero_point_precision)
from torchao._executorch_ops import (
_quantized_decomposed_quantize_per_channel_group_wrapper,
)
@@ -258,6 +262,10 @@ class Int8DynActInt4WeightQATLinear(FakeQuantizedLinear):
groupsize: the number of elements in each quantized group for weights
precision: precision of weights
scales_precision: precision of per group scales and zero points

Note: we hardcode activation scales to use torch.fp32, but allow users to specify the weight scales (defaults to torch.fp32).
To get an exact numerical match with Int8DynamicActivationInt4WeightConfig, users must use the same dtype for both the weights
and the scales. Here scales_precision refers specifically to the weight scales only, not the activation scales.
"""

def __init__(
@@ -270,7 +278,9 @@ def __init__(
precision: torch.dtype = torch.float32,
scales_precision: torch.dtype = torch.float32,
) -> None:
activation_config = _get_8da4w_activation_config(scales_precision)
# Use torch.float32 to match torchao.quantization.quant_api._int8_asymm_per_token_quant,
# which is used in PTQ routines
activation_config = _get_8da4w_activation_config(torch.float32)
Review thread on this hunk:

Contributor: Is this change needed? Looks like scales_precision already defaults to torch.float32?

Contributor Author: In the PTQ routines, the scale precision for the dynamic activation quantization is always FP64 (see _int8_asymm_per_token_quant, https://fburl.com/uf90caon; in this diff I change it to FP32), even if the scale_dtype on the weight or the input dtype are set to something else.

Contributor: Why is that?

Contributor: I see, makes sense. We should definitely update the docstring to mention this then, something like: "Note: We hardcode activation scales to use torch.fp32, but allow users to specify the weight scales (defaults to torch.fp32). To get an exact numerical match with Int8DynamicActivationInt4WeightConfig, users must use the same dtype for both the weights and the scales. Here scales_precision refers specifically to the weight scales only, not the activation scales."

Contributor Author: I don't know if there is a great reason for it; it's just how LinearActivationQuantizedTensor works (https://github.com/pytorch/ao/blob/main/torchao/quantization/linear_activation_quantized_tensor.py#L24). Unlike the weights, torchao does not serialize parameters related to activation quantization right now. Instead it serializes a function that does the activation quantization (which is opaque); to support serialization, the function is added to torch.serialization.add_safe_globals. To support specifying information about the activation quantization, we would have to serialize it in LinearActivationQuantizedTensor, or register many variants of _int8_asymm_per_token_quant with different dtypes via torch.serialization.add_safe_globals. The former probably makes more sense. cc @jerryzh168

Contributor (jerryzh168, Apr 25, 2025): Is the question about the args being hardcoded with LinearActivationQuantizedTensor? We can use quant_kwargs to support passing args around, I think; it allows passing some args to the input quant function:

quantized_tensor = input_quant_func(input_tensor, **quant_kwargs)

weight_config = _get_8da4w_weight_config(groupsize, scales_precision)
super().__init__(
in_features,
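The thread above concludes that scales_precision on Int8DynActInt4WeightQATLinear only controls the weight scales, while activation scales are hardcoded to float32 to match the PTQ path. Below is a construction sketch; it is based on the signature visible in this diff, so keyword names other than groupsize, precision, and scales_precision are assumptions.

```python
import torch
from torchao.quantization.qat.linear import Int8DynActInt4WeightQATLinear

dtype = torch.bfloat16
qat_linear = Int8DynActInt4WeightQATLinear(
    in_features=256,
    out_features=128,
    bias=False,              # assumed keyword
    groupsize=32,
    precision=dtype,         # dtype of the fake-quantized weights
    scales_precision=dtype,  # weight scales/zero points only; activation scales stay float32
)
x = torch.randn(2, 256, dtype=dtype)
y = qat_linear(x)            # fake-quantized forward in `dtype`
```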