
Commit f782422

Allow setting eps in FakeQuantizeConfig
**Summary:** Today, we always use `torch.finfo(x.dtype).eps`, where `x` is the value we are trying to quantize, and there is no way for users to configure this. However, users lowering to XNNPACK may wish to use the following combination of dtypes during training for an end-to-end numerical match:

- input activations: bf16
- input activation scales: fp32
- input activation eps: `torch.finfo(torch.float32).eps`
- weight: bf16
- weight scales: bf16
- weight eps: `torch.finfo(torch.bfloat16).eps`

Adding `eps` to `FakeQuantizeConfig` enables such a use case (see the sketch below).

**Test Plan:** TBD
1 parent c9b9adc commit f782422
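
For illustration, here is a minimal sketch of how the new `eps` field could express the XNNPACK-oriented dtype combination described above. The activation config mirrors `_get_8da4w_activation_config` from this commit; the weight dtype (`torch.int4`) and `group_size=32` are illustrative assumptions rather than values taken from this change.

```python
import torch

from torchao.quantization.qat.api import FakeQuantizeConfig

# Input activations: int8 dynamic per-token fake quantization with fp32
# scales/zero points and an explicit fp32 eps.
activation_config = FakeQuantizeConfig(
    dtype=torch.int8,
    granularity="per_token",
    is_symmetric=False,
    is_dynamic=True,
    scale_precision=torch.float32,
    zero_point_precision=torch.float32,
    eps=torch.finfo(torch.float32).eps,
)

# Weights: bf16 scales with a matching bf16 eps. The 4-bit dtype and group
# size are placeholders; substitute whatever your torchao/PyTorch build
# supports for sub-byte dtypes.
weight_config = FakeQuantizeConfig(
    dtype=torch.int4,
    scale_precision=torch.bfloat16,
    zero_point_precision=torch.bfloat16,
    eps=torch.finfo(torch.bfloat16).eps,
    group_size=32,
)
```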

5 files changed, +25 -4 lines changed

torchao/quantization/GPTQ.py

Lines changed: 4 additions & 1 deletion
@@ -938,7 +938,10 @@ def linear_forward_8da4w(
     # TODO: in future add ability to specify activation_scale_dtype to PTQ configs
     # and enable similar change here
     x = per_token_dynamic_quant(
-        x, scale_dtype=torch.float32, zero_point_dtype=torch.float32
+        x,
+        scale_dtype=torch.float32,
+        zero_point_dtype=torch.float32,
+        eps=torch.finfo(torch.float32).eps,
     )

     # TODO: verify and remove following reshape code

torchao/quantization/qat/api.py

Lines changed: 3 additions & 0 deletions
@@ -85,6 +85,7 @@ class FakeQuantizeConfig:
     zero_point_domain: ZeroPointDomain
     is_dynamic: bool = True
     range_learning: bool = False
+    eps: Optional[float] = None

     def __init__(
         self,
@@ -96,6 +97,7 @@ def __init__(
         zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
         is_dynamic: bool = True,
         range_learning: bool = False,
+        eps: Optional[float] = None,
         *,
         group_size: Optional[int] = None,
         is_symmetric: Optional[bool] = None,
@@ -110,6 +112,7 @@ def __init__(
         self.zero_point_domain = zero_point_domain
         self.is_dynamic = is_dynamic
         self.range_learning = range_learning
+        self.eps = eps

         # Validate dtype
         all_dtypes = [torch.int8, torch.uint8]

torchao/quantization/qat/fake_quantizer.py

Lines changed: 3 additions & 0 deletions
@@ -81,6 +81,7 @@ def _per_token_forward(self, x: torch.Tensor):
             target_dtype=self.config.dtype,
             quant_min=qmin,
             quant_max=qmax,
+            eps=self.config.eps,
             scale_dtype=self.config.scale_precision,
             zero_point_dtype=self.config.zero_point_precision,
         )
@@ -117,13 +118,15 @@ def _per_channel_or_group_forward(self, x: torch.Tensor):
                 bit_width,
                 group_size,
                 scale_precision,
+                eps=self.config.eps,
             )
         else:
             (self.scale, self.zero_point) = get_groupwise_affine_qparams(
                 x,
                 bit_width,
                 group_size,
                 scale_precision,
+                eps=self.config.eps,
             )
         self.zero_point = self.zero_point.to(zero_point_precision)

torchao/quantization/qat/linear.py

Lines changed: 7 additions & 1 deletion
@@ -177,6 +177,8 @@ def __init__(
         self.padding_allowed: bool = padding_allowed
         self.precision: torch.dtype = precision
         self.scales_precision: torch.dtype = scales_precision
+        # TODO: generalize this
+        self.activation_scales_precision = torch.float32

     def prepare(
         self, model: torch.nn.Module, *args: Any, **kwargs: Any
@@ -247,7 +249,7 @@ def _convert_qat_linear_8da4w(self, module: torch.nn.Module):
                 self._convert_qat_linear_8da4w(child)

     def get_activation_fake_quantize_config(self) -> Optional[FakeQuantizeConfig]:
-        return _get_8da4w_activation_config(self.scales_precision)
+        return _get_8da4w_activation_config(self.activation_scales_precision)

     def get_weight_fake_quantize_config(self) -> Optional[FakeQuantizeConfig]:
         return _get_8da4w_weight_config(self.groupsize, self.scales_precision)
@@ -280,6 +282,7 @@ def __init__(
     ) -> None:
         # Use torch.float32 to match torchao.quantization.quant_api._int8_asymm_per_token_quant,
         # which is used in PTQ routines
+        # TODO: generalize this
         activation_config = _get_8da4w_activation_config(torch.float32)
         weight_config = _get_8da4w_weight_config(groupsize, scales_precision)
         super().__init__(
@@ -320,13 +323,16 @@ def _get_8da4w_activation_config(qparams_precision: torch.dtype) -> FakeQuantizeConfig:
     """
     Return the activation `FakeQuantizeConfig` for `Int8DynActInt4WeightQATQuantizer`.
     """
+    # TODO: generalize this
+    assert qparams_precision == torch.float32
     return FakeQuantizeConfig(
         dtype=torch.int8,
         granularity="per_token",
         is_symmetric=False,
         is_dynamic=True,
         scale_precision=qparams_precision,
         zero_point_precision=qparams_precision,
+        eps=torch.finfo(qparams_precision).eps,
     )

torchao/quantization/utils.py

Lines changed: 8 additions & 2 deletions
@@ -324,6 +324,7 @@ def get_groupwise_affine_qparams(
     dtype=torch.bfloat16,
     zero_point_domain=ZeroPointDomain.FLOAT,
     preserve_zero=False,
+    eps=None,
 ):
     if groupsize > w.shape[-1]:
         groupsize = w.shape[-1]
@@ -337,7 +338,8 @@
     block_size = (1, groupsize)
     quant_min = 0
     quant_max = 2**n_bit - 1
-    eps = 1e-6
+    if eps is None:
+        eps = 1e-6
     scale_dtype = dtype
     zero_point_dtype = (
         dtype if zero_point_domain != ZeroPointDomain.INT else torch.int32
@@ -529,6 +531,7 @@ def get_group_qparams_symmetric(
     groupsize=128,
     precision=torch.float32,
     mapping_type=MappingType.SYMMETRIC,
+    eps=None,
 ):
     # needed for GPTQ with padding
     if groupsize > w.shape[-1]:
@@ -539,7 +542,8 @@
     assert n_bit <= 8, f"unsupported n_bit: {n_bit}"

     block_size = (1, groupsize)
-    eps = torch.finfo(w.dtype).eps
+    if eps is None:
+        eps = torch.finfo(w.dtype).eps
     ranges = {}
     ranges[1] = (-1, 0)
     # generating ranges for bit 2 to 8
@@ -590,6 +594,7 @@ def per_token_dynamic_quant(
     input: torch.Tensor,
     scale_dtype: torch.dtype = torch.float32,
     zero_point_dtype: torch.dtype = torch.float32,
+    eps: Optional[float] = None,
 ) -> torch.Tensor:
     mapping_type = MappingType.ASYMMETRIC
     block_size = _get_per_token_block_size(input)
@@ -607,6 +612,7 @@
         quant_max,
         scale_dtype=scale_dtype,
         zero_point_dtype=zero_point_dtype,
+        eps=eps,
     )
     q = quantize_affine(
         input,

0 commit comments
