Commit 4e4dd12

Add cachemask variant for fake_quantize_affine
Summary: In QAT, we often wish to filter out the gradients corresponding to values outside the expected quantization range, for example:

```
q = _quantize_affine_no_dtype_cast(...)
dq = _dequantize_affine_no_dtype_check(...)
mask = torch.logical_and((q >= quant_min), (q <= quant_max))
grad = grad * mask
```

The existing `fake_quantize_affine` returns only the dequantized values, so callers do not have access to this mask. This commit adds a variant of that op that returns both the dequantized values and the mask, similar to `fake_quantize_per_tensor_affine_cachemask` in core.

Test Plan:
python test/quantization/test_quant_primitives.py -k test_fake_quantize_affine_cachemask
1 parent 1029df3 commit 4e4dd12
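For context, here is a minimal sketch (not part of this commit) of how a caller might consume the returned mask to filter gradients in a QAT straight-through estimator. The `_FakeQuantizeSTE` wrapper below is hypothetical, not an op defined by torchao:

```python
import torch
from torchao.quantization.quant_primitives import fake_quantize_affine_cachemask

class _FakeQuantizeSTE(torch.autograd.Function):
    """Straight-through estimator that drops gradients for outlier values."""

    @staticmethod
    def forward(ctx, input, block_size, scale, zero_point, quant_dtype, quant_min, quant_max):
        fq, mask = fake_quantize_affine_cachemask(
            input, block_size, scale, zero_point, quant_dtype, quant_min, quant_max,
        )
        ctx.save_for_backward(mask)
        return fq

    @staticmethod
    def backward(ctx, grad_output):
        (mask,) = ctx.saved_tensors
        # Zero out gradients wherever the intermediate quantized value fell
        # outside [quant_min, quant_max]; only `input` receives a gradient.
        return grad_output * mask, None, None, None, None, None, None
```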

2 files changed: 107 additions & 1 deletion
test/quantization/test_quant_primitives.py

Lines changed: 24 additions & 0 deletions
```
@@ -10,6 +10,7 @@
 import torch
 from torchao.quantization.quant_primitives import (
     fake_quantize_affine,
+    fake_quantize_affine_cachemask,
     quantize_affine,
     dequantize_affine,
     choose_qparams_affine,
@@ -523,5 +524,28 @@ def test_fake_quantize_affine(self):
         fake_quantized = fake_quantize_affine(input, block_size, scale, zero_point, dtype, quant_min, quant_max)
         torch.testing.assert_close(dequantized, fake_quantized)

+    @unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch version is 2.4 or lower")
+    def test_fake_quantize_affine_cachemask(self):
+        input = torch.randn(10, 10)
+
+        mapping_type = MappingType.SYMMETRIC
+        block_size = list(input.shape)
+        for i in range(len(block_size) - 1):
+            block_size[i] = 1
+        dtype = torch.int8
+        eps = 1e-5
+        quant_min = -127
+        quant_max = 127
+        scale, zero_point = choose_qparams_affine(input, mapping_type, block_size, dtype, quant_min, quant_max, eps=eps, scale_dtype=torch.float)
+
+        quantized = quantize_affine(input, block_size, scale, zero_point, dtype, quant_min, quant_max)
+        dequantized = dequantize_affine(quantized, block_size, scale, zero_point, dtype, quant_min, quant_max)
+        (fake_quantized, mask) = fake_quantize_affine_cachemask(
+            input, block_size, scale, zero_point, dtype, quant_min, quant_max,
+        )
+        expected_mask = torch.full(input.shape, True)
+        torch.testing.assert_close(dequantized, fake_quantized)
+        torch.testing.assert_close(expected_mask, mask)
+
 if __name__ == "__main__":
     unittest.main()
```

torchao/quantization/quant_primitives.py

Lines changed: 83 additions & 1 deletion
```
@@ -24,6 +24,7 @@
     "quantize_affine",
     "dequantize_affine",
     "fake_quantize_affine",
+    "fake_quantize_affine_cachemask",
 ]

 class MappingType(Enum):
@@ -411,6 +412,87 @@ def fake_quantize_affine(
         value during quantization
         default is ZeroPointDomain.INT
     """
+    (_, fq) = _do_fake_quantize_affine(
+        input,
+        block_size,
+        scale,
+        zero_point,
+        quant_dtype,
+        quant_min,
+        quant_max,
+        zero_point_domain,
+    )
+    return fq
+
+
+def fake_quantize_affine_cachemask(
+    input: torch.Tensor,
+    block_size: Tuple[int, ...],
+    scale: torch.Tensor,
+    zero_point: Optional[torch.Tensor],
+    quant_dtype: torch.dtype,
+    quant_min: Optional[int] = None,
+    quant_max: Optional[int] = None,
+    zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    General fake quantize op for quantization-aware training (QAT).
+    This is equivalent to calling `quantize_affine` + `dequantize_affine`
+    but without the dtype casts.
+
+    Note: Compared to `fake_quantize_affine`, this consumes more memory and
+    returns an additional outlier mask for intermediate quantized values.
+
+    Returns:
+        A 2-tuple of (
+            final fake quantized values,
+            outlier mask for intermediate quantized values
+        )
+
+    Args:
+        input (torch.Tensor): original float32, float16 or bfloat16 Tensor
+        block_size (Tuple[int, ...]): granularity of quantization, this means the size of the tensor elements that share the same qparam
+            e.g. when size is the same as the input tensor dimension, we are using per tensor quantization
+        scale (float): quantization parameter for affine quantization
+        zero_point (int): quantization parameter for affine quantization
+        quant_dtype (torch.dtype): desired quantized dtype for determining and validating quant_min and quant_max values.
+        quant_min (Optional[int]): minimum quantized value for output Tensor, if not specified, it will be derived from dtype
+        quant_max (Optional[int]): maximum quantized value for output Tensor, if not specified, it will be derived from dtype
+        zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be either integer or float
+            if zero_point is in integer domain, zero point is added to the quantized integer value during
+            quantization
+            if zero_point is in floating point domain, zero point is subtracted from the floating point (unquantized)
+            value during quantization
+            default is ZeroPointDomain.INT
+    """
+    (q, dq) = _do_fake_quantize_affine(
+        input,
+        block_size,
+        scale,
+        zero_point,
+        quant_dtype,
+        quant_min,
+        quant_max,
+        zero_point_domain,
+    )
+    mask = torch.logical_and((q >= quant_min), (q <= quant_max))
+    return (dq, mask)
+
+
+def _do_fake_quantize_affine(
+    input: torch.Tensor,
+    block_size: Tuple[int, ...],
+    scale: torch.Tensor,
+    zero_point: Optional[torch.Tensor],
+    quant_dtype: torch.dtype,
+    quant_min: Optional[int] = None,
+    quant_max: Optional[int] = None,
+    zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Helper function for `fake_quantize_affine` that returns both the
+    intermediate quantized values and the final dequantized values.
+    """
     input_dtype = input.dtype
     quant_min, quant_max = _get_and_check_qmin_qmax(quant_dtype, quant_min, quant_max)
     q = _quantize_affine_no_dtype_cast(
@@ -432,7 +514,7 @@ def fake_quantize_affine(
         zero_point_domain.name,
         output_dtype=input_dtype,
     )
-    return dq
+    return (q, dq)


 def choose_qparams_affine(
```
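Since both public ops delegate to `_do_fake_quantize_affine` with the same arguments, their fake quantized outputs are identical; the cachemask variant only adds the mask. A small sanity sketch under that assumption (the inputs and qparams here are illustrative, not taken from the commit):

```python
import torch
from torchao.quantization.quant_primitives import (
    fake_quantize_affine,
    fake_quantize_affine_cachemask,
)

input = torch.randn(10, 10)
block_size = (10, 10)  # per-tensor: one qparam shared across the whole tensor
scale = torch.tensor(0.02)
zero_point = torch.tensor(0)

fq = fake_quantize_affine(input, block_size, scale, zero_point, torch.int8)
fq_cached, mask = fake_quantize_affine_cachemask(
    input, block_size, scale, zero_point, torch.int8
)
torch.testing.assert_close(fq, fq_cached)  # same fake quantized values
assert mask.shape == input.shape and mask.dtype == torch.bool
```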
