
Commit 51a4505

add zero_point_domain arguments in documents

1 parent 836d508 commit 51a4505

File tree

2 files changed: +15 −1 lines changed

torchao/quantization/README.md

Lines changed: 13 additions & 0 deletions
@@ -202,6 +202,17 @@ We also have a unified quantized tensor subclass that implements how to get a qu
 #### Layouts
 We extended the `layout` concept to represent different packing formats for a tensor. `AffineQuantizedTensor` supports `plain` and `tensor_core_tiled` layout. `plain` layout is used for `int8_weight_only` and `int8_dynamic_activation_int8_weight` and also as a default layout. `tensor_core_tiled` layout is used for `int4_weight_only` quantization and is packing the weights in a format that is compatible with tinygemm [int4mm](https://github.com/pytorch/pytorch/blob/39357ba06f48cda7d293a4995aa5eba2a46598b5/aten/src/ATen/native/native_functions.yaml#L4138) kernels.
 
+### Zero Point Domains
+`ZeroPointDomain` controls the data type of zero points. `ZeroPointDomain.NONE` selects symmetric quantization (no zero point is stored), while `ZeroPointDomain.FLOAT` and `ZeroPointDomain.INT` select asymmetric quantization with float or int zero points; passing `None` selects the layout's default domain. For the detailed semantics of each zero point data type, refer to [the reference implementation](../../test/quantization/test_quant_primitives.py).
+The following support matrix shows which zero point domains each layout supports; it may be updated as backends change:
+
+| Layout | NONE (symmetric) | FLOAT | INT |
+|--------|------------------|-------|-----|
+| TensorCoreTiledLayout | Yes | Yes (default) | No |
+| Int4CPULayout | Yes | Yes (default) | No |
+| MarlinSparseLayout | No | No | Yes (default) |
+
+
 ### Full Affine Quantization Flow Example
 Let's use int4 weight only quantization that's targeting tinygemm int4 weight only quantized matmul
 as an example:
@@ -239,6 +250,8 @@ m_bf16 = torch.compile(m_bf16, mode='max-autotune')
 group_size = 32
 # only works for torch 2.4+
 quantize_(m, int4_weight_only(group_size=group_size))
+## If a different zero_point_domain is needed
+# quantize_(m, int4_weight_only(group_size=group_size, zero_point_domain=ZeroPointDomain.FLOAT))
 
 # temporary workaround for tensor subclass + torch.compile
 # NOTE: this is only needed for torch version < 2.5
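Taken together, the README changes above suggest the following end-to-end usage. This is a minimal sketch, not part of the commit: it assumes `quantize_` and `int4_weight_only` are importable from `torchao.quantization`, `ZeroPointDomain` from `torchao.quantization.quant_primitives`, and a CUDA device with bfloat16 support for the tinygemm path; exact import paths may vary across torchao versions.

```python
# Minimal sketch (not part of this commit). Import paths are assumptions and
# may differ between torchao versions.
import torch
from torchao.quantization import quantize_, int4_weight_only
from torchao.quantization.quant_primitives import ZeroPointDomain

# Toy module; int4 weight-only quantization applies to nn.Linear weights.
# The tinygemm int4 kernels expect bfloat16 weights on a CUDA device.
m = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).to(torch.bfloat16).cuda()

# Passing no zero_point_domain (i.e. None) lets the layout pick its default:
# FLOAT for the default TensorCoreTiledLayout, per the support matrix above.
quantize_(m, int4_weight_only(group_size=32))

# The domain can also be pinned explicitly; per the matrix,
# TensorCoreTiledLayout accepts FLOAT and NONE (symmetric) but not INT,
# while MarlinSparseLayout would instead require ZeroPointDomain.INT.
m2 = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).to(torch.bfloat16).cuda()
quantize_(m2, int4_weight_only(group_size=32, zero_point_domain=ZeroPointDomain.NONE))
```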

torchao/quantization/quant_api.py

Lines changed: 2 additions & 1 deletion
@@ -664,6 +664,7 @@ def int4_weight_only(
             size is more fine grained, choices are [256, 128, 64, 32]
         `layout`: layout type for quantized tensor, default is `TensorCoreTiledLayout(inner_k_tiles=8)`
         `use_hqq`: whether to use hqq or default quantization mode, default is False
+        `zero_point_domain`: data type of zero points, choices are [None (default), ZeroPointDomain.FLOAT, ZeroPointDomain.INT, ZeroPointDomain.NONE]
     """
 
     def apply_int4_weight_only_quant(weight):
@@ -679,14 +680,14 @@ def apply_int4_weight_only_quant(weight):
         quant_min = 0
         quant_max = 15
         eps = 1e-6
+        preserve_zero = LAYOUT_TO_PRESERVE_ZEROS[type(layout)]
         zero_point_dtype = torch.bfloat16
 
         nonlocal zero_point_domain
         assert type(layout) in LAYOUT_TO_ZERO_POINT_DOMAIN.keys(), f"Only support layout: {LAYOUT_TO_ZERO_POINT_DOMAIN.keys()}"
         if zero_point_domain is None:
             # the first value is the default one
             zero_point_domain = LAYOUT_TO_ZERO_POINT_DOMAIN[type(layout)][0]
-            preserve_zero = LAYOUT_TO_PRESERVE_ZEROS[type(layout)]
         else:
             assert zero_point_domain in LAYOUT_TO_ZERO_POINT_DOMAIN[type(layout)], f"Layout only supports {LAYOUT_TO_ZERO_POINT_DOMAIN[type(layout)]}"

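The `LAYOUT_TO_ZERO_POINT_DOMAIN` and `LAYOUT_TO_PRESERVE_ZEROS` tables referenced above are defined outside this diff. As a reading aid, here is a plausible reconstruction of their shape from the README's support matrix and the "first value is the default one" comment; the actual definitions in `quant_api.py` may differ.

```python
# Illustrative reconstruction (not from this diff) of the lookup tables used
# by apply_int4_weight_only_quant; the real definitions may differ.
from torchao.dtypes import TensorCoreTiledLayout, Int4CPULayout, MarlinSparseLayout
from torchao.quantization.quant_primitives import ZeroPointDomain

# The first entry of each list is the layout's default zero point domain,
# matching `LAYOUT_TO_ZERO_POINT_DOMAIN[type(layout)][0]` above.
LAYOUT_TO_ZERO_POINT_DOMAIN = {
    TensorCoreTiledLayout: [ZeroPointDomain.FLOAT, ZeroPointDomain.NONE],
    Int4CPULayout: [ZeroPointDomain.FLOAT, ZeroPointDomain.NONE],
    MarlinSparseLayout: [ZeroPointDomain.INT],
}

# Whether zero must be exactly representable after quantization; the Marlin
# sparse kernel uses integer zero points and preserves zero exactly (assumed).
LAYOUT_TO_PRESERVE_ZEROS = {
    TensorCoreTiledLayout: False,
    Int4CPULayout: False,
    MarlinSparseLayout: True,
}
```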