
Commit d953902

navsud authored and facebook-github-bot committed
create staticmethod for quantizing weights of QATLinear and QATEmbedding
Summary: For saving the quantized weights, we have been using ad hoc notebooks with copy-pasted code from the convert method. This has been a source of numerical discrepancies. To avoid this, this diff separates the weight quantization logic into standalone staticmethods so that it can be reused.

Reviewed By: jerryzh168

Differential Revision: D73201409
1 parent: 34421b1

2 files changed: 69 additions, 51 deletions

torchao/quantization/qat/embedding.py

Lines changed: 31 additions & 20 deletions
@@ -4,7 +4,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Any, Optional
+from typing import Any, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -15,9 +15,7 @@
 
 from .api import FakeQuantizeConfig
 from .fake_quantizer import FakeQuantizer
-from .utils import (
-    _get_qmin_qmax,
-)
+from .utils import _get_qmin_qmax
 
 
 class FakeQuantizedEmbedding(torch.nn.Embedding):
@@ -196,15 +194,40 @@ def convert(
         """
         self._convert_helper(model)
         return model
+
+    @staticmethod
+    def quantize_weights(
+        weight: torch.Tensor,
+        bit_width: int,
+        group_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Helper function to quantize weights
+        """
+        (qmin, qmax) = _get_qmin_qmax(bit_width)
+        (s, zp) = get_group_qparams_symmetric(
+            weight, bit_width, group_size
+        )
+        from torchao._executorch_ops import (
+            _quantized_decomposed_quantize_per_channel_group_wrapper,
+        )
+        q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
+            weight,
+            s,
+            zp,
+            qmin,
+            qmax,
+            torch.int8,
+            group_size,
+        )
+        return (q_weight, s, zp)
+
 
     def _convert_helper(self, module: torch.nn.Module):
         """
         Helper function to recursively swap `Int4WeightOnlyQATEmbedding`
         modules with `Int4WeightOnlyEmbedding`
         """
-        from torchao._executorch_ops import (
-            _quantized_decomposed_quantize_per_channel_group_wrapper,
-        )
 
         for name, child in module.named_children():
             if isinstance(child, Int4WeightOnlyQATEmbedding):
@@ -230,20 +253,8 @@ def _convert_helper(self, module: torch.nn.Module):
                 )
                 setattr(module, name, quantized_embedding)
 
+                q_weight, s, zp = self.quantize_weights(child.weight, self.bit_width, group_size)
                 # Load weights and qparams into quantized embedding
-                (qmin, qmax) = _get_qmin_qmax(self.bit_width)
-                (s, zp) = get_group_qparams_symmetric(
-                    child.weight, self.bit_width, group_size
-                )
-                q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
-                    child.weight,
-                    s,
-                    zp,
-                    qmin,
-                    qmax,
-                    torch.int8,
-                    group_size,
-                )
                 quantized_embedding.weight = q_weight
                 quantized_embedding.scale = s.to(scale_precision)
                 quantized_embedding.zero_point = zp.to(zero_point_precision)
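For reference, a minimal sketch of how the new embedding-side staticmethod can be called directly, replacing the ad hoc notebook code mentioned in the summary. The quantizer class name Int4WeightOnlyEmbeddingQATQuantizer and the example tensor shape are assumptions for illustration; the diff above only defines the method itself.

import torch
# Assumed host class of the new staticmethod; the diff shows only the method body.
from torchao.quantization.qat.embedding import Int4WeightOnlyEmbeddingQATQuantizer

# Hypothetical embedding table: 128 rows of dimension 256, 4-bit groups of 32
weight = torch.randn(128, 256)

# Reuse the same quantization path that convert() now uses internally
q_weight, scale, zero_point = Int4WeightOnlyEmbeddingQATQuantizer.quantize_weights(
    weight, bit_width=4, group_size=32
)
# q_weight is an int8 tensor holding 4-bit values in [qmin, qmax];
# scale and zero_point are the per-group qparams assigned to the quantized embedding.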

torchao/quantization/qat/linear.py

Lines changed: 38 additions & 31 deletions
@@ -4,33 +4,28 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Any, Optional
+from typing import Any, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
 
 from torchao.dtypes.utils import is_device
 from torchao.quantization.GPTQ import (
-    Int8DynActInt4WeightLinear,
-    WeightOnlyInt4Linear,
     _check_linear_int4_k,
     _replace_linear_8da4w,
     _replace_linear_int4,
     groupwise_affine_quantize_tensor,
+    Int8DynActInt4WeightLinear,
+    WeightOnlyInt4Linear,
 )
-from torchao.quantization.quant_primitives import (
-    TorchAODType,
-    ZeroPointDomain,
-)
+from torchao.quantization.quant_primitives import TorchAODType, ZeroPointDomain
 from torchao.quantization.unified import TwoStepQuantizer
 from torchao.quantization.utils import get_group_qparams_symmetric
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_6
 
 from .api import FakeQuantizeConfig
 from .fake_quantizer import FakeQuantizer
-from .utils import (
-    _get_qmin_qmax,
-)
+from .utils import _get_qmin_qmax
 
 
 class FakeQuantizedLinear(torch.nn.Linear):
@@ -197,6 +192,36 @@ def convert(
     ) -> torch.nn.Module:
         self._convert_qat_linear_8da4w(model)
         return model
+
+    @staticmethod
+    def quantize_weights(
+        weight: torch.Tensor,
+        group_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Helper function to quantize weights
+        """
+        # Load weights and qparams into quantized linear
+        n_bit = 4
+        (qmin, qmax) = _get_qmin_qmax(n_bit)
+        (s, zp) = get_group_qparams_symmetric(
+            weight, n_bit, group_size
+        )
+        from torchao._executorch_ops import (
+            _quantized_decomposed_quantize_per_channel_group_wrapper,
+        )
+
+        q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
+            weight,
+            s,
+            zp,
+            qmin,
+            qmax,
+            torch.int8,
+            group_size,
+        )
+        return (q_weight, s, zp)
+
 
     def _convert_qat_linear_8da4w(self, module: torch.nn.Module):
         """
@@ -215,28 +240,10 @@ def _convert_qat_linear_8da4w(self, module: torch.nn.Module):
                 )
                 setattr(module, name, quantized_linear)
 
-                # Load weights and qparams into quantized linear
-                n_bit = 4
-                (qmin, qmax) = _get_qmin_qmax(n_bit)
-                (s, zp) = get_group_qparams_symmetric(
-                    child.weight, n_bit, config.group_size
-                )
-                from torchao._executorch_ops import (
-                    _quantized_decomposed_quantize_per_channel_group_wrapper,
-                )
-
-                q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
-                    child.weight,
-                    s,
-                    zp,
-                    qmin,
-                    qmax,
-                    torch.int8,
-                    config.group_size,
-                )
+                q_weight, scales, zeros = self.quantize_weights(child.weight, config.group_size)
                 quantized_linear.weight = q_weight
-                quantized_linear.scales = s
-                quantized_linear.zeros = zp
+                quantized_linear.scales = scales
+                quantized_linear.zeros = zeros
                 if child.bias is not None:
                     quantized_linear.bias = child.bias
                 else:
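Likewise, a minimal sketch of the linear-side call; bit width is fixed at n_bit = 4 inside the method, so only group_size is passed. The quantizer class name Int8DynActInt4WeightQATQuantizer is an assumption inferred from the _convert_qat_linear_8da4w helper shown above.

import torch
# Assumed host class of the new staticmethod; the diff shows only the method body.
from torchao.quantization.qat.linear import Int8DynActInt4WeightQATQuantizer

# Hypothetical linear weight: out_features=64, in_features=256, group size 32
weight = torch.randn(64, 256)

q_weight, scales, zeros = Int8DynActInt4WeightQATQuantizer.quantize_weights(weight, group_size=32)

# These match what _convert_qat_linear_8da4w now assigns to the quantized module:
#   quantized_linear.weight = q_weight
#   quantized_linear.scales = scales
#   quantized_linear.zeros  = zeros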
