Commit 0bb73d8

Adding int4 tensor subclass

Summary: Adds int4 tensor subclass support and refactors the tensor subclass code to be easier to use with multiple subclasses. The new subclass uses the tinygemm int4 mixed-dtype gemm that was added to pytorch as _weight_int4pack_mm and _convert_weight_to_int4pack. Also adds .to support for tensor subclasses so that saving/loading via meta tensors works for int4.

Test Plan: python test/test.py

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: 591d99c
Pull Request resolved: #15

1 parent f2c7762 commit 0bb73d8
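For context, the two ATen ops named in the summary are the int4 path this subclass wraps. Below is a minimal sketch of how they fit together; the op names come from the commit message, while the sizes, dtypes, and scale/zero-point layout are assumptions for illustration (the subclass's actual packing logic lives in torchao/quantization/subclass.py):

    import torch

    # hypothetical sizes, chosen only for illustration
    n, k = 32, 1024                    # out_features, in_features
    groupsize, inner_k_tiles = 128, 8

    # int4 weight values in [0, 15] plus per-group scales and zero points
    # (this layout is an assumption, not necessarily the subclass's exact format)
    w_int4 = torch.randint(0, 16, (n, k), dtype=torch.int32, device="cuda")
    scales_and_zeros = torch.rand(k // groupsize, n, 2, dtype=torch.bfloat16, device="cuda")

    # pack the weight into tinygemm's layout, then run the mixed-dtype gemm
    w_packed = torch._convert_weight_to_int4pack(w_int4, inner_k_tiles)
    x = torch.rand(4, k, dtype=torch.bfloat16, device="cuda")
    y = torch._weight_int4pack_mm(x, w_packed, groupsize, scales_and_zeros)  # shape (4, n)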

File tree

6 files changed: +493, -139 lines

test/test.py

Lines changed: 98 additions & 45 deletions

@@ -11,7 +11,7 @@
 import torch
 import torch.nn as nn
 from torch._inductor.utils import run_and_get_code
-
+from torch._dynamo import config
 from torch.ao.quantization import MinMaxObserver, QConfigMapping
 
 from torchao.quantization.dynamic_quant import (
@@ -21,7 +21,8 @@
     apply_dynamic_quant,
     apply_weight_only_int8_quant,
     change_linear_weights_to_dqtensors,
-    change_linear_weights_to_woqtensors,
+    change_linear_weights_to_int8woqtensors,
+    change_linear_weights_to_int4woqtensors,
     _replace_with_custom_fn_if_matches_filter,
 )
 from torchao.quantization.quant_primitives import (
@@ -42,8 +43,9 @@
     swap_linear_with_smooth_fq_linear,
 )
 from torchao.quantization.subclass import (
-    DynamicallyQuantizedLinearWeight,
-    WeightOnlyQuantizedLinearWeight
+    Int8DynamicallyQuantizedLinearWeight,
+    Int8WeightOnlyQuantizedLinearWeight,
+    Int4WeightOnlyQuantizedLinearWeight
 )
 from torchao.quantization.utils import (
     apply_logging_hook,
@@ -59,6 +61,7 @@
 import os
 
 torch.manual_seed(0)
+config.cache_size_limit = 100
 
 
 class SmoothquantUnitTest(unittest.TestCase):
@@ -788,62 +791,108 @@ def test_qlinear_per_channel_numerics_cuda(self):
 
 
 class TestSubclass(unittest.TestCase):
+    def _test_dequantize_impl(
+        self,
+        test_subclass_from_float,
+        min_sqnr=35,
+        test_dtype=torch.bfloat16,
+        test_shape=[32, 64, 64],
+    ):
+        m, k, n = test_shape
+        lin = torch.nn.Linear(k, n, device="cuda").to(test_dtype)
+        w = lin.weight.detach()
+        lin.weight = torch.nn.Parameter(
+            test_subclass_from_float(lin.weight), requires_grad=False
+        )
+        self.assertGreater(SQNR(w, lin.weight.dequantize()), min_sqnr, f"{lin.weight.__class__.__name__} failed dtype={test_dtype}")
+        self.assertGreater(SQNR(w.t(), lin.weight.t().dequantize()), min_sqnr, f"{lin.weight.__class__.__name__} failed transpose on dtype={test_dtype}")
+
+    def test_dequantize_int8_dynamic_quant_subclass(self):
+        for test_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            self._test_dequantize_impl(Int8DynamicallyQuantizedLinearWeight.from_float, 35, test_dtype)
+
+    def test_dequantize_int8_weight_only_quant_subclass(self):
+        for test_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            self._test_dequantize_impl(Int8WeightOnlyQuantizedLinearWeight.from_float, 35, test_dtype)
+
+    def test_dequantize_int4_weight_only_quant_subclass(self):
+        self._test_dequantize_impl(Int4WeightOnlyQuantizedLinearWeight.from_float, 15, test_shape=[1, 1024, 8])
+        for groupsize in [256, 128]:
+            for inner_k_tiles in [8, 2]:
+                for m in [1, 256]:
+                    self._test_dequantize_impl(lambda w: Int4WeightOnlyQuantizedLinearWeight.from_float(w, groupsize, inner_k_tiles), 15, test_shape=[m, 256, 8])
+
     def _test_lin_weight_subclass_impl(self,
-        test_subclass,
+        test_subclass_from_float,
         min_sqnr=35,
-        test_dtypes=[torch.float32, torch.float16, torch.bfloat16],
-        test_shape=[32, 64, 32]
+        test_dtype=torch.bfloat16,
+        test_shape=[32, 64, 32],
     ):
-        for test_dtype in test_dtypes:
-            m, k, n = test_shape
-            x = torch.randn(m, k, device="cuda", dtype=test_dtype)
-            lin = torch.nn.Linear(k, n, device="cuda").to(test_dtype)
-            ref_f = lin(x)
-
-            lin.weight = torch.nn.Parameter(
-                test_subclass.from_float(lin.weight), requires_grad=False
-            )
-            test = lin(x)
-            self.assertGreater(SQNR(ref_f, test), min_sqnr, f"{test_subclass.__name__} failed, no compile, dtype={test_dtype}, (m, k, n)={test_shape}")
-            lin_comp = torch.compile(lin, mode='max-autotune')
-            test_comp = lin_comp(x)
-            self.assertGreater(SQNR(ref_f, test_comp), min_sqnr, f"{test_subclass.__name__} failed at compile with dtype={test_dtype}, (m, k, n)={test_shape}")
+        m, k, n = test_shape
+        x = torch.randn(m, k, device="cuda", dtype=test_dtype)
+        lin = torch.nn.Linear(k, n, device="cuda").to(test_dtype)
+        ref_f = lin(x)
+
+        lin.weight = torch.nn.Parameter(
+            test_subclass_from_float(lin.weight), requires_grad=False
+        )
+        test = lin(x)
+        self.assertGreater(SQNR(ref_f, test), min_sqnr, f"{lin.weight.__class__.__name__} failed, no compile, dtype={test_dtype}, (m, k, n)={test_shape}")
+        lin_comp = torch.compile(lin, mode='max-autotune')
+        test_comp = lin_comp(x)
+        self.assertGreater(SQNR(ref_f, test_comp), min_sqnr, f"{lin.weight.__class__.__name__} failed at compile with dtype={test_dtype}, (m, k, n)={test_shape}")
 
     def test_int8_dynamic_quant_subclass(self):
-        self._test_lin_weight_subclass_impl(DynamicallyQuantizedLinearWeight, 35)
+        for test_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            self._test_lin_weight_subclass_impl(Int8DynamicallyQuantizedLinearWeight.from_float, 35, test_dtype)
 
     def test_int8_weight_only_quant_subclass(self):
-        self._test_lin_weight_subclass_impl(WeightOnlyQuantizedLinearWeight, 40)
+        for test_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            self._test_lin_weight_subclass_impl(Int8WeightOnlyQuantizedLinearWeight.from_float, 40, test_dtype)
+
+    def test_int4_weight_only_quant_subclass(self):
+        self._test_lin_weight_subclass_impl(Int4WeightOnlyQuantizedLinearWeight.from_float, 10, test_shape=[1, 1024, 8])
+        for groupsize in [128, 64]:
+            for inner_k_tiles in [4, 2]:
+                for m in [1, 256]:
+                    self._test_lin_weight_subclass_impl(lambda w: Int4WeightOnlyQuantizedLinearWeight.from_float(w, groupsize, inner_k_tiles), 10, test_shape=[m, 256, 8])
 
     @torch.no_grad()
     def _test_lin_weight_subclass_api_impl(
         self,
         api,
         min_sqnr=35,
-        test_dtypes=[torch.float32, torch.float16, torch.bfloat16],
+        test_dtype=torch.bfloat16,
         test_shape=[32, 64, 32]
     ):
-        for test_dtype in test_dtypes:
-            m, k, n = test_shape
-            x = torch.randn(m, k, device="cuda", dtype=test_dtype)
-            mod = nn.Sequential(
-                nn.Linear(k, n, device="cuda"), nn.ReLU(), nn.Linear(n, n, device="cuda")
-            ).to(test_dtype)
-            ref_f = mod(x)
-            api(mod)
-            test = mod(x)
-            self.assertGreater(SQNR(ref_f, test), min_sqnr, f"{api.__name__} failed, no compile dtype={test_dtype}, (m, k, n)={test_shape}")
-
-            mod_qc = torch.compile(mod, mode="max-autotune")
-            test_comp = mod_qc(x)
-            self.assertGreater(SQNR(ref_f, test_comp), min_sqnr, f"{api.__name__} failed when compiled with dtype={test_dtype}, (m, k, n)={test_shape}")
+        m, k, n = test_shape
+        x = torch.randn(m, k, device="cuda", dtype=test_dtype)
+        mod = nn.Sequential(
+            nn.Linear(k, n, device="cuda"), nn.ReLU(), nn.Linear(n, n, device="cuda")
+        ).to(test_dtype)
+        ref_f = mod(x)
+        api(mod)
+        test = mod(x)
+        self.assertGreater(SQNR(ref_f, test), min_sqnr, f"{api.__name__} failed, no compile dtype={test_dtype}, (m, k, n)={test_shape}")
+        mod_qc = torch.compile(mod, mode="max-autotune")
+        test_comp = mod_qc(x)
+        self.assertGreater(SQNR(ref_f, test_comp), min_sqnr, f"{api.__name__} failed when compiled with dtype={test_dtype}, (m, k, n)={test_shape}")
 
 
     def test_int8_dynamic_quant_subclass_api(self):
-        self._test_lin_weight_subclass_api_impl(change_linear_weights_to_dqtensors, 35)
+        for test_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            self._test_lin_weight_subclass_api_impl(change_linear_weights_to_dqtensors, 35)
 
     def test_int8_weight_only_quant_subclass_api(self):
-        self._test_lin_weight_subclass_api_impl(change_linear_weights_to_woqtensors, 40)
+        for test_dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            self._test_lin_weight_subclass_api_impl(change_linear_weights_to_int8woqtensors, 40)
+
+    def test_int4_weight_only_quant_subclass_api(self):
+        self._test_lin_weight_subclass_api_impl(change_linear_weights_to_int4woqtensors, 15, test_shape=[1, 1024, 256])
+        for groupsize in [64, 32]:
+            for inner_k_tiles in [4, 2]:
+                kwargs = {"groupsize": groupsize, "inner_k_tiles": inner_k_tiles}
+                self._test_lin_weight_subclass_api_impl(lambda mod: change_linear_weights_to_int4woqtensors(mod, **kwargs), 15, test_shape=[256, 256, 8])
 
 class TestDynamicQuant(unittest.TestCase):
     def test_dynamic_quant(self):
@@ -906,7 +955,7 @@ def test_weight_only_quant_use_mixed_mm(self):
 
 class TestSaveLoadMeta(unittest.TestCase):
     @torch.no_grad()
-    def _test_handle_save_load_meta_impl(self, api):
+    def _test_handle_save_load_meta_impl(self, api, min_sqnr=35):
         m, k, n = 32, 64, 32
         class test_model(nn.Module):
             def __init__(self):
@@ -934,7 +983,7 @@ def forward(self, x):
         model_qc = torch.compile(model, mode="max-autotune")
         ref_q = model_qc(x).detach()
 
-        assert SQNR(ref_f, ref_q) > 35
+        assert SQNR(ref_f, ref_q) > min_sqnr
 
         # load model structure
         with torch.device('meta'):
@@ -951,16 +1000,20 @@ def forward(self, x):
         model_qc = torch.compile(model, mode="max-autotune")
         test = model_qc(x).detach()
 
-        assert SQNR(ref_f, test) > 35
+        assert SQNR(ref_f, test) > min_sqnr
         self.assertTrue(torch.equal(ref_q, test))
 
     @torch.no_grad()
     def test_save_load_dqtensors(self):
        self._test_handle_save_load_meta_impl(change_linear_weights_to_dqtensors)
 
     @torch.no_grad()
-    def test_save_load_woqtensors(self):
-        self._test_handle_save_load_meta_impl(change_linear_weights_to_woqtensors)
+    def test_save_load_int8woqtensors(self):
+        self._test_handle_save_load_meta_impl(change_linear_weights_to_int8woqtensors)
+
+    @torch.no_grad()
+    def test_save_load_int4woqtensors(self):
+        self._test_handle_save_load_meta_impl(change_linear_weights_to_int4woqtensors, 20)
 
 class TorchCompileUnitTest(unittest.TestCase):
     def test_fullgraph(self):
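A note on the thresholds used in these tests: SQNR gates how much error each scheme may introduce, with tighter bounds for int8 (35 to 40 dB) than for the coarser int4 kernels (10 to 20 dB). As a point of reference, here is a minimal sketch of such a metric, assuming the usual signal-to-quantization-noise-ratio definition (the SQNR used by test.py is presumably torchao's compute_error helper; this standalone version is illustrative only):

    import torch

    def sqnr(signal: torch.Tensor, noisy: torch.Tensor) -> float:
        # 20 * log10(||x|| / ||x - y||) in dB; higher means less quantization error
        noise = torch.linalg.norm(signal.float() - noisy.float())
        return (20 * torch.log10(torch.linalg.norm(signal.float()) / noise)).item()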

torchao/quantization/__init__.py

Lines changed: 5 additions & 3 deletions

@@ -16,7 +16,8 @@
     "apply_weight_only_int8_quant",
     "apply_dynamic_quant",
     "change_linear_weights_to_dqtensors",
-    "change_linear_weights_to_woqtensors",
+    "change_linear_weights_to_int8woqtensors",
+    "change_linear_weights_to_int4woqtensors",
     "insert_subclass",
     "safe_int_mm",
     "dynamically_quantize_per_tensor",
@@ -34,8 +35,9 @@
     "swap_linear_with_smooth_fq_linear",
     "smooth_fq_linear_to_inference",
     "set_smooth_fq_attribute",
-    "DynamicallyQuantizedLinearWeight",
-    "WeightOnlyQuantizedLinearWeight",
+    "Int8DynamicallyQuantizedLinearWeight",
+    "Int8WeightOnlyQuantizedLinearWeight",
+    "Int4WeightOnlyQuantizedLinearWeight",
     "log_with_rank",
     "clear_logs",
     "compute_error",

torchao/quantization/quant_api.py

Lines changed: 24 additions & 10 deletions

@@ -20,8 +20,9 @@
     DynamicallyPerAxisQuantizedLinear,
 )
 from .subclass import (
-    DynamicallyQuantizedLinearWeight,
-    WeightOnlyQuantizedLinearWeight,
+    Int8DynamicallyQuantizedLinearWeight,
+    Int8WeightOnlyQuantizedLinearWeight,
+    Int4WeightOnlyQuantizedLinearWeight,
 )
 from .weight_only import (
     WeightOnlyInt8QuantLinear,
@@ -31,7 +32,8 @@
     "apply_weight_only_int8_quant",
     "apply_dynamic_quant",
     "change_linear_weights_to_dqtensors",
-    "change_linear_weights_to_woqtensors",
+    "change_linear_weights_to_int8woqtensors",
+    "change_linear_weights_to_int4woqtensors",
 ]
 
 
@@ -77,34 +79,46 @@ def apply_dynamic_quant(model):
         lambda mod, fqn: isinstance(mod, torch.nn.Linear),
     )
 
-def _get_subclass_inserter(cls):
+def _get_subclass_inserter(cls, **kwargs):
     def insert_subclass(lin):
         lin.weight = torch.nn.Parameter(
-            cls.from_float(lin.weight), requires_grad=False
+            cls.from_float(lin.weight, **kwargs), requires_grad=False
         )
         return lin
     return insert_subclass
 
 def change_linear_weights_to_dqtensors(model):
     """
-    Converts all linear weight tensors to the `DynamicallyQuantizedLinearWeight`
+    Converts all linear weight tensors to the `Int8DynamicallyQuantizedLinearWeight`
     Tensor subclass, effectively applying the same form of quantization
     as apply_dynamic_quant while not modifying the linear modules.
     """
     _replace_with_custom_fn_if_matches_filter(
         model,
-        _get_subclass_inserter(DynamicallyQuantizedLinearWeight),
+        _get_subclass_inserter(Int8DynamicallyQuantizedLinearWeight),
         lambda mod, fqn: isinstance(mod, torch.nn.Linear)
     )
 
-def change_linear_weights_to_woqtensors(model):
+def change_linear_weights_to_int8woqtensors(model):
     """
-    Converts all linear weight tensors to the `WeightOnlyQuantizedLinearWeight`
+    Converts all linear weight tensors to the `Int8WeightOnlyQuantizedLinearWeight`
     Tensor subclass, effectively applying the same form of quantization
     as apply_dynamic_quant while not modifying the linear modules.
     """
     _replace_with_custom_fn_if_matches_filter(
         model,
-        _get_subclass_inserter(WeightOnlyQuantizedLinearWeight),
+        _get_subclass_inserter(Int8WeightOnlyQuantizedLinearWeight),
+        lambda mod, fqn: isinstance(mod, torch.nn.Linear)
+    )
+
+def change_linear_weights_to_int4woqtensors(model, **kwargs):
+    """
+    Converts all linear weight tensors to the `Int4WeightOnlyQuantizedLinearWeight`
+    Tensor subclass, effectively applying the same form of quantization
+    as apply_dynamic_quant while not modifying the linear modules.
+    """
+    _replace_with_custom_fn_if_matches_filter(
+        model,
+        _get_subclass_inserter(Int4WeightOnlyQuantizedLinearWeight, **kwargs),
         lambda mod, fqn: isinstance(mod, torch.nn.Linear)
     )
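Putting the new API together, a minimal end-to-end sketch; the model and sizes here are made up, and the groupsize/inner_k_tiles kwargs mirror the values exercised in the tests:

    import torch
    import torch.nn as nn
    from torchao.quantization import change_linear_weights_to_int4woqtensors

    model = nn.Sequential(
        nn.Linear(1024, 256), nn.ReLU(), nn.Linear(256, 8)
    ).cuda().to(torch.bfloat16)

    # swaps each nn.Linear weight for an Int4WeightOnlyQuantizedLinearWeight in place
    change_linear_weights_to_int4woqtensors(model, groupsize=128, inner_k_tiles=8)

    x = torch.randn(16, 1024, device="cuda", dtype=torch.bfloat16)
    y_eager = model(x)                                     # eager still works
    y_comp = torch.compile(model, mode="max-autotune")(x)  # and under torch.compile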
