Merged

Changes from 20 commits (32 commits in total)
936b206
enable torchao quantization on CPU
jiqing-feng Feb 12, 2025
4759045
fix int4
jiqing-feng Feb 12, 2025
5e51a1c
fix format
jiqing-feng Feb 12, 2025
6b3c076
enable CPU torchao tests
jiqing-feng Feb 12, 2025
2bf0ba2
fix cuda tests
jiqing-feng Feb 12, 2025
36c6534
fix cpu tests
jiqing-feng Feb 12, 2025
872c778
update tests
jiqing-feng Feb 13, 2025
76badb1
fix style
jiqing-feng Feb 13, 2025
c964c6f
fix cuda tests
jiqing-feng Feb 13, 2025
92b3ff1
Merge branch 'main' into torchao
jiqing-feng Feb 13, 2025
fcf3e9e
fix torchao available
jiqing-feng Feb 13, 2025
a871b35
fix torchao available
jiqing-feng Feb 13, 2025
65b7de3
fix torchao config cannot convert to json
jiqing-feng Feb 13, 2025
6847b7c
Merge branch 'main' into torchao
jiqing-feng Feb 14, 2025
33da778
fix docs
jiqing-feng Feb 14, 2025
8b9b6b1
Merge branch 'main' into torchao
jiqing-feng Feb 17, 2025
50d48c2
Merge branch 'main' into torchao
jiqing-feng Feb 18, 2025
f5c2c8d
Merge branch 'main' into torchao
jiqing-feng Feb 19, 2025
e1bdbd7
rm to_dict to rebase
jiqing-feng Feb 19, 2025
a880c2c
Merge branch 'main' into torchao
MekkCyber Feb 19, 2025
49015bf
limited torchao version for CPU
jiqing-feng Feb 20, 2025
81897c4
Merge branch 'main' into torchao
jiqing-feng Feb 20, 2025
135bbab
fix format
jiqing-feng Feb 20, 2025
443b1cf
Merge branch 'main' into torchao
jiqing-feng Feb 21, 2025
248e065
fix skip
jiqing-feng Feb 21, 2025
a71d8b9
fix format
jiqing-feng Feb 21, 2025
9b3053a
Merge branch 'main' into torchao
SunMarc Feb 21, 2025
e2fef70
Update src/transformers/testing_utils.py
jiqing-feng Feb 24, 2025
66b5751
Merge branch 'main' into torchao
jiqing-feng Feb 24, 2025
d356bf6
Merge branch 'main' into torchao
jiqing-feng Feb 25, 2025
9d529ca
fix cpu test
jiqing-feng Feb 25, 2025
a633f27
fix format
jiqing-feng Feb 25, 2025
2 changes: 1 addition & 1 deletion docs/source/en/quantization/overview.md
@@ -59,7 +59,7 @@ Use the table below to help you decide which quantization method to use.
| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 <sub>5</sub> | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
| [torchao](./torchao.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟡 <sub>5</sub> | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ |
| [SpQR](./spqr.md) | 🔴 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 3 | 🔴 | 🟢 | 🟢 | https://github.com/Vahe1994/SpQR/ |
| [FINEGRAINED_FP8](./finegrained_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | |
8 changes: 5 additions & 3 deletions docs/source/en/quantization/torchao.md
@@ -22,6 +22,8 @@ pip install --upgrade torch torchao transformers

By default, the weights are loaded in full precision (torch.float32) regardless of the data type the weights are actually stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file and automatically use the most memory-optimal data type.

To run the following code on CPU (even when a GPU is available), change to `device_map="cpu"` and `quantization_config = TorchAoConfig("int4_weight_only", group_size=128, layout=Int4CPULayout())`, where `layout` comes from `from torchao.dtypes import Int4CPULayout`, which is only available in torchao 0.8.0 and higher.
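
For reference, a minimal CPU sketch of that setup might look like the following (assuming torchao >= 0.8.0 for `Int4CPULayout`; the checkpoint name is only illustrative):

```py
# Minimal CPU sketch — assumes torchao >= 0.8.0 so that Int4CPULayout is available
from torchao.dtypes import Int4CPULayout
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

model_name = "meta-llama/Meta-Llama-3-8B"  # illustrative checkpoint; substitute any causal LM

quantization_config = TorchAoConfig("int4_weight_only", group_size=128, layout=Int4CPULayout())
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cpu",  # force CPU placement even when a GPU is available
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
input_ids = tokenizer("What are we having for dinner?", return_tensors="pt").to(quantized_model.device)
output = quantized_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```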

```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
@@ -34,7 +36,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="

tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)

# auto-compile the quantized model with `cache_implementation="static"` to get speedup
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -58,7 +60,7 @@ def benchmark_fn(f, *args, **kwargs):
MAX_NEW_TOKENS = 1000
print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))

bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16)
bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)
output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") # auto-compile
print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))

@@ -80,7 +82,7 @@ quantized_model.save_pretrained(output_dir, safe_serialization=False)

# load quantized model
ckpt_id = "llama3-8b-int4wo-128" # or huggingface hub model id
loaded_quantized_model = AutoModelForCausalLM.from_pretrained(ckpt_id, device_map="cuda")
loaded_quantized_model = AutoModelForCausalLM.from_pretrained(ckpt_id, device_map="auto")


# confirm the speedup
12 changes: 11 additions & 1 deletion src/transformers/utils/quantization_config.py
@@ -1534,7 +1534,17 @@ def _get_torchao_quant_type_to_method(self):

def get_apply_tensor_subclass(self):
_STR_TO_METHOD = self._get_torchao_quant_type_to_method()
return _STR_TO_METHOD[self.quant_type](**self.quant_type_kwargs)
quant_type_kwargs = self.quant_type_kwargs.copy()
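# On CPU, int4 weight-only quantization needs the Int4CPULayout introduced in torchao 0.8.0; on CUDA the default layout is kept.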
if (
not torch.cuda.is_available()
and is_torchao_available()
and self.quant_type == "int4_weight_only"
and version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0")
):
from torchao.dtypes import Int4CPULayout

quant_type_kwargs["layout"] = Int4CPULayout()
return _STR_TO_METHOD[self.quant_type](**quant_type_kwargs)

def __repr__(self):
config_dict = self.to_dict()
136 changes: 74 additions & 62 deletions tests/quantization/torchao_integration/test_torchao.py
@@ -22,7 +22,6 @@
require_torch_gpu,
require_torch_multi_gpu,
require_torchao,
torch_device,
)
from transformers.utils import is_torch_available, is_torchao_available

@@ -33,16 +32,19 @@
if is_torchao_available():
from torchao.dtypes import (
AffineQuantizedTensor,
Int4CPULayout,
TensorCoreTiledLayout,
)


def check_torchao_quantized(test_module, qlayer, batch_size=1, context_size=1024):
def check_torchao_int4_wo_quantized(test_module, qlayer):
weight = qlayer.weight
test_module.assertTrue(isinstance(weight, AffineQuantizedTensor))
test_module.assertEqual(weight.quant_min, 0)
test_module.assertEqual(weight.quant_max, 15)
test_module.assertTrue(isinstance(weight.layout, TensorCoreTiledLayout))
if is_torchao_available():
test_module.assertTrue(isinstance(weight, AffineQuantizedTensor))
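# int4 weight-only tensors use a device-specific layout: Int4CPULayout on CPU, TensorCoreTiledLayout on CUDA.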
layout = Int4CPULayout if weight.device.type == "cpu" else TensorCoreTiledLayout
test_module.assertTrue(isinstance(weight.tensor_impl._layout, layout))


def check_forward(test_module, model, batch_size=1, context_size=1024):
@@ -53,7 +55,6 @@ def check_forward(test_module, model, batch_size=1, context_size=1024):
test_module.assertEqual(out.shape[1], context_size)


@require_torch_gpu
@require_torchao
class TorchAoConfigTest(unittest.TestCase):
def test_to_dict(self):
@@ -95,15 +96,16 @@ def test_json_serializable(self):
quantization_config.to_json_string(use_diff=False)


@require_torch_gpu
@require_torchao
class TorchAoTest(unittest.TestCase):
input_text = "What are we having for dinner?"
max_new_tokens = 10

EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = "cpu"
quant_scheme_kwargs = (
{"group_size": 32, "layout": Int4CPULayout()} if is_torchao_available() else {"group_size": 32}
)

def tearDown(self):
gc.collect()
@@ -114,20 +116,20 @@ def test_int4wo_quant(self):
"""
Simple LLM model testing int4 weight only quantization
"""
quant_config = TorchAoConfig("int4_weight_only", group_size=32)
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)

# Note: we quantize the bfloat16 model on the fly to int4
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=torch.bfloat16,
device_map=torch_device,
device_map=self.device,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)

check_torchao_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)
check_torchao_int4_wo_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)

input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@@ -136,46 +138,51 @@ def test_int4wo_quant_bfloat16_conversion(self):
"""
Testing the dtype of model will be modified to be bfloat16 for int4 weight only quantization
"""
quant_config = TorchAoConfig("int4_weight_only", group_size=32)
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)

# Note: we quantize the bfloat16 model on the fly to int4
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=None,
device_map=torch_device,
device_map=self.device,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)

check_torchao_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)
check_torchao_int4_wo_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)

input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

@require_torch_multi_gpu
def test_int4wo_quant_multi_gpu(self):
def test_int8_dynamic_activation_int8_weight_quant(self):
"""
Simple test that checks if the quantized model int4 wieght only is working properly with multiple GPUs
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUS
Simple LLM model testing int8_dynamic_activation_int8_weight
"""
quant_config = TorchAoConfig("int8_dynamic_activation_int8_weight")

quant_config = TorchAoConfig("int4_weight_only", group_size=32)
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=torch.bfloat16,
device_map="auto",
device_map=self.device,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)

self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
EXPECTED_OUTPUT = [
"What are we having for dinner?\n\nJessica: (smiling)",
"What are we having for dinner?\n\nJess: (smiling) I",
]
self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)


@require_torch_gpu
class TorchAoGPUTest(TorchAoTest):
device = "cuda"
quant_scheme_kwargs = {"group_size": 32}

def test_int4wo_offload(self):
"""
@@ -221,35 +228,37 @@ def test_int4wo_offload(self):
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)

input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
EXPECTED_OUTPUT = "What are we having for dinner?\n- 2. What is the temperature outside"

self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)

def test_int8_dynamic_activation_int8_weight_quant(self):
@require_torch_multi_gpu
def test_int4wo_quant_multi_gpu(self):
"""
Simple LLM model testing int8_dynamic_activation_int8_weight
Simple test that checks if the int4 weight-only quantized model is working properly with multiple GPUs.
Set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs.
"""
quant_config = TorchAoConfig("int8_dynamic_activation_int8_weight")

# Note: we quantize the bfloat16 model on the fly to int4
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=torch_device,
torch_dtype=torch.bfloat16,
device_map="auto",
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)

input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)


@require_torch_gpu
@require_torchao
class TorchAoSerializationTest(unittest.TestCase):
input_text = "What are we having for dinner?"
@@ -258,8 +267,11 @@ class TorchAoSerializationTest(unittest.TestCase):
# TODO: investigate why we don't have the same output as the original model for this test
SERIALIZED_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quant_scheme, quant_scheme_kwargs = "int4_weight_only", {"group_size": 32}
device = "cuda:0"
quant_scheme = "int4_weight_only"
quant_scheme_kwargs = (
{"group_size": 32, "layout": Int4CPULayout()} if is_torchao_available() else {"group_size": 32}
)
device = "cpu"

# called only once for all test in this class
@classmethod
@@ -291,9 +303,9 @@ def check_serialization_expected_output(self, device, expected_output):
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname, safe_serialization=False)
loaded_quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, torch_dtype=torch.bfloat16, device_map=self.device
self.model_name, torch_dtype=torch.bfloat16, device_map=device
)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(device)

output = loaded_quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), expected_output)
@@ -302,46 +314,46 @@ def test_serialization_expected_output(self):
self.check_serialization_expected_output(self.device, self.SERIALIZED_EXPECTED_OUTPUT)


class TorchAoSerializationW8A8Test(TorchAoSerializationTest):
class TorchAoSerializationW8A8CPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
ORIGINAL_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
SERIALIZED_EXPECTED_OUTPUT = ORIGINAL_EXPECTED_OUTPUT
device = "cuda:0"

@require_torch_gpu
def test_serialization_expected_output_on_cuda(self):
self.check_serialization_expected_output("cuda", self.SERIALIZED_EXPECTED_OUTPUT)


class TorchAoSerializationW8Test(TorchAoSerializationTest):
class TorchAoSerializationW8CPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
ORIGINAL_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
SERIALIZED_EXPECTED_OUTPUT = ORIGINAL_EXPECTED_OUTPUT

@require_torch_gpu
def test_serialization_expected_output_on_cuda(self):
self.check_serialization_expected_output("cuda", self.SERIALIZED_EXPECTED_OUTPUT)


@require_torch_gpu
class TorchAoSerializationGPTTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int4_weight_only", {"group_size": 32}
device = "cuda:0"


class TorchAoSerializationW8A8CPUTest(TorchAoSerializationTest):
@require_torch_gpu
class TorchAoSerializationW8A8GPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
ORIGINAL_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
SERIALIZED_EXPECTED_OUTPUT = ORIGINAL_EXPECTED_OUTPUT
device = "cpu"

def test_serialization_expected_output_cuda(self):
"""
Test if we can serialize on device (cpu) and load/infer the model on cuda
"""
new_device = "cuda:0"
self.check_serialization_expected_output(new_device, self.SERIALIZED_EXPECTED_OUTPUT)
device = "cuda:0"


class TorchAoSerializationW8CPUTest(TorchAoSerializationTest):
@require_torch_gpu
class TorchAoSerializationW8GPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
ORIGINAL_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
SERIALIZED_EXPECTED_OUTPUT = ORIGINAL_EXPECTED_OUTPUT
device = "cpu"

def test_serialization_expected_output_cuda(self):
"""
Test if we can serialize on device (cpu) and load/infer the model on cuda
"""
new_device = "cuda:0"
self.check_serialization_expected_output(new_device, self.SERIALIZED_EXPECTED_OUTPUT)
device = "cuda:0"


if __name__ == "__main__":