Merged

Changes from 20 commits (32 commits in total)
936b206
enable torchao quantization on CPU
jiqing-feng Feb 12, 2025
4759045
fix int4
jiqing-feng Feb 12, 2025
5e51a1c
fix format
jiqing-feng Feb 12, 2025
6b3c076
enable CPU torchao tests
jiqing-feng Feb 12, 2025
2bf0ba2
fix cuda tests
jiqing-feng Feb 12, 2025
36c6534
fix cpu tests
jiqing-feng Feb 12, 2025
872c778
update tests
jiqing-feng Feb 13, 2025
76badb1
fix style
jiqing-feng Feb 13, 2025
c964c6f
fix cuda tests
jiqing-feng Feb 13, 2025
92b3ff1
Merge branch 'main' into torchao
jiqing-feng Feb 13, 2025
fcf3e9e
fix torchao available
jiqing-feng Feb 13, 2025
a871b35
fix torchao available
jiqing-feng Feb 13, 2025
65b7de3
fix torchao config cannot convert to json
jiqing-feng Feb 13, 2025
6847b7c
Merge branch 'main' into torchao
jiqing-feng Feb 14, 2025
33da778
fix docs
jiqing-feng Feb 14, 2025
8b9b6b1
Merge branch 'main' into torchao
jiqing-feng Feb 17, 2025
50d48c2
Merge branch 'main' into torchao
jiqing-feng Feb 18, 2025
f5c2c8d
Merge branch 'main' into torchao
jiqing-feng Feb 19, 2025
e1bdbd7
rm to_dict to rebase
jiqing-feng Feb 19, 2025
a880c2c
Merge branch 'main' into torchao
MekkCyber Feb 19, 2025
49015bf
limited torchao version for CPU
jiqing-feng Feb 20, 2025
81897c4
Merge branch 'main' into torchao
jiqing-feng Feb 20, 2025
135bbab
fix format
jiqing-feng Feb 20, 2025
443b1cf
Merge branch 'main' into torchao
jiqing-feng Feb 21, 2025
248e065
fix skip
jiqing-feng Feb 21, 2025
a71d8b9
fix format
jiqing-feng Feb 21, 2025
9b3053a
Merge branch 'main' into torchao
SunMarc Feb 21, 2025
e2fef70
Update src/transformers/testing_utils.py
jiqing-feng Feb 24, 2025
66b5751
Merge branch 'main' into torchao
jiqing-feng Feb 24, 2025
d356bf6
Merge branch 'main' into torchao
jiqing-feng Feb 25, 2025
9d529ca
fix cpu test
jiqing-feng Feb 25, 2025
a633f27
fix format
jiqing-feng Feb 25, 2025
2 changes: 1 addition & 1 deletion docs/source/en/quantization/overview.md
@@ -59,7 +59,7 @@ Use the table below to help you decide which quantization method to use.
| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 <sub>5</sub> | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
| [torchao](./torchao.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟡 <sub>5</sub> | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ |
| [SpQR](./spqr.md) | 🔴 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 3 | 🔴 | 🟢 | 🟢 | https://github.com/Vahe1994/SpQR/ |
| [FINEGRAINED_FP8](./finegrained_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | |
8 changes: 5 additions & 3 deletions docs/source/en/quantization/torchao.md
@@ -22,6 +22,8 @@ pip install --upgrade torch torchao transformers

By default, the weights are loaded in full precision (torch.float32) regardless of the data type the weights are actually stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file and automatically use the most memory-optimal data type.

To run the following code on CPU (even when a GPU is available), change to `device_map="cpu"` and `quantization_config = TorchAoConfig("int4_weight_only", group_size=128, layout=Int4CPULayout())`, where `layout` comes from `from torchao.dtypes import Int4CPULayout`, which is only available in torchao 0.8.0 and higher.
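
For reference, a minimal CPU sketch of that setup might look like the following (assuming torchao >= 0.8.0 for `Int4CPULayout`; the checkpoint name is only illustrative):

```py
# Minimal CPU sketch — assumes torchao >= 0.8.0 so that Int4CPULayout is available
from torchao.dtypes import Int4CPULayout
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

model_name = "meta-llama/Meta-Llama-3-8B"  # illustrative checkpoint; substitute any causal LM

quantization_config = TorchAoConfig("int4_weight_only", group_size=128, layout=Int4CPULayout())
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cpu",  # force CPU placement even when a GPU is available
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
input_ids = tokenizer("What are we having for dinner?", return_tensors="pt").to(quantized_model.device)
output = quantized_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```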

```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
@@ -34,7 +36,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="

tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)

# auto-compile the quantized model with `cache_implementation="static"` to get speedup
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -58,7 +60,7 @@ def benchmark_fn(f, *args, **kwargs):
MAX_NEW_TOKENS = 1000
print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))

bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16)
bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)
output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") # auto-compile
print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))

@@ -80,7 +82,7 @@ quantized_model.save_pretrained(output_dir, safe_serialization=False)

# load quantized model
ckpt_id = "llama3-8b-int4wo-128" # or huggingface hub model id
loaded_quantized_model = AutoModelForCausalLM.from_pretrained(ckpt_id, device_map="cuda")
loaded_quantized_model = AutoModelForCausalLM.from_pretrained(ckpt_id, device_map="auto")


# confirm the speedup
12 changes: 11 additions & 1 deletion src/transformers/utils/quantization_config.py
@@ -1534,7 +1534,17 @@ def _get_torchao_quant_type_to_method(self):

def get_apply_tensor_subclass(self):
_STR_TO_METHOD = self._get_torchao_quant_type_to_method()
return _STR_TO_METHOD[self.quant_type](**self.quant_type_kwargs)
quant_type_kwargs = self.quant_type_kwargs.copy()
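# On CPU, int4 weight-only quantization needs the Int4CPULayout introduced in torchao 0.8.0; on CUDA the default layout is kept.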
if (
not torch.cuda.is_available()
and is_torchao_available()
and self.quant_type == "int4_weight_only"
and version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0")
):
from torchao.dtypes import Int4CPULayout

quant_type_kwargs["layout"] = Int4CPULayout()
return _STR_TO_METHOD[self.quant_type](**quant_type_kwargs)

def __repr__(self):
config_dict = self.to_dict()
136 changes: 74 additions & 62 deletions tests/quantization/torchao_integration/test_torchao.py
@@ -22,7 +22,6 @@
require_torch_gpu,
require_torch_multi_gpu,
require_torchao,
torch_device,
)
from transformers.utils import is_torch_available, is_torchao_available

@@ -33,16 +32,19 @@
if is_torchao_available():
from torchao.dtypes import (
AffineQuantizedTensor,
Int4CPULayout,
TensorCoreTiledLayout,
)


def check_torchao_quantized(test_module, qlayer, batch_size=1, context_size=1024):
def check_torchao_int4_wo_quantized(test_module, qlayer):
weight = qlayer.weight
test_module.assertTrue(isinstance(weight, AffineQuantizedTensor))
test_module.assertEqual(weight.quant_min, 0)
test_module.assertEqual(weight.quant_max, 15)
test_module.assertTrue(isinstance(weight.layout, TensorCoreTiledLayout))
if is_torchao_available():
test_module.assertTrue(isinstance(weight, AffineQuantizedTensor))
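# int4 weight-only tensors use a device-specific layout: Int4CPULayout on CPU, TensorCoreTiledLayout on CUDA.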
layout = Int4CPULayout if weight.device.type == "cpu" else TensorCoreTiledLayout
test_module.assertTrue(isinstance(weight.tensor_impl._layout, layout))


def check_forward(test_module, model, batch_size=1, context_size=1024):
@@ -53,7 +55,6 @@ def check_forward(test_module, model, batch_size=1, context_size=1024):
test_module.assertEqual(out.shape[1], context_size)


@require_torch_gpu
@require_torchao
class TorchAoConfigTest(unittest.TestCase):
def test_to_dict(self):
@@ -95,15 +96,16 @@ def test_json_serializable(self):
quantization_config.to_json_string(use_diff=False)


@require_torch_gpu
@require_torchao
class TorchAoTest(unittest.TestCase):
input_text = "What are we having for dinner?"
max_new_tokens = 10

EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = "cpu"
quant_scheme_kwargs = (
{"group_size": 32, "layout": Int4CPULayout()} if is_torchao_available() else {"group_size": 32}
)

def tearDown(self):
gc.collect()
@@ -114,20 +116,20 @@ def test_int4wo_quant(self):
"""
Simple LLM model testing int4 weight only quantization
"""
quant_config = TorchAoConfig("int4_weight_only", group_size=32)
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)

# Note: we quantize the bfloat16 model on the fly to int4
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=torch.bfloat16,
device_map=torch_device,
device_map=self.device,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)

check_torchao_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)
check_torchao_int4_wo_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)

input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@@ -136,46 +138,51 @@ def test_int4wo_quant_bfloat16_conversion(self):
"""
Testing the dtype of model will be modified to be bfloat16 for int4 weight only quantization
"""
quant_config = TorchAoConfig("int4_weight_only", group_size=32)
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)

# Note: we quantize the bfloat16 model on the fly to int4
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=None,
device_map=torch_device,
device_map=self.device,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)

check_torchao_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)
check_torchao_int4_wo_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)

input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

@require_torch_multi_gpu
def test_int4wo_quant_multi_gpu(self):
def test_int8_dynamic_activation_int8_weight_quant(self):
"""
Simple test that checks if the quantized model int4 wieght only is working properly with multiple GPUs
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUS
Simple LLM model testing int8_dynamic_activation_int8_weight
"""
quant_config = TorchAoConfig("int8_dynamic_activation_int8_weight")

quant_config = TorchAoConfig("int4_weight_only", group_size=32)
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=torch.bfloat16,
device_map="auto",
device_map=self.device,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)

self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
EXPECTED_OUTPUT = [
"What are we having for dinner?\n\nJessica: (smiling)",
"What are we having for dinner?\n\nJess: (smiling) I",
]
self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)


@require_torch_gpu
class TorchAoGPUTest(TorchAoTest):
device = "cuda"
quant_scheme_kwargs = {"group_size": 32}

def test_int4wo_offload(self):
"""
@@ -221,35 +228,37 @@ def test_int4wo_offload(self):
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)

input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
EXPECTED_OUTPUT = "What are we having for dinner?\n- 2. What is the temperature outside"

self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)

def test_int8_dynamic_activation_int8_weight_quant(self):
@require_torch_multi_gpu
def test_int4wo_quant_multi_gpu(self):
"""
Simple LLM model testing int8_dynamic_activation_int8_weight
Simple test that checks if the int4 weight-only quantized model is working properly with multiple GPUs.
Set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs.
"""
quant_config = TorchAoConfig("int8_dynamic_activation_int8_weight")

# Note: we quantize the bfloat16 model on the fly to int4
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=torch_device,
torch_dtype=torch.bfloat16,
device_map="auto",
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)

input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)


@require_torch_gpu
@require_torchao
class TorchAoSerializationTest(unittest.TestCase):
input_text = "What are we having for dinner?"
@@ -258,8 +267,11 @@ class TorchAoSerializationTest(unittest.TestCase):
# TODO: investigate why we don't have the same output as the original model for this test
SERIALIZED_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quant_scheme, quant_scheme_kwargs = "int4_weight_only", {"group_size": 32}
device = "cuda:0"
quant_scheme = "int4_weight_only"
quant_scheme_kwargs = (
{"group_size": 32, "layout": Int4CPULayout()} if is_torchao_available() else {"group_size": 32}
)
device = "cpu"

# called only once for all test in this class
@classmethod
@@ -291,9 +303,9 @@ def check_serialization_expected_output(self, device, expected_output):
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname, safe_serialization=False)
loaded_quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, torch_dtype=torch.bfloat16, device_map=self.device
self.model_name, torch_dtype=torch.bfloat16, device_map=device
)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(device)

output = loaded_quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), expected_output)
@@ -302,46 +314,46 @@ def test_serialization_expected_output(self):
self.check_serialization_expected_output(self.device, self.SERIALIZED_EXPECTED_OUTPUT)


class TorchAoSerializationW8A8Test(TorchAoSerializationTest):
class TorchAoSerializationW8A8CPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
ORIGINAL_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
SERIALIZED_EXPECTED_OUTPUT = ORIGINAL_EXPECTED_OUTPUT
device = "cuda:0"

@require_torch_gpu
def test_serialization_expected_output_on_cuda(self):
self.check_serialization_expected_output("cuda", self.SERIALIZED_EXPECTED_OUTPUT)


class TorchAoSerializationW8Test(TorchAoSerializationTest):
class TorchAoSerializationW8CPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
ORIGINAL_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
SERIALIZED_EXPECTED_OUTPUT = ORIGINAL_EXPECTED_OUTPUT

@require_torch_gpu
def test_serialization_expected_output_on_cuda(self):
self.check_serialization_expected_output("cuda", self.SERIALIZED_EXPECTED_OUTPUT)


@require_torch_gpu
class TorchAoSerializationGPTTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int4_weight_only", {"group_size": 32}
device = "cuda:0"


class TorchAoSerializationW8A8CPUTest(TorchAoSerializationTest):
@require_torch_gpu
class TorchAoSerializationW8A8GPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
ORIGINAL_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
SERIALIZED_EXPECTED_OUTPUT = ORIGINAL_EXPECTED_OUTPUT
device = "cpu"

def test_serialization_expected_output_cuda(self):
"""
Test if we can serialize on device (cpu) and load/infer the model on cuda
"""
new_device = "cuda:0"
self.check_serialization_expected_output(new_device, self.SERIALIZED_EXPECTED_OUTPUT)
device = "cuda:0"


class TorchAoSerializationW8CPUTest(TorchAoSerializationTest):
@require_torch_gpu
class TorchAoSerializationW8GPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
ORIGINAL_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
SERIALIZED_EXPECTED_OUTPUT = ORIGINAL_EXPECTED_OUTPUT
device = "cpu"

def test_serialization_expected_output_cuda(self):
"""
Test if we can serialize on device (cpu) and load/infer the model on cuda
"""
new_device = "cuda:0"
self.check_serialization_expected_output(new_device, self.SERIALIZED_EXPECTED_OUTPUT)
device = "cuda:0"


if __name__ == "__main__":