[AutoRound] Support w8a8 scheme in auto-round and add example #2150
New example file — FP8_DYNAMIC scheme:

@@ -0,0 +1,61 @@
from auto_round.calib_dataset import get_dataset
from transformers import AutoTokenizer, Llama4ForConditionalGeneration, AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# Select calibration dataset.
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 2048

# Get aligned calibration dataset.
ds = get_dataset(
    tokenizer=tokenizer,
    seqlen=MAX_SEQUENCE_LENGTH,
    nsamples=NUM_CALIBRATION_SAMPLES,
)

# Configure the quantization algorithm to run.
scheme = "FP8_DYNAMIC"
recipe = AutoRoundModifier(
    targets="Linear",
    scheme=scheme,
    ignore=["re:.*lm_head", "re:.*router", "re:.*self_attn.*", "re:.*shared_expert.*", "re:multi_modal_projector.*", "re:vision_model"],
    iters=0,
)

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    # disable shuffling to get slightly better mmlu score
    shuffle_calibration_samples=False,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to(model.device) for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=1)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W8A8-Dynamic-AutoRound"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
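A quick, optional sanity check (illustrative, not part of this PR) is to confirm that the matched Linear modules picked up quantization parameters after oneshot; the attribute names follow the weight_scale/input_scale parameters used in this PR's modifier changes:

# Illustrative sketch: print the scale parameters attached to one quantized layer.
for name, module in model.named_modules():
    if hasattr(module, "weight_scale"):
        print(name, "weight_scale shape:", tuple(module.weight_scale.shape))
        if hasattr(module, "input_scale"):
            print(name, "input_scale shape:", tuple(module.input_scale.shape))
        break  # one layer is enough for a sanity check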
New example file — FP8 scheme:

@@ -0,0 +1,61 @@
from auto_round.calib_dataset import get_dataset
from transformers import AutoTokenizer, Llama4ForConditionalGeneration, AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# Select calibration dataset.
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 2048

# Get aligned calibration dataset.
ds = get_dataset(
    tokenizer=tokenizer,
    seqlen=MAX_SEQUENCE_LENGTH,
    nsamples=NUM_CALIBRATION_SAMPLES,
)

# Configure the quantization algorithm to run.
scheme = "FP8"
recipe = AutoRoundModifier(
    targets="Linear",
    scheme=scheme,
    ignore=["re:.*lm_head", "re:.*router", "re:.*self_attn.*", "re:.*shared_expert.*", "re:multi_modal_projector.*", "re:vision_model"],
    iters=0,
)

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    # disable shuffling to get slightly better mmlu score
    shuffle_calibration_samples=False,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to(model.device) for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=1)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W8A8-Dynamic-AutoRound"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
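Once saved with save_compressed=True, a checkpoint like this is typically served with vLLM. The snippet below is a rough sketch under that assumption; the model path and sampling settings are illustrative, and Llama-4 support depends on the installed vLLM version:

# Illustrative sketch: load the compressed checkpoint in vLLM (not part of this PR).
from vllm import LLM, SamplingParams

llm = LLM(model="Llama-4-Scout-17B-16E-Instruct-W8A8-Dynamic-AutoRound")
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)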
Changes to the AutoRoundModifier implementation:

@@ -3,6 +3,8 @@
 import torch
 from auto_round import AutoRound
 from auto_round.schemes import QuantizationScheme as ARQuantizationScheme
+from auto_round.wrapper import WrapperWALayer

 from compressed_tensors.quantization import (
     QuantizationScheme,
     QuantizationStrategy,
@@ -145,8 +147,7 @@ def start_calibration(self, model: torch.nn.Module):
         untie_word_embeddings(model)

         for _, module in match_named_modules(model, self.targets, self.ignore):
-            # Note: No need to register observers for auto-round
-            self._calibration_hooks |= self._initialize_hooks(module)
+            # skip register observers for auto-round
             apply_calibration_status(module)

         model.apply(enable_quantization)  # quantize at the same time as calibrate
@@ -242,16 +243,38 @@ def apply_autoround(self, state, subgraph):
             auto_offload=False,
         )
         self._q_input = q_input

+        # auto-round will return WrapperWALayer if activation is quantized
+        for name, module in decoding_layer.named_modules():
+            if isinstance(module, WrapperWALayer):
+                parent, child = name.rsplit(".", maxsplit=1)
+                parent = decoding_layer.get_submodule(parent)
+                setattr(parent, child, module.orig_layer)

         # Update offload parameters and remove temporary attributes
-        for _, module in decoding_layer.named_modules():
-            if hasattr(module, "weight_scale") and hasattr(
-                module, "weight_zero_point"
+        for name, module in decoding_layer.named_modules():
+            if (
+                hasattr(module, "weight_scale")
+                and hasattr(module, "weight_zero_point")
+                and hasattr(module, "scale")
             ):
                 # Note: The model's weight is already q-dq in-place by auto-round.
                 weight_scale = module.scale
                 del module.scale
                 # TODO: update zero_point after supporting asymmetric quantization
                 update_offload_parameter(module, "weight_scale", weight_scale)

+                if (
+                    hasattr(module, "act_scale")
+                    and hasattr(module, "input_scale")
+                ):
+                    act_scale = module.act_scale
+                    assert act_scale.numel() == module.input_scale.numel(), "Activation scale size mismatch"
+                    del module.act_scale

+                    # activation scale shape maybe different
+                    update_offload_parameter(module, "input_scale", act_scale.reshape(module.input_scale.shape))

         decoding_layer.eval()

     def post_autoround_cleanup(self):
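For intuition on the "weight is already q-dq in-place" note above: after AutoRound finishes, the module's weight tensor already holds quantize-dequantize values, so only the collected scale needs to be exported as weight_scale. A minimal FP8-style sketch of that q-dq round trip (toy values, not code from this PR):

import torch

# Toy per-tensor FP8 (e4m3) quantize-dequantize: the stored weight keeps the
# dequantized values, and only `scale` would be exported as weight_scale.
w = torch.randn(4)
scale = w.abs().max() / torch.finfo(torch.float8_e4m3fn).max
w_q = (w / scale).to(torch.float8_e4m3fn)  # quantize
w_qdq = w_q.to(w.dtype) * scale            # dequantize; replaces w in-place in the real flow
print(w, w_qdq)                            # equal up to FP8 rounding error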
@@ -314,23 +337,65 @@ def _mapping_config_to_autoround(self):
         ), f"Expected QuantizationScheme, got {type(scheme)}"
         quant_scheme = scheme
         weight_args = quant_scheme.weights
-        assert weight_args.strategy == QuantizationStrategy.GROUP, (
-            "Only group-wise quantization is supported in AutoRoundModifier for now, "
-            f"got {weight_args.strategy}"
-        )
-        assert quant_scheme.input_activations is None, (
-            "Input activation quantization is not supported in AutoRoundModifier, "
-            f"got {quant_scheme.input_activations}"
-        )
+        activation_args = quant_scheme.input_activations
         assert quant_scheme.output_activations is None, (
             "Output activation quantization is not supported in AutoRoundModifier, "
             f"got {quant_scheme.output_activations}"
         )
+        group_size = weight_args.group_size
+        data_type = weight_args.type
+        if group_size is None:
+            if weight_args.strategy == QuantizationStrategy.CHANNEL:
+                group_size = -1
+            elif weight_args.strategy == QuantizationStrategy.TENSOR:
+                group_size = 0
+            else:
+                raise ValueError(
+                    "AutoRoundModifier only supports channel-wise and tensor-wise weight quantization"
+                )

+        if data_type == "float":
+            data_type = "fp"

+        if activation_args is None:
+            act_bits = 16
+            act_group_size = None
+            act_symmetric = None
+            act_bits = None
+            act_dynamic = None
+            act_data_type = None
+        else:
+            act_dynamic = activation_args.dynamic
+            act_group_size = activation_args.group_size
+            act_symmetric = activation_args.symmetric
+            act_bits = activation_args.num_bits
Reviewer: How about using

Author: There are default values in QuantizationArgs for each parameter. If we use getattr, I think all similar code should be replaced to keep it aligned.
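The suggestion in this thread is truncated above, but from the reply it appears to concern reading these fields via getattr with defaults rather than branching on activation_args being None. A rough, illustrative sketch of that style (not what the PR does):

# Illustrative only: getattr-with-default style on a possibly-absent activation config.
activation_args = None  # e.g. a weight-only scheme has no input-activation args
act_bits = getattr(activation_args, "num_bits", None)
act_dynamic = getattr(activation_args, "dynamic", None)
print(act_bits, act_dynamic)  # -> None None when activations are not quantized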

+            # activation is quantized dynamically, don't need to collect scale in auto-round
+            if act_dynamic:
+                act_bits = 16

+            act_data_type = activation_args.type
+            assert activation_args.strategy != QuantizationStrategy.GROUP, (
+                "Input activation group-wise quantization is not supported in AutoRoundModifier"
+            )
+            if act_group_size is None:
+                if activation_args.strategy in [QuantizationStrategy.CHANNEL, QuantizationStrategy.TOKEN]:
+                    group_size = -1
+                if activation_args.strategy == QuantizationStrategy.TENSOR:
+                    group_size = 0

+            if act_data_type == "float":
+                act_data_type = "fp"

         ar_quant_scheme = ARQuantizationScheme(
             bits=weight_args.num_bits,
             sym=weight_args.symmetric,
-            group_size=weight_args.group_size,
-            data_type=weight_args.type,
-            act_bits=16,
+            group_size=group_size,
+            data_type=data_type,
+            act_bits=act_bits,
+            act_group_size=act_group_size,
+            act_sym=act_symmetric,
+            act_dynamic=act_dynamic,
+            act_data_type=act_data_type,
         )
         return ar_quant_scheme
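To make the mapping above concrete, here is a rough walk-through (plain Python, illustrative only) of how an FP8_DYNAMIC-style scheme — per-channel FP8 weights with dynamic per-token FP8 activations, as assumed here — would translate under this logic:

# Illustrative walk-through of the mapping logic (not the actual modifier code).
weight = {"num_bits": 8, "type": "float", "strategy": "channel", "group_size": None, "symmetric": True}
act = {"num_bits": 8, "type": "float", "strategy": "token", "group_size": None, "dynamic": True}

group_size = -1 if weight["strategy"] == "channel" else 0  # channel-wise -> -1, tensor-wise -> 0
data_type = "fp" if weight["type"] == "float" else weight["type"]
act_bits = 16 if act["dynamic"] else act["num_bits"]       # dynamic acts: auto-round collects no scale
act_data_type = "fp" if act["type"] == "float" else act["type"]
print(group_size, data_type, act_bits, act_data_type)      # -> -1 fp 16 fp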