77 changes: 44 additions & 33 deletions src/transformers/quantizers/quantizer_fbgemm_fp8.py
@@ -242,39 +242,50 @@ def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
         return [k for k in missing_keys if k not in not_missing_keys]
 
     def update_tp_plan(self, config):
-        text_plan = {
-            "layers.*.self_attn.q_proj.weight": "local_colwise",
-            "layers.*.self_attn.q_proj.weight_scale": "local_colwise",
-            "layers.*.self_attn.k_proj.weight": "local_colwise",
-            "layers.*.self_attn.k_proj.weight_scale": "local_colwise",
-            "layers.*.self_attn.v_proj.weight": "local_colwise",
-            "layers.*.self_attn.v_proj.weight_scale": "local_colwise",
-            "layers.*.self_attn.o_proj.weight": "local_rowwise",
-            "layers.*.self_attn": "gather",
-            "layers.*.input_layernorm.weight": "sequence_parallel",
-            "layers.*.post_attention_layernorm.weight": "sequence_parallel",
-            "norm.weight": "sequence_parallel",
-            "layers.*.feed_forward.shared_expert.gate_proj.weight": "local_colwise",
-            "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "local_colwise",
-            "layers.*.feed_forward.shared_expert.up_proj.weight": "local_colwise",
-            "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "local_colwise",
-            "layers.*.feed_forward.shared_expert.down_proj.weight": "local_rowwise",
-            "layers.*.feed_forward.experts": "local",
-            "layers.*.feed_forward": "gather",
-            "layers.*.feed_forward.experts.*.gate_proj.weight": "local_colwise",
-            "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "local_colwise",
-            "layers.*.feed_forward.experts.*.up_proj.weight": "local_colwise",
-            "layers.*.feed_forward.experts.*.up_proj.weight_scale": "local_colwise",
-            "layers.*.feed_forward.experts.*.down_proj.weight": "local_rowwise",
-            # For Fused implementation
-            "layers.*.feed_forward.experts.gate_up_proj": "local_packed_rowwise",
-            "layers.*.feed_forward.experts.gate_up_proj_scale": "local_packed_rowwise",
-            "layers.*.feed_forward.experts.down_proj": "local_colwise",
-        }
-        if config.get_text_config() is not None:
-            config.get_text_config().base_model_tp_plan = text_plan
-        else:
-            config.base_model_tp_plan = text_plan
+        if "Llama4" in config.__class__.__name__:
+            text_plan = {
+                # We are using a different tp plan with local_colwise and local_rowwise for the attention because fbgemm operations cannot be parallelized.
+                # With local_colwise and local_rowwise, all the operations are done locally, and we add a gather operation to gather the results instead of
+                # using dtensors.
+                "layers.*.self_attn.q_proj.weight": "local_colwise",
+                "layers.*.self_attn.q_proj.weight_scale": "local_colwise",
+                "layers.*.self_attn.k_proj.weight": "local_colwise",
+                "layers.*.self_attn.k_proj.weight_scale": "local_colwise",
+                "layers.*.self_attn.v_proj.weight": "local_colwise",
+                "layers.*.self_attn.v_proj.weight_scale": "local_colwise",
+                "layers.*.self_attn.o_proj.weight": "local_rowwise",
+                "layers.*.self_attn": "gather",
+                # We keep the same sequence_parallel plan for the layernorms
+                "layers.*.input_layernorm.weight": "sequence_parallel",
+                "layers.*.post_attention_layernorm.weight": "sequence_parallel",
+                "norm.weight": "sequence_parallel",
+                # We keep the same local_colwise and local_rowwise plan for the feed forward shared expert.
+                # We also add scales for the shared expert: for local_colwise the scale is also local_colwise,
+                # while for local_rowwise the scale is replicated, so we don't need to add it.
+                "layers.*.feed_forward.shared_expert.gate_proj.weight": "local_colwise",
+                "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "local_colwise",
+                "layers.*.feed_forward.shared_expert.up_proj.weight": "local_colwise",
+                "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "local_colwise",
+                "layers.*.feed_forward.shared_expert.down_proj.weight": "local_rowwise",
+                "layers.*.feed_forward.experts": "local",
+                "layers.*.feed_forward": "gather",
+                "layers.*.feed_forward.experts.*.gate_proj.weight": "local_colwise",
+                "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "local_colwise",
+                "layers.*.feed_forward.experts.*.up_proj.weight": "local_colwise",
+                "layers.*.feed_forward.experts.*.up_proj.weight_scale": "local_colwise",
+                "layers.*.feed_forward.experts.*.down_proj.weight": "local_rowwise",
+                # For the fused implementation we use local_packed_rowwise for the gate_up_proj, and the same for the packed scales.
+                # We use local_colwise for the down_proj, and its scales are replicated so we don't add them.
+                "layers.*.feed_forward.experts.gate_up_proj": "local_packed_rowwise",
+                "layers.*.feed_forward.experts.gate_up_proj_scale": "local_packed_rowwise",
+                "layers.*.feed_forward.experts.down_proj": "local_colwise",
+            }
+            if config.get_text_config() is not None:
+                config.get_text_config().base_model_tp_plan = text_plan
+            else:
+                config.base_model_tp_plan = text_plan
+            return config
+
         return config
 
     def is_serializable(self, safe_serialization=None):
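For readers skimming the plan above: the keys are glob-style module-path patterns and the values are partitioning styles. The snippet below is a minimal, illustrative sketch of how such a mapping could be resolved for a concrete parameter name; the resolve_tp_style helper and its one-segment-per-"*" matching rule are assumptions made for this example, not the matching logic transformers itself uses.

# Minimal sketch (not transformers code): resolve a glob-keyed TP plan for one parameter name.
import re

def resolve_tp_style(tp_plan, param_name):
    """Return the partitioning style whose pattern matches param_name, or None."""
    for pattern, style in tp_plan.items():
        # Assumption for this sketch: each "*" stands for exactly one path segment,
        # e.g. a layer index or an expert index.
        regex = "^" + r"\.".join(
            "[^.]+" if part == "*" else re.escape(part) for part in pattern.split(".")
        ) + "$"
        if re.match(regex, param_name):
            return style
    return None

text_plan = {
    "layers.*.self_attn.q_proj.weight": "local_colwise",
    "layers.*.self_attn.o_proj.weight": "local_rowwise",
    "layers.*.feed_forward.experts.*.up_proj.weight_scale": "local_colwise",
}

print(resolve_tp_style(text_plan, "layers.3.self_attn.q_proj.weight"))   # local_colwise
print(resolve_tp_style(text_plan, "layers.3.self_attn.o_proj.weight"))   # local_rowwise
print(resolve_tp_style(text_plan, "layers.0.feed_forward.experts.7.up_proj.weight_scale"))  # local_colwise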