Skip to content

Commit f73a17f

Browse files
committed
Add GGUF support for MiniMax-M2 model
1 parent fefc3fa commit f73a17f

File tree

2 files changed

+67
-0
lines changed

2 files changed

+67
-0
lines changed

src/transformers/integrations/ggml.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,23 @@
287287
"attention.layer_norm_rms_epsilon": "rms_norm_eps",
288288
"vocab_size": "vocab_size",
289289
},
290+
"minimax-m2": {
291+
"context_length": "max_position_embeddings",
292+
"block_count": "num_hidden_layers",
293+
"feed_forward_length": "intermediate_size",
294+
"embedding_length": "hidden_size",
295+
"rope.dimension_count": "rotary_dim",
296+
"rope.freq_base": "rope_theta",
297+
"attention.head_count": "num_attention_heads",
298+
"attention.head_count_kv": "num_key_value_heads",
299+
"attention.key_length": "head_dim",
300+
"attention.value_length": None,
301+
"attention.layer_norm_rms_epsilon": "rms_norm_eps",
302+
"expert_count": "num_local_experts",
303+
"expert_used_count": "num_experts_per_tok",
304+
"expert_feed_forward_length": None,
305+
"vocab_size": "vocab_size",
306+
},
290307
}
291308

292309
GGUF_TOKENIZER_MAPPING = {
@@ -766,6 +783,7 @@ def converted(self) -> Tokenizer:
766783
"umt5": GGUFT5Converter,
767784
"deci": GGUFLlamaConverter,
768785
"decilm": GGUFLlamaConverter,
786+
"minimax_m2": GGUFQwen2Converter,
769787
}
770788

771789

src/transformers/modeling_gguf_pytorch_utils.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,50 @@ def process(self, weights, name, **kwargs):
299299
return GGUFTensor(weights, name, {})
300300

301301

302+
class MiniMaxM2TensorProcessor(TensorProcessor):
    """Name resolution and expert-tensor splitting for MiniMax-M2 GGUF checkpoints.

    MiniMax-M2 stores all experts of a layer in one fused GGUF tensor and names
    its HF expert projections w1/w2/w3 instead of gate_proj/down_proj/up_proj,
    so the generic gguf-py name map cannot resolve them on its own.
    """

    # Collapses "block_sparse_moe.experts.<idx>." to an index-free prefix so
    # every expert of a layer resolves to the same fused GGUF tensor name.
    HF_EXPERT_RENAME_PATTERN = re.compile(r"block_sparse_moe\.experts\.\d+\.")
    HF_MOE_PATTERN = re.compile(r"(?:model\.)?layers\.(?P<bid>\d+)\.block_sparse_moe\.experts\.(?P<w>w[123])\.weight")
    GGUF_MOE_WEIGHTS_PATTERN = re.compile(r"(?P<name>.*\.ffn_(?P<w>gate|down|up)_exps)\.weight$")

    HF_BIAS_PATTERN = re.compile(r"(?:model\.)?layers\.(?P<bid>\d+)\.block_sparse_moe\.e_score_correction_bias")

    def __init__(self, config=None):
        super().__init__(config=config)

    def preprocess_name(self, hf_name: str) -> str:
        """Strip the per-expert index from an HF tensor name before mapping."""
        return self.HF_EXPERT_RENAME_PATTERN.sub("block_sparse_moe.experts.", hf_name)

    def perform_fallback_tensor_mapping(
        self, gguf_to_hf_name_map: dict[str, str], suffix: str, qual_name: str, hf_name: str
    ):
        """Register GGUF->HF name mappings that gguf-py cannot derive itself.

        Handles two MiniMax-M2 naming quirks:
        - expert projections named w1 (gate), w2 (down), w3 (up) rather than
          gate_proj/down_proj/up_proj;
        - "e_score_correction_bias", where gguf-py only knows "e_score_correction".
        """
        expert_match = self.HF_MOE_PATTERN.fullmatch(hf_name)
        if expert_match is not None:
            projection = {"w1": "gate", "w2": "down", "w3": "up"}[expert_match["w"]]
            gguf_key = f"blk.{expert_match['bid']}.ffn_{projection}_exps{suffix}"
            gguf_to_hf_name_map[gguf_key] = qual_name + hf_name
            return
        bias_match = self.HF_BIAS_PATTERN.fullmatch(hf_name)
        if bias_match is not None:
            gguf_to_hf_name_map[f"blk.{bias_match['bid']}.exp_probs_b.bias"] = qual_name + hf_name

    def process(self, weights, name: str, **kwargs):
        """Split a fused GGUF expert tensor into per-expert HF tensors.

        Non-expert tensors pass through untouched. For a fused expert tensor
        that has a mapping, the per-expert slices are written into
        ``parsed_parameters["tensors"]`` and a GGUFTensor with name ``None`` is
        returned — presumably signalling the caller that the tensor was already
        consumed here (NOTE(review): confirm against the loader's handling).
        """
        if self.GGUF_MOE_WEIGHTS_PATTERN.fullmatch(name) is None:
            return GGUFTensor(weights, name, {})
        tensor_key_mapping = kwargs.get("tensor_key_mapping")
        parsed_parameters = kwargs.get("parsed_parameters")
        if tensor_key_mapping and name in tensor_key_mapping:
            self._split_moe_expert_tensor(weights, parsed_parameters, tensor_key_mapping[name])
        return GGUFTensor(weights, None, {})

    def _split_moe_expert_tensor(self, weights: np.ndarray, parsed_parameters: dict[str, dict], hf_name: str):
        """Store one tensor per expert, re-inserting the expert index in the name."""
        # assumes weights is indexed [expert, ...] along its first axis — TODO confirm
        expert_total = self.config.get("num_local_experts", 256)
        for expert_idx in range(expert_total):
            indexed_name = hf_name.replace("block_sparse_moe.experts.", f"block_sparse_moe.experts.{expert_idx}.")
            parsed_parameters["tensors"][indexed_name] = torch.from_numpy(np.copy(weights[expert_idx]))
344+
345+
302346
TENSOR_PROCESSORS = {
303347
"llama": LlamaTensorProcessor,
304348
"qwen2moe": Qwen2MoeTensorProcessor,
@@ -312,6 +356,7 @@ def process(self, weights, name, **kwargs):
312356
"gemma2": Gemma2TensorProcessor,
313357
"gemma3": Gemma2TensorProcessor,
314358
"lfm2": Lfm2TensorProcessor,
359+
"minimax-m2": MiniMaxM2TensorProcessor,
315360
}
316361

317362

@@ -360,6 +405,8 @@ def get_gguf_hf_weights_map(
360405
model_type = "gemma3"
361406
elif model_type == "umt5":
362407
model_type = "t5"
408+
elif model_type == "minimax_m2":
409+
model_type = "minimax-m2"
363410
arch = None
364411
for key, value in MODEL_ARCH_NAMES.items():
365412
if value == model_type:
@@ -462,6 +509,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
462509
updated_architecture = "qwen2_moe"
463510
elif "qwen3moe" in architecture:
464511
updated_architecture = "qwen3_moe"
512+
elif "minimax-m2" in architecture:
513+
updated_architecture = "minimax_m2"
465514

466515
# For stablelm architecture, we need to set qkv_bias and use_parallel_residual from tensors
467516
# If `qkv_bias=True`, qkv_proj with bias will be present in the tensors

0 commit comments

Comments
 (0)