
Commit dc59172

[loader]supoort wint2 backend (PaddlePaddle#6139)
* support wint2
* update
1 parent f18f3b9 commit dc59172

20 files changed

Lines changed: 86 additions & 11 deletions

fastdeploy/config.py

Lines changed: 1 addition & 0 deletions
@@ -1225,6 +1225,7 @@ def __init__(
         args,
     ):
         self.load_choices: Union[str, LoadChoices] = LoadChoices.DEFAULT.value
+        self.is_pre_sharded: bool = False
         self.dynamic_load_weight: bool = False
         self.load_strategy: Optional[Literal["ipc", "ipc_snapshot", "meta", "normal", "rsync"]] = "normal"
         self.rsync_config: Optional[Dict[str, Any]] = None

fastdeploy/model_executor/layers/embeddings.py

Lines changed: 1 addition & 1 deletion
@@ -255,7 +255,7 @@ def weight_loader(self, param, loaded_weight, shard_id=None):
         else:
             loaded_weight = loaded_weight.cast(param.dtype)

-        if output_dim is None:
+        if output_dim is None or self.fd_config.load_config.is_pre_sharded:
            assert (
                param.shape == loaded_weight.shape
            ), f"Shape mismatch: param {param.shape} vs loaded_weight {loaded_weight.shape}"

fastdeploy/model_executor/layers/linear.py

Lines changed: 2 additions & 2 deletions
@@ -556,7 +556,7 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
             loaded_weight = get_tensor(loaded_weight)
             loaded_weight = loaded_weight.transpose([1, 0])
         # Tensor parallelism splits the weight along the output_dim
-        if self.tp_size > 1 and output_dim is not None:
+        if self.tp_size > 1 and output_dim is not None and not self.fd_config.load_config.is_pre_sharded:
             dim = -1 if output_dim else 0
             if isinstance(loaded_weight, (np.ndarray, paddle.Tensor)):
                 size = loaded_weight.shape[dim]
@@ -713,7 +713,7 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
             loaded_weight = get_tensor(loaded_weight)
             loaded_weight = loaded_weight.transpose([1, 0])
         # Tensor parallelism splits the weight along the output_dim
-        if self.tp_size > 1 and output_dim is not None:
+        if self.tp_size > 1 and output_dim is not None and not self.fd_config.load_config.is_pre_sharded:
             block_size = self._get_shard_size_mapping(loaded_shard_id, head_dim)
             shard_id = self.local_rank if loaded_shard_id == "q" else self.local_rank // self.num_kv_head_replicas
             shard_offset = shard_id * block_size
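The embedding, linear, and MoE weight loaders in this commit all gain the same guard: when the checkpoint on disk is already split per rank, the tensor-parallel slice has already been taken, so the loader copies the tensor whole instead of slicing it again. A minimal, self-contained sketch of that branch (hypothetical helper name, NumPy in place of Paddle tensors, not the FastDeploy implementation):

# Simplified illustration of "slice along output_dim unless the checkpoint is pre-sharded".
import numpy as np

def load_shard(loaded_weight: np.ndarray, tp_rank: int, tp_size: int,
               output_dim: bool = True, is_pre_sharded: bool = False) -> np.ndarray:
    # A pre-sharded checkpoint already holds this rank's slice: copy it whole.
    if tp_size <= 1 or is_pre_sharded:
        return loaded_weight
    dim = -1 if output_dim else 0
    return np.split(loaded_weight, tp_size, axis=dim)[tp_rank]

full = np.arange(4 * 8).reshape(4, 8)      # un-sharded weight
shard = full[:, 4:]                        # what a rank1 shard file would contain
print(load_shard(full, tp_rank=1, tp_size=2).shape)                        # (4, 4)
print(load_shard(shard, tp_rank=1, tp_size=2, is_pre_sharded=True).shape)  # (4, 4)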

fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py

Lines changed: 35 additions & 0 deletions
@@ -21,6 +21,7 @@

 import fastdeploy
 from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch, moe_expert_reduce
+from fastdeploy.model_executor.utils import set_weight_attrs
 from fastdeploy.utils import ceil_div

 from ..quantization.quant_base import QuantMethodBase
@@ -154,6 +155,22 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
                 default_initializer=paddle.nn.initializer.Constant(0),
             ),
         )
+        for weight_name in [
+            "up_gate_proj_weight",
+            "down_proj_weight",
+            "up_gate_proj_weight_scale",
+            "down_proj_weight_scale",
+            "up_gate_proj_super_scales",
+            "down_proj_super_scales",
+            "up_gate_proj_code_scale",
+            "down_proj_code_scale",
+            "up_gate_proj_code_zp",
+            "down_proj_code_zp",
+        ]:
+            set_weight_attrs(
+                getattr(layer, weight_name),
+                extra_weight_attrs,
+            )


 class CutlassWint2FusedMoeMethod(Wint2MoeMethod):
@@ -164,6 +181,24 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod):
     def __init__(self, quant_config):
         super().__init__(quant_config)

+    def process_weights_after_loading(self, layer):
+        if self.quant_config.is_checkpoint_bf16:
+            # dynamic quantize
+            return
+        w1_shape = layer.up_gate_proj_weight.shape
+        up_gate_proj_weight = layer.up_gate_proj_weight.reshape(
+            [w1_shape[0], w1_shape[1] // 16, 16, w1_shape[2] // 8, 8]
+        )
+        up_gate_proj_weight = paddle.transpose(up_gate_proj_weight, perm=[0, 3, 1, 4, 2])
+        up_gate_proj_weight = up_gate_proj_weight.reshape(w1_shape)
+        layer.up_gate_proj_weight.data = up_gate_proj_weight
+
+        w2_shape = layer.down_proj_weight.shape
+        down_proj_weight = layer.down_proj_weight.reshape([w2_shape[0], w2_shape[1] // 16, 16, w2_shape[2] // 8, 8])
+        down_proj_weight = paddle.transpose(down_proj_weight, perm=[0, 3, 1, 4, 2])
+        down_proj_weight = down_proj_weight.reshape(w2_shape)
+        layer.down_proj_weight.data = down_proj_weight
+
     def process_loaded_weights(self, layer, weights) -> None:
         """
         process_loaded_weights
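The new process_weights_after_loading re-lays the packed wint2 weights into 16x8 tiles via a reshape / transpose / reshape round trip that leaves the overall shape (and element count) unchanged; only the in-memory tile layout differs. A standalone sketch of that re-layout on dummy data (made-up shapes, not repository code):

# Mirror the tile permutation above on a small tensor to show it is shape-preserving.
import paddle

w = paddle.arange(2 * 32 * 16, dtype="int32").reshape([2, 32, 16])
e, k, n = w.shape

# Split the last two dims into 16x8 tiles, swap the tile-grid and in-tile axes,
# then flatten back to the original shape.
tiled = w.reshape([e, k // 16, 16, n // 8, 8])
tiled = paddle.transpose(tiled, perm=[0, 3, 1, 4, 2])  # -> [e, n//8, k//16, 8, 16]
relaid = tiled.reshape([e, k, n])

print(w.shape, relaid.shape)        # both [2, 32, 16]
print(bool((w == relaid).all()))    # False: same values, rearranged in memory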

fastdeploy/model_executor/layers/moe/moe.py

Lines changed: 3 additions & 3 deletions
@@ -316,7 +316,7 @@ def weight_loader(
         )

     def _load_gate_up_weight(self, param, expert_id, loaded_weight, shard_id, shard_dim=None, is_sharded=False):
-        if self.tp_size > 1 and not is_sharded:
+        if self.tp_size > 1 and not is_sharded and not self.fd_config.load_config.is_pre_sharded:
             tp_shard_dim = shard_dim
             weight_dim = -1 if tp_shard_dim else 0
             size = loaded_weight.shape[weight_dim]
@@ -371,7 +371,7 @@ def _load_gate_up_weight(self, param, expert_id, loaded_weight, shard_id, shard_
             h2d_copy(dst=expert_param, src=loaded_weight)

     def _load_down_weight(self, param, expert_id, loaded_weight, shard_id, shard_dim=None):
-        if self.tp_size > 1 and shard_dim is not None:
+        if self.tp_size > 1 and shard_dim is not None and not self.fd_config.load_config.is_pre_sharded:
             tp_shard_dim = shard_dim
             dim = -1 if tp_shard_dim else 0
             size = loaded_weight.shape[dim]
@@ -397,7 +397,7 @@ def _load_down_weight(self, param, expert_id, loaded_weight, shard_id, shard_dim
             h2d_copy(dst=expert_param, src=loaded_weight)

     def _load_fused_experts_weight(self, param, loaded_weight):
-        if self.tp_size > 1 and self.moe_quant_type != "mxfp4":
+        if self.tp_size > 1 and self.moe_quant_type != "mxfp4" and not self.fd_config.load_config.is_pre_sharded:
             dim = -1
             if isinstance(loaded_weight, (np.ndarray, paddle.Tensor)):
                 size = loaded_weight.shape[dim]

fastdeploy/model_executor/load_weight_utils.py

Lines changed: 15 additions & 3 deletions
@@ -76,6 +76,20 @@ def load_weights_from_cache(model, weights_iterator):
         model_sublayer.process_weights_after_loading()


+def get_model_path(fd_config: FDConfig):
+    model_path = fd_config.model_config.model
+    rank_dirs = [
+        f for f in os.listdir(model_path) if f.startswith("rank") and os.path.isdir(os.path.join(model_path, f))
+    ]
+    if len(rank_dirs) > 1:
+        local_rank = fd_config.parallel_config.tensor_parallel_rank
+        if fd_config.parallel_config.tensor_parallel_size != len(rank_dirs):
+            raise ValueError(f"Your model only supports loading with tp{len(rank_dirs)}")
+        model_path = os.path.join(model_path, f"rank{local_rank}")
+        fd_config.load_config.is_pre_sharded = True
+    return model_path
+
+
 def get_weight_iterator(model_path: str):
     files_list, ordered_weight_map, use_safetensors, is_key_ordered = get_all_weights_file(model_path)
     if use_safetensors:
@@ -404,10 +418,8 @@ def load_pre_sharded_checkpoint(model_path: str, local_rank: int):
     """
     load_pre_sharded_checkpoint
     """
-
     state_dict = {}
-    safetensor_files, _, _, _ = get_all_weights_file(os.path.join(model_path, f"rank{local_rank}"))
-    weights_iterator = safetensors_weights_iterator(safetensor_files)
+    weights_iterator = get_weight_iterator(os.path.join(model_path, f"rank{local_rank}"))
     for name, weight in weights_iterator:
         state_dict[name] = weight.clone()
     return state_dict
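get_model_path treats a checkpoint directory that contains more than one rank* sub-directory as pre-sharded: it checks that tensor_parallel_size matches the number of rank directories, points each rank at its own sub-directory, and flips load_config.is_pre_sharded, which is what disables the re-slicing guards above. A standalone sketch of that detection (hypothetical function name and temporary paths, simplified away from FDConfig):

# Resolve a per-rank sub-directory when the checkpoint layout is pre-sharded.
import os
import tempfile

def resolve_model_path(model_path: str, tp_rank: int, tp_size: int) -> tuple[str, bool]:
    rank_dirs = [
        d for d in os.listdir(model_path)
        if d.startswith("rank") and os.path.isdir(os.path.join(model_path, d))
    ]
    if len(rank_dirs) > 1:
        if tp_size != len(rank_dirs):
            raise ValueError(f"Your model only supports loading with tp{len(rank_dirs)}")
        return os.path.join(model_path, f"rank{tp_rank}"), True
    return model_path, False

# Build a fake pre-sharded layout with two rank directories and resolve rank 1.
with tempfile.TemporaryDirectory() as root:
    for r in range(2):
        os.makedirs(os.path.join(root, f"rank{r}"))
    print(resolve_model_path(root, tp_rank=1, tp_size=2))  # ('.../rank1', True)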

fastdeploy/model_executor/model_loader/default_loader_v1.py

Lines changed: 3 additions & 1 deletion
@@ -20,6 +20,7 @@

 from fastdeploy.config import FDConfig, LoadConfig, ModelConfig
 from fastdeploy.model_executor.load_weight_utils import (
+    get_model_path,
     get_weight_iterator,
     is_weight_cache_enabled,
     load_weights_from_cache,
@@ -51,7 +52,8 @@ def clean_memory_fragments(self) -> None:
     @save_model()
     @measure_time()
     def load_weights(self, model, fd_config: FDConfig, enable_cache: bool = False) -> None:
-        weights_iterator = get_weight_iterator(fd_config.model_config.model)
+        model_path = get_model_path(fd_config)
+        weights_iterator = get_weight_iterator(model_path)
         if enable_cache:
             load_weights_from_cache(model, weights_iterator)
         else:

fastdeploy/model_executor/models/deepseek_v3.py

Lines changed: 1 addition & 0 deletions
@@ -677,6 +677,7 @@ def load_weights(self, weights_iterator) -> None:
         params_dict = dict(self.named_parameters())
         process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()), self.fd_config)
         for loaded_weight_name, loaded_weight in weights_iterator:
+            logger.debug(f"Loading weight: {loaded_weight_name}")
             loaded_weight_name = loaded_weight_name.replace("deepseek_v3", "model")
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in loaded_weight_name:

fastdeploy/model_executor/models/ernie4_5_moe.py

Lines changed: 1 addition & 0 deletions
@@ -633,6 +633,7 @@ def load_weights(self, weights_iterator) -> None:
         )

         for loaded_weight_name, loaded_weight in weights_iterator:
+            logger.debug(f"Loading weight: {loaded_weight_name}")
             loaded_weight_name = loaded_weight_name.replace("model", "ernie")
             for param_name, weight_name, exp_id, shard_id, is_moe in all_param_mapping:
                 loaded_weight_name = checkpoint_to_fd_key_fn(loaded_weight_name, is_moe)

fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py

Lines changed: 1 addition & 0 deletions
@@ -705,6 +705,7 @@ def load_weights(self, weights_iterator) -> None:
         expert_id = None
         shard_id = None
         for loaded_weight_name, loaded_weight in weights_iterator:
+            logger.debug(f"Loading weight: {loaded_weight_name}")
             loaded_weight_name = (
                 self.process_weights_before_loading_fn(loaded_weight_name)
                 if getattr(self, "process_weights_before_loading_fn", None)

0 commit comments
