
Commit d7c88e5

[veomni] feat: support offloading/loading the veomni model/optimizer (verl-project#4916)
### What does this PR do?

This PR adds support for offloading both the model and the optimizer (in VeOmni style) to CPU, as well as onloading them back to the device. Additionally, it includes two model conversion scripts required by VeOmni:

- `moe_merge.py`: converts models from Hugging Face (HF) format into a format compatible with VeOmni.
- `moe_split.py`: converts checkpoints generated by VeOmni training back into HF format.

### Checklist Before Starting

- [ ] Search for similar PRs. Paste at least one query link here: ...
- [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`
  - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### API and Usage Example

> Demonstrate how the API changes if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)
- [ ] If your PR is related to the `recipe` submodule, please also update the reference to the submodule commit via `git submodule update --remote` or `cd recipe && git pull origin main`.
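A hedged usage sketch of the two conversion scripts described above. Only the script names and CLI flags come from this PR; the checkpoint paths are placeholders and the training step in between is elided.

```python
# Round trip: HF checkpoint -> VeOmni stacked-expert layout -> train -> back to HF format.
import subprocess

# 1. Merge per-expert HF weights into stacked tensors for VeOmni.
subprocess.run(
    [
        "python", "scripts/veomni/moe_merge.py",
        "--raw_hf_path", "/ckpts/qwen3-moe-hf",            # placeholder path
        "--merge_hf_path", "/ckpts/qwen3-moe-veomni",      # placeholder path
    ],
    check=True,
)

# 2. ... train with the verl VeOmni engine on the merged checkpoint ...

# 3. Split the stacked tensors of the trained checkpoint back into per-expert HF weights.
subprocess.run(
    [
        "python", "scripts/veomni/moe_split.py",
        "--merge_hf_path", "/ckpts/qwen3-moe-veomni-trained",  # placeholder path
        "--split_hf_path", "/ckpts/qwen3-moe-hf-final",        # placeholder path
    ],
    check=True,
)
```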
1 parent 750d5b5 commit d7c88e5

5 files changed

Lines changed: 336 additions & 1 deletion


scripts/veomni/moe_merge.py

Lines changed: 121 additions & 0 deletions
```python
# Copyright 2025 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Merge individual MoE expert weights into stacked tensors for efficient loading.

This script takes a HuggingFace checkpoint with individual expert weights
(e.g., model.layers.{i}.mlp.experts.{j}.gate_proj.weight) and merges them
into stacked tensors (e.g., model.layers.{i}.mlp.experts.gate_proj) for
faster loading and better memory efficiency in VeOmni.

The merging process:
1. Loads individual expert weights from the HF checkpoint
2. Stacks them into single tensors for each projection type
3. Handles all three projection types: gate_proj, up_proj, down_proj
4. Supports both Qwen3-MoE (num_experts) and DeepSeek (n_routed_experts) formats
5. Handles models with initial dense layers (first_k_dense_replace)

Usage: python moe_merge.py --raw_hf_path <input_checkpoint> --merge_hf_path <output_dir>
"""

import os
from argparse import ArgumentParser
from dataclasses import dataclass
from glob import glob
from typing import Generator

import torch
from safetensors.torch import safe_open
from tqdm import tqdm
from transformers import AutoConfig
from veomni.models import build_tokenizer, save_model_weights


@dataclass
class StateDictIterator:
    filepath: str

    def __iter__(self) -> Generator[tuple[str, "torch.Tensor"], None, None]:
        if self.filepath.endswith(".safetensors"):
            with safe_open(self.filepath, framework="pt", device="cpu") as f:
                for key in f.keys():
                    yield key, f.get_tensor(key)

        else:
            state_dict = torch.load(self.filepath, map_location="cpu", weights_only=True, mmap=True)
            for key in state_dict.keys():
                yield key, state_dict[key]


def main(raw_hf_path, merge_hf_path):
    torch.set_default_dtype(torch.bfloat16)
    os.makedirs(merge_hf_path, exist_ok=True)

    config = AutoConfig.from_pretrained(raw_hf_path)
    tokenizer = build_tokenizer(raw_hf_path)

    safetensor_files = list(glob(os.path.join(raw_hf_path, "*.safetensors")))
    safetensor_files.sort()
    state_dict_iterators = [StateDictIterator(shard_file) for shard_file in safetensor_files]
    new_state_dict = {}
    for state_dict_iterator in tqdm(state_dict_iterators, desc="Loading checkpoint shards"):
        for name, tensor in state_dict_iterator:
            new_state_dict[name] = tensor.cpu()

    print(new_state_dict.keys())

    if hasattr(config, "num_experts"):
        # qwen3moe
        num_experts = config.num_experts
    elif hasattr(config, "n_routed_experts"):
        # deepseek
        num_experts = config.n_routed_experts
    else:
        raise RuntimeError("could not find how many experts to assign")
    num_hidden_layers = config.num_hidden_layers

    if hasattr(config, "first_k_dense_replace"):
        # deepseek first k dense layer
        moe_layer_start_idx = config.first_k_dense_replace
    else:
        # moe layer only in the model
        moe_layer_start_idx = 0

    for i in range(moe_layer_start_idx, num_hidden_layers):
        gate_proj = []
        for j in range(num_experts):
            gate_proj.append(new_state_dict.pop(f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight"))

        new_state_dict[f"model.layers.{i}.mlp.experts.gate_proj"] = torch.stack(gate_proj)
        up_proj = []
        for j in range(num_experts):
            up_proj.append(new_state_dict.pop(f"model.layers.{i}.mlp.experts.{j}.up_proj.weight"))

        new_state_dict[f"model.layers.{i}.mlp.experts.up_proj"] = torch.stack(up_proj)
        down_proj = []
        for j in range(num_experts):
            down_proj.append(new_state_dict.pop(f"model.layers.{i}.mlp.experts.{j}.down_proj.weight"))

        new_state_dict[f"model.layers.{i}.mlp.experts.down_proj"] = torch.stack(down_proj)

    model_assets = [config, tokenizer]
    save_model_weights(merge_hf_path, new_state_dict, model_assets=model_assets)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--raw_hf_path", type=str, required=True)
    parser.add_argument("--merge_hf_path", type=str, required=True)
    args = parser.parse_args()
    main(args.raw_hf_path, args.merge_hf_path)
```
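To make the key renaming concrete, here is a toy illustration (not part of the PR) of what the merge does to a layer's expert weights; the two-expert count and tensor shapes are made up:

```python
import torch

# A miniature HF-style state dict: 2 experts in layer 0, each with its own weight tensor.
num_experts = 2
state_dict = {
    f"model.layers.0.mlp.experts.{j}.{proj}.weight": torch.randn(4, 8)
    for j in range(num_experts)
    for proj in ("gate_proj", "up_proj", "down_proj")
}

# The merge pops the per-expert weights and stacks them into one tensor per projection,
# mirroring the loop in moe_merge.py above.
for proj in ("gate_proj", "up_proj", "down_proj"):
    stacked = torch.stack(
        [state_dict.pop(f"model.layers.0.mlp.experts.{j}.{proj}.weight") for j in range(num_experts)]
    )
    state_dict[f"model.layers.0.mlp.experts.{proj}"] = stacked

print(state_dict["model.layers.0.mlp.experts.gate_proj"].shape)  # torch.Size([2, 4, 8])
```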

scripts/veomni/moe_split.py

Lines changed: 96 additions & 0 deletions
```python
# Copyright 2025 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Reverse process of moe_merge.py - splits merged MoE expert weights back to individual experts.

This script takes a HF checkpoint that has been processed by moe_merge.py (where expert weights
are stacked into single tensors) and splits them back to the original format with individual
expert weights.

The process reverses the merging by:
1. Loading stacked tensors like model.layers.{i}.mlp.experts.gate_proj
2. Unstacking them back to individual experts model.layers.{i}.mlp.experts.{j}.gate_proj.weight
3. Handling all three projection types: gate_proj, up_proj, down_proj

Usage: python moe_split.py --merge_hf_path <merged_checkpoint> --split_hf_path <output_dir>
"""

import os
from argparse import ArgumentParser
from dataclasses import dataclass
from glob import glob
from typing import Generator

import torch
from safetensors.torch import safe_open
from tqdm import tqdm
from transformers import AutoConfig
from veomni.models import build_tokenizer, save_model_weights


@dataclass
class StateDictIterator:
    filepath: str

    def __iter__(self) -> Generator[tuple[str, "torch.Tensor"], None, None]:
        if self.filepath.endswith(".safetensors"):
            with safe_open(self.filepath, framework="pt", device="cpu") as f:
                for key in f.keys():
                    yield key, f.get_tensor(key)

        else:
            state_dict = torch.load(self.filepath, map_location="cpu", weights_only=True, mmap=True)
            for key in state_dict.keys():
                yield key, state_dict[key]


def main(merge_hf_path, split_hf_path):
    torch.set_default_dtype(torch.bfloat16)
    os.makedirs(split_hf_path, exist_ok=True)

    config = AutoConfig.from_pretrained(merge_hf_path)
    tokenizer = build_tokenizer(merge_hf_path)

    safetensor_files = list(glob(os.path.join(merge_hf_path, "*.safetensors")))
    safetensor_files.sort()
    state_dict_iterators = [StateDictIterator(shard_file) for shard_file in safetensor_files]
    new_state_dict = {}
    for state_dict_iterator in tqdm(state_dict_iterators, desc="Loading checkpoint shards"):
        for name, tensor in state_dict_iterator:
            new_state_dict[name] = tensor.cpu()

    num_experts = config.num_experts
    num_hidden_layers = config.num_hidden_layers
    for i in range(num_hidden_layers):
        print(f"Converting layer {i}")
        for proj_name in ["gate_proj", "up_proj", "down_proj"]:
            stacked_key = f"model.layers.{i}.mlp.experts.{proj_name}"
            if stacked_key in new_state_dict:
                stacked_tensor = new_state_dict.pop(stacked_key)
                for j in range(num_experts):
                    expert_key = f"model.layers.{i}.mlp.experts.{j}.{proj_name}.weight"
                    new_state_dict[expert_key] = stacked_tensor[j]

    model_assets = [config, tokenizer]

    print("Saving to safetensors")
    save_model_weights(split_hf_path, new_state_dict, model_assets=model_assets)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--merge_hf_path", type=str, required=True)
    parser.add_argument("--split_hf_path", type=str, required=True)
    args = parser.parse_args()
    main(args.merge_hf_path, args.split_hf_path)
```
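After a merge → split round trip, the split output should match the original HF checkpoint. A hedged sanity check one could run (not part of the PR; the paths and helper below are placeholders):

```python
import os
from glob import glob

import torch
from safetensors.torch import safe_open


def load_all_shards(path):
    """Load every *.safetensors shard under `path` into a single CPU state dict."""
    state = {}
    for shard in sorted(glob(os.path.join(path, "*.safetensors"))):
        with safe_open(shard, framework="pt", device="cpu") as f:
            for key in f.keys():
                state[key] = f.get_tensor(key)
    return state


original = load_all_shards("/ckpts/qwen3-moe-hf")         # placeholder: original HF checkpoint
roundtrip = load_all_shards("/ckpts/qwen3-moe-hf-final")  # placeholder: output of moe_split.py

assert original.keys() == roundtrip.keys(), "key sets differ after the round trip"
for key in original:
    torch.testing.assert_close(original[key], roundtrip[key])
print("round trip OK")
```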

verl/trainer/config/engine/veomni.yaml

Lines changed: 6 additions & 0 deletions
```diff
@@ -1,6 +1,12 @@
 # Target class for this configuration
 _target_: verl.workers.config.VeOmniEngineConfig
 
+# Whether to offload model parameters to CPU
+param_offload: False
+
+# Whether to offload optimizer state to CPU
+optimizer_offload: False
+
 # fsdp or fsdp2
 data_parallel_mode: fsdp2
 
```
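A minimal sketch of the new flags in code form. In a real run they are picked up from this YAML file; constructing the dataclass directly, as below, is purely illustrative and assumes the remaining `VeOmniEngineConfig` fields have defaults:

```python
from verl.workers.config import VeOmniEngineConfig

# Hypothetical direct construction, for illustration only — the normal path is the YAML above.
engine_config = VeOmniEngineConfig(
    param_offload=True,       # offload model parameters to CPU when the engine is idle
    optimizer_offload=True,   # offload optimizer state to CPU when the engine is idle
    data_parallel_mode="fsdp2",
)
```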

verl/utils/veomni_utils.py

Lines changed: 78 additions & 0 deletions
```python
# Copyright 2025 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from verl.utils.device import get_device_id, get_torch_device


@torch.no_grad()
def offload_veomni_model_to_cpu(model, empty_cache: bool = True):
    from veomni.distributed.parallel_state import get_parallel_state

    assert get_parallel_state().dp_mode == "fsdp2", "Only support fsdp2 offloading for VeOmni model"

    model.cpu()
    if empty_cache:
        get_torch_device().empty_cache()


@torch.no_grad()
def load_veomni_model_to_gpu(model):
    from veomni.distributed.parallel_state import get_parallel_state

    assert get_parallel_state().dp_mode == "fsdp2", "Only support fsdp2 offloading for VeOmni model"

    device = get_device_id()
    model.to(device)


@torch.no_grad()
def offload_veomni_optimizer(optimizer):
    optimizers = []
    # Check if this is a MultiOptimizer (for ep and non-ep parameters when ep+fsdp2 is enabled)
    if hasattr(optimizer, "_is_multi_optimizer") and optimizer._is_multi_optimizer:
        optimizers.extend(optimizer.optimizers_dict.values())
    else:
        optimizers.append(optimizer)

    for opt in optimizers:
        if not opt.state:
            continue
        for param_group in opt.param_groups:
            for param in param_group["params"]:
                state = opt.state[param]
                for key, value in state.items():
                    if isinstance(value, torch.Tensor):
                        state[key] = value.to("cpu", non_blocking=True)


@torch.no_grad()
def load_veomni_optimizer(optimizer, device_id):
    optimizers = []
    # Check if this is a MultiOptimizer (for ep and non-ep parameters when ep+fsdp2 is enabled)
    if hasattr(optimizer, "_is_multi_optimizer") and optimizer._is_multi_optimizer:
        optimizers.extend(optimizer.optimizers_dict.values())
    else:
        optimizers.append(optimizer)

    for opt in optimizers:
        if not opt.state:
            continue
        for param_group in opt.param_groups:
            for param in param_group["params"]:
                state = opt.state[param]
                for key, value in state.items():
                    if isinstance(value, torch.Tensor):
                        state[key] = value.to(device_id, non_blocking=True)
```
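A hedged usage sketch of these helpers. `model` is assumed to be an fsdp2-wrapped VeOmni module and `optimizer` the optimizer built by the VeOmni engine (possibly a MultiOptimizer); both are stand-ins here, not objects defined in this snippet:

```python
from verl.utils.device import get_device_id
from verl.utils.veomni_utils import (
    load_veomni_model_to_gpu,
    load_veomni_optimizer,
    offload_veomni_model_to_cpu,
    offload_veomni_optimizer,
)

# Free accelerator memory between training phases (e.g., while rollout is running).
offload_veomni_model_to_cpu(model, empty_cache=True)
offload_veomni_optimizer(optimizer)

# Bring parameters and optimizer state back before the next training step.
load_veomni_model_to_gpu(model)
load_veomni_optimizer(optimizer, get_device_id())
```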

verl/workers/engine/veomni/transformer_impl.py

Lines changed: 35 additions & 1 deletion
```diff
@@ -31,9 +31,15 @@
 from verl.trainer.config import CheckpointConfig
 from verl.utils import tensordict_utils as tu
 from verl.utils.checkpoint.fsdp_checkpoint_manager import FSDPCheckpointManager
-from verl.utils.device import get_device_id
+from verl.utils.device import get_device_id, get_device_name
 from verl.utils.fsdp_utils import fsdp_version
 from verl.utils.profiler import log_gpu_memory_usage
+from verl.utils.veomni_utils import (
+    load_veomni_model_to_gpu,
+    load_veomni_optimizer,
+    offload_veomni_model_to_cpu,
+    offload_veomni_optimizer,
+)
 from verl.workers.config import HFModelConfig, VeOmniEngineConfig, VeOmniOptimizerConfig
 from verl.workers.sharding_manager.fsdp_ulysses import FSDPUlyssesShardingManager
 
@@ -217,6 +223,34 @@ def _build_model_optimizer(self):
             self.engine_config.activation_gpu_limit,
         )
 
+    def to(self, device: str, model: bool = True, optimizer: bool = True, grad: bool = True):
+        """
+        Move model parameters, optimizer states, or both to the specified device.
+        Note that this function executes irrespective of offload config. It serves as manual control.
+
+        Args:
+            device: Target device identifier.
+            model: If True, move the model.
+            optimizer: If True, move the optimizer states.
+        """
+        super(FSDPEngine, self).to(device=device, model=model, optimizer=optimizer, grad=grad)
+
+        device_name = get_device_name()
+
+        assert device in (device_name, "cpu")
+        if device == device_name:
+            if model:
+                load_veomni_model_to_gpu(self.module)
+            if optimizer and self.optimizer is not None:
+                load_veomni_optimizer(self.optimizer, device)
+        elif device == "cpu":
+            if model:
+                offload_veomni_model_to_cpu(self.module)
+            if optimizer and self.optimizer is not None:
+                offload_veomni_optimizer(self.optimizer)
+        else:
+            raise ValueError(f"Invalid device type: {device}")
+
     def optimizer_step(self):
         """
         Perform an optimization step using the optimizer.
```
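A hedged sketch of the manual-control path this `to()` method adds. `engine` is assumed to be an already-built VeOmni engine instance (not constructed here), and the move happens regardless of the `param_offload`/`optimizer_offload` settings:

```python
from verl.utils.device import get_device_name

engine.to("cpu")               # offload module parameters and optimizer state to host memory
# ... run a memory-hungry phase such as rollout ...
engine.to(get_device_name())   # onload them back onto the accelerator
```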
