Skip to content

Commit 84b12a5

Browse files
set fsdp2 as the default data parallel mode for VeOmniEngine
1 parent 87fc67e commit 84b12a5

File tree

7 files changed

+5
-15
lines changed

7 files changed

+5
-15
lines changed

.github/workflows/e2e_ppo_trainer_veomni_vllm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ jobs:
134134
- name: Running GEO3K E2E training tests on 8 L20 GPUs with veomni engine (FSDP_SIZE=8, USP=1)
135135
run: |
136136
ray stop --force
137-
MODEL_ID=Qwen/Qwen3-VL-2B-Instruct TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/gsm8k/test.parquet VAL_BEFORE_TRAIN=True NUM_GPUS=8 FSDP_SIZE=8 SP_SIZE=1 EP_SIZE=1 VERL_EXP_NAME="qwen3-2b-vl-function-reward-minimal-fsdp-size4" bash tests/special_e2e/run_ppo_trainer_veomni.sh
137+
MODEL_ID=Qwen/Qwen3-VL-2B-Instruct TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/gsm8k/test.parquet VAL_BEFORE_TRAIN=True NUM_GPUS=8 FSDP_SIZE=8 SP_SIZE=1 EP_SIZE=1 VERL_EXP_NAME="qwen3-2b-vl-function-reward-minimal-fsdp-size8" bash tests/special_e2e/run_ppo_trainer_veomni.sh
138138
139139
cleanup:
140140
runs-on: ubuntu-latest

tests/special_e2e/run_ppo_trainer_veomni.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ python3 -m verl.trainer.main_ppo --config-path=config\
3131
actor_rollout_ref.model.enable_gradient_checkpointing=True \
3232
actor_rollout_ref.actor.veomni.param_offload=True \
3333
actor_rollout_ref.actor.veomni.optimizer_offload=True \
34-
actor_rollout_ref.actor.veomni.data_parallel_mode=fsdp2 \
3534
actor_rollout_ref.actor.ppo_mini_batch_size=8 \
3635
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
3736
actor_rollout_ref.actor.use_kl_loss=True \

tests/special_e2e/sft/run_sft_engine.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ VEOMNI_ENGINE_CONFIG="\
6767
optim.lr_min=1e-6 \
6868
optim.lr_scheduler_type=cosine \
6969
engine.ulysses_parallel_size=${SP_SIZE} \
70-
engine.data_parallel_mode=${FSDP_STRATEGY} \
7170
engine.data_parallel_size=${FSDP_SIZE}"
7271

7372

verl/trainer/config/engine/veomni.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@ param_offload: False
77
# Whether to offload optimizer state to CPU
88
optimizer_offload: False
99

10-
# fsdp or fsdp2
11-
data_parallel_mode: fsdp2
12-
1310
data_parallel_size: 1
1411

1512
data_parallel_replicate_size: 1

verl/trainer/config/reward_model/veomni_reward_model.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@ veomni:
1515
# Target configuration dataclass
1616
_target_: verl.workers.config.VeOmniEngineConfig
1717

18-
# fsdp or fsdp2
19-
data_parallel_mode: fsdp2
20-
2118
data_parallel_size: 1
2219

2320
data_parallel_replicate_size: 1

verl/workers/config/engine.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,6 @@ class VeOmniEngineConfig(EngineConfig):
202202
pipeline_parallel_size (int): Pipeline parallel size, default 1
203203
context_parallel_size (int): Ring-attn context parallel size, default 1
204204
ulysses_parallel_size (int): Ulysses sequence parallel size, default 1
205-
data_parallel_mode (str): Data parallel mode, default "fsdp"
206205
init_device (str): Device to initialize model weights.
207206
1. `cpu`: Init parameters on CPU in rank0 only.
208207
2. `cuda`: Init parameters on GPU.
@@ -259,7 +258,6 @@ class VeOmniEngineConfig(EngineConfig):
259258
pipeline_parallel_size: int = 1
260259
context_parallel_size: int = 1
261260
ulysses_parallel_size: int = 1
262-
data_parallel_mode: Literal["ddp", "fsdp1", "fsdp2"] = "fsdp"
263261
seed: int = 42
264262
full_determinism: bool = False
265263
mixed_precision: bool = False

verl/workers/engine/veomni/transformer_impl.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ def __init__(
7575
self.engine_config = engine_config
7676
self.optimizer_config = optimizer_config
7777
self.checkpoint_config = checkpoint_config
78-
assert self.engine_config.data_parallel_mode == "fsdp2", "VeOmniEngine only supports fsdp2."
79-
78+
# VeOmniEngine only supports fsdp2.
79+
self.data_parallel_mode = "fsdp2"
8080
self.rank = dist.get_rank()
8181

8282
parallel_state.init_parallel_state(
@@ -88,7 +88,7 @@ def __init__(
8888
pp_size=self.engine_config.pipeline_parallel_size,
8989
cp_size=self.engine_config.context_parallel_size,
9090
ulysses_size=self.engine_config.ulysses_parallel_size,
91-
dp_mode=self.engine_config.data_parallel_mode,
91+
dp_mode=self.data_parallel_mode,
9292
)
9393

9494
if self.engine_config.full_determinism:
@@ -155,7 +155,7 @@ def _build_optimizer(self, module):
155155
)
156156
get_optimizer_pre_hook = getattr(module, "get_optimizer_pre_hook", None)
157157
if get_optimizer_pre_hook is not None:
158-
optimizer_pre_hook = get_optimizer_pre_hook(module, module.config, self.engine_config.data_parallel_mode)
158+
optimizer_pre_hook = get_optimizer_pre_hook(module, module.config, self.data_parallel_mode)
159159
optimizer.register_step_pre_hook(optimizer_pre_hook)
160160

161161
return optimizer

0 commit comments

Comments
 (0)