Skip to content

Commit 74d6d7c

Browse files
committed
add rl support for veomni backend
1 parent f65fd72 commit 74d6d7c

File tree

11 files changed

+622
-31
lines changed

11 files changed

+622
-31
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/env bash
# GRPO training on the geo3k multimodal dataset using the VeOmni backend
# (Qwen2.5-VL-3B-Instruct, 2 GPUs, 1 node).
# Usage: script.sh [rollout_engine] [extra hydra overrides...]
set -x

# Rollout engine (first positional arg); defaults to vllm.
ENGINE=${1:-vllm}

# Some models are optimized by vllm ascend. While in some case, e.g. rlhf training,
# the optimized model may not be suitable. In this case, set this value to 0 to disable the optimized model.
export USE_OPTIMIZED_MODEL=0

# Launch PPO trainer with GRPO advantage estimation; any further CLI args are
# forwarded as additional Hydra overrides.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name="ppo_veomni_trainer.yaml" \
    algorithm.adv_estimator=grpo \
    data.train_files=$HOME/data/geo3k/train.parquet \
    data.val_files=$HOME/data/geo3k/test.parquet \
    data.train_batch_size=16 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.image_key=images \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-3B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.veomni.param_offload=True \
    actor_rollout_ref.actor.veomni.optimizer_offload=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.01 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.use_torch_compile=False \
    actor_rollout_ref.actor.veomni.data_parallel_size=2 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=$ENGINE \
    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.enable_chunked_prefill=False \
    actor_rollout_ref.rollout.enforce_eager=True \
    actor_rollout_ref.rollout.free_cache_engine=True \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.ref.veomni.param_offload=True \
    algorithm.use_kl_in_reward=False \
    trainer.use_legacy_worker_impl=disable \
    trainer.critic_warmup=0 \
    trainer.logger=console \
    trainer.project_name='verl_grpo_example_geo3k' \
    trainer.experiment_name='qwen2_5_vl_3b_function_rm' \
    trainer.n_gpus_per_node=2 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=-1 \
    trainer.total_epochs=15 "$@"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# veomni actor config, inheriting from trainer/config/actor/actor.yaml
defaults:
  # veomni optimizer config
  - ../optim@optim: veomni

  # veomni engine config
  - ../engine@veomni: veomni

  # base actor defaults (trainer/config/actor/actor.yaml)
  - actor

  # load the reference default config, then apply the fields in the current yaml
  - _self_

# Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
_target_: verl.workers.config.VeOmniActorConfig

# select the veomni training backend for the actor worker
strategy: veomni
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# veomni critic config, inheriting from trainer/config/critic/critic.yaml
# defaults specify the default config from each component
defaults:

  # veomni optimizer config
  - ../optim@optim: veomni

  # veomni engine config
  - ../engine@veomni: veomni

  # critic config, inheriting from trainer/config/critic/critic.yaml
  - critic

  # load the reference default config, then apply the fields in the current yaml
  - _self_

# Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
_target_: verl.workers.config.VeOmniCriticConfig

# select the veomni training backend for the critic worker
strategy: veomni

# model config for the critic
model:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.trainer.config.BaseModelConfig

# seed for data loader; follows the actor's seed when set, otherwise null
# NOTE(review): placed at top level (sibling of `model`) to mirror other critic
# configs — original indentation was lost in this copy; confirm nesting.
data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null}
29+
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
# specify the default per-component configs
defaults:
  # <folder_name>@<field_name>.<field_name>: <yaml_file_name>
  # actor_rollout_ref.actor: trainer/config/actor/veomni_actor.yaml
  - actor@actor_rollout_ref.actor: veomni_actor
  # data: trainer/config/data/legacy_data.yaml
  - data@data: legacy_data
  # (Rule-based) Reward manager config.
  # NOTE(review): no option value given for this entry — presumably Hydra picks
  # the group default; confirm against trainer/config/reward_manager/.
  - reward_manager@reward_manager
  # Reference model config.
  # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True.
  - ref@actor_rollout_ref.ref: veomni_ref
  # Rollout model config.
  - rollout@actor_rollout_ref.rollout: rollout
  # Model config.
  - model@actor_rollout_ref.model: hf_model
  # Critic model config.
  - critic@critic: veomni_critic
  # Reward model config.
  - reward_model@reward_model: veomni_reward_loop
  # Rollout correction config.
  - algorithm@algorithm.rollout_correction: rollout_correction
  # load the reference default config, then apply the fields in the current yaml
  - _self_
26+
actor_rollout_ref:
  # train and rollout share the same GPUs (colocated workers)
  hybrid_engine: True

  nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using veomni

  model:
    # extra overrides applied on top of the HF model config
    override_config:
      model_config: {}
      moe_config:
        # whether to keep MoE router weights frozen during training
        freeze_moe_router: False

    use_fused_kernels: False # Whether to use custom fused kernels (PostProcessing, for memory efficiency)

    # whether to trust remote code when loading the HF model
    trust_remote_code: False

    # Whether to remove padding tokens in inputs during training
    use_remove_padding: false

    # LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning
    lora:
      # LoRA type: "lora", "vlm_lora", "canonical_lora", or "dora"
      type: lora

      # LoRA rank (Dimension of the low-rank projection space.). Set to 0 to disable LoRA
      rank: 0 # typical values: 8, 16, 32, 64

      # Weighting factor for the low-rank projection. Defaults to 32
      alpha: 32

      # Dropout rate for the low-rank projection. Defaults to 0.0
      dropout: 0.0

      # A list of module names to apply LoRA to.
      # For fused LoRA, Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'].
      # For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"]
      # - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention
      # - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention
      # - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP
      # - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP
      # Target modules can also contain wildcards. For example, you can specify
      # target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers
      target_modules:
        - linear_qkv
        - linear_proj
        - linear_fc1
        - linear_fc2

      # A list of module names not to apply LoRa to. It will match all nn.Linear & nn.Linear-adjacent modules whose name
      # does not match any string in exclude_modules. If used, will require target_modules to be empty list or None
      exclude_modules: []

      # Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre'
      dropout_position: pre

      # Initialization method for the low-rank matrix A. Defaults to "xavier".
      lora_A_init_method: xavier

      # Initialization method for the low-rank matrix B. Defaults to "zero".
      lora_B_init_method: zero

      # Enables the experimental All-to-All (A2A) communication strategy. Defaults to False
      a2a_experimental: False

      # Parameter data type for LoRA weights. Default to null, which will use model's dtype.
      dtype: null

      # Path to pre-trained LoRA adapter weights (null to train from scratch)
      adapter_path: null

      # VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen.
      # For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully
      # finetune the vision model.
      freeze_vision_model: True
      freeze_vision_projection: True
      freeze_language_model: True

  rollout:
    # weight quantization for the rollout engine; null disables quantization
    quantization: null

    # mapping from fused training-layer names to rollout-engine layer names
    layer_name_map:
      qkv_layer_name: qkv
      gate_proj_layer_name: gate_up
# Optional user-supplied reward function; loaded from `path` when set,
# otherwise the built-in (rule-based) reward manager is used.
custom_reward_function:
  # path to the python file containing the reward function; null disables it
  path: null
  # name of the function to import from that file
  name: compute_score
algorithm:
  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.trainer.config.AlgoConfig
  # discount factor for return computation
  gamma: 1.0
  # GAE lambda
  lam: 1.0
  # advantage estimator: gae by default (overridable, e.g. grpo)
  adv_estimator: gae
  # whether to normalize advantages by std in GRPO
  norm_adv_by_std_in_grpo: True
  # whether to add a KL penalty to the reward (in addition to actor.use_kl_loss)
  use_kl_in_reward: False
  kl_penalty: kl # how to estimate kl divergence
  kl_ctrl:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.trainer.config.KLControlConfig
    # KL controller type: fixed coefficient (vs. adaptive)
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
  # prioritized-sampling PPO
  use_pf_ppo: False
  pf_ppo:
    reweight_method: pow # ["pow", "max_min", "max_random"]
    weight_pow: 2.0
trainer:
  # balance token counts across data-parallel ranks each step
  balance_batch: True
  total_epochs: 30
  # overrides total_epochs when set
  total_training_steps: null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: ["console", "wandb"]
  # number of validation generations to log; 0 disables
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  # checkpoint save frequency in steps; -1 disables periodic saving
  save_freq: -1
  esi_redundant_time: 0

  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or disable or resume_path if resume_from_path is set
  resume_from_path: null
  del_local_ckpt_after_load: False
  val_before_train: True
  # validation frequency in steps; -1 disables periodic validation
  test_freq: -1
  # number of steps to train the critic before the actor starts updating
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null
  # The timeout for ray worker group to wait for the register center to be ready
  ray_wait_register_center_timeout: 300
  device: cuda
  # Directory for logging rollout data; no dump if null
  rollout_data_dir: null

  # whether to use legacy worker implementation
  # mode: "auto", "enable", or "disable"
  use_legacy_worker_impl: auto
168+
# global profiler settings (applies to all workers)
global_profiler:
  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.utils.profiler.ProfilerConfig
  tool: null # choose between nsys, npu, torch, torch_memory
  steps: null # profile steps
  profile_continuous_steps: False
  save_path: "outputs/profile" # profiler saving path
  # Specific tool configs, can use +profiler.tool_config.[tool].xxx to config
  global_tool_config:
    # nsys config
    nsys:
      # True for each task has its own database, False for all tasks in one training step share one database.
      discrete: False

      # controller Nvidia Nsight Systems Options. Must set when profile_steps is not None.
      ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
      ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
      controller_nsight_options:
        # Select the API(s) to be traced.
        trace: "cuda,nvtx,cublas,ucx"

        # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
        cuda-memory-usage: "true"

        # CUDA graphs will be traced as a whole
        cuda-graph-trace: "graph"

      # worker Nvidia Nsight Systems Options. Must set when profile_steps is not None.
      worker_nsight_options:
        # Select the API(s) to be traced.
        trace: "cuda,nvtx,cublas,ucx"

        # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
        cuda-memory-usage: "true"

        # CUDA graphs will be traced as a whole
        cuda-graph-trace: "graph"

        # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config.
        capture-range: "cudaProfilerApi"

        # Specify the desired behavior when a capture range ends.
        # In verl we need the torch.cuda.profiler.start/stop pair to repeats n times.
        # valid values are "repeat-shutdown:n" or null.
        # For normal whole step profiling, n = len(profile_steps);
        # but for discrete profiling, n = len(profile_steps) * Number(subtasks).
        # Or you can just leave it null and the program will use n = len(profile_steps) * 6;
        capture-range-end: null

        # Send signal to the target application's process group. We let the program to exit by itself.
        kill: none

    # enable memory visualization for debugging memory usage
    torch_memory:
      # Maximum number of allocation entries to record
      trace_alloc_max_entries: 100_000
      # The depth of the call stack to capture for each allocation
      stack_depth: 32
      # 'alloc': records only allocation events || 'state': records memory state changes || 'all': records both.
      context: "all"
      # 'python': records Python stacks || 'cpp': records C++ stacks (available in some versions) || 'all': records both.
      stacks: "all"
      # devices, record_context etc.
      kw_args: {}
232+
# configs for TransferQueue
transfer_queue:
  # Whether to enable transfer queue
  enable: False

# keyword arguments forwarded to Ray
ray_kwargs:
  ray_init:
    num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then.
  # path to dump the Ray timeline trace; null disables
  timeline_json_file: null
241+
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# veomni ref config, inheriting from trainer/config/ref/ref.yaml
defaults:
  - ref

  # veomni engine config
  - ../engine@veomni: veomni

  # load the reference default config, then apply the fields in the current yaml
  - _self_

# Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
# NOTE(review): the reference worker reuses the actor config dataclass —
# confirm this is intentional (there is no separate VeOmniRefConfig here).
_target_: verl.workers.config.VeOmniActorConfig

# select the veomni backend for the reference worker
strategy: veomni

# veomni engine settings; each parallelism degree mirrors the actor's setting
# when present, otherwise falls back to the listed default.
veomni:
  seed: ${oc.select:actor_rollout_ref.actor.veomni.seed,42}
  data_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.data_parallel_size,1}
  data_parallel_replicate_size: ${oc.select:actor_rollout_ref.actor.veomni.data_parallel_replicate_size,1}
  data_parallel_shard_size: ${oc.select:actor_rollout_ref.actor.veomni.data_parallel_shard_size,1}
  tensor_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.tensor_parallel_size,1}
  expert_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.expert_parallel_size,1}
  pipeline_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.pipeline_parallel_size,1}
  context_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.context_parallel_size,1}
  ulysses_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.ulysses_parallel_size,1}
  param_offload: ${oc.select:actor_rollout_ref.actor.veomni.param_offload,False}
  # reference model only computes log-probs; it never runs a backward pass
  forward_only: True
27+

0 commit comments

Comments
 (0)