Skip to content

Commit 27c4a88

Browse files
committed
add rl support for veomni backend
1 parent f65fd72 commit 27c4a88

File tree

12 files changed

+627
-119
lines changed

12 files changed

+627
-119
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/env bash
# Launch GRPO training on the geo3k (geometry VQA) dataset with
# Qwen2.5-VL-3B-Instruct using the veomni training backend.
#
# Usage: run_qwen2_5_vl_3b_veomni.sh [ENGINE] [extra hydra overrides...]
#   ENGINE: rollout engine name, defaults to "vllm".
set -x

ENGINE=${1:-vllm}

# Some models are optimized by vllm ascend. While in some cases, e.g. RLHF training,
# the optimized model may not be suitable. In this case, set this value to 0 to
# disable the optimized model.
export USE_OPTIMIZED_MODEL=0

python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name="ppo_veomni_trainer.yaml" \
    algorithm.adv_estimator=grpo \
    data.train_files=$HOME/data/geo3k/train.parquet \
    data.val_files=$HOME/data/geo3k/test.parquet \
    data.train_batch_size=16 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.image_key=images \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-3B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.veomni.param_offload=True \
    actor_rollout_ref.actor.veomni.optimizer_offload=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.01 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.use_torch_compile=False \
    actor_rollout_ref.actor.veomni.data_parallel_size=2 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=$ENGINE \
    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.enable_chunked_prefill=False \
    actor_rollout_ref.rollout.enforce_eager=True \
    actor_rollout_ref.rollout.free_cache_engine=True \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.ref.veomni.param_offload=True \
    algorithm.use_kl_in_reward=False \
    trainer.use_legacy_worker_impl=disable \
    trainer.critic_warmup=0 \
    trainer.logger=console \
    trainer.project_name='verl_grpo_example_geo3k' \
    trainer.experiment_name='qwen2_5_vl_3b_function_rm' \
    trainer.n_gpus_per_node=2 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=-1 \
    trainer.total_epochs=15 "$@"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# veomni actor config, inheriting from trainer/config/actor/actor.yaml
defaults:
  # veomni optimizer config
  - ../optim@optim: veomni

  # veomni engine config
  - ../engine@veomni: veomni

  # base actor config (trainer/config/actor/actor.yaml)
  - actor

  # apply the fields in the current yaml on top of the defaults above
  - _self_

# Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
_target_: verl.workers.config.VeOmniActorConfig

strategy: veomni
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# defaults specify the default config from each component
defaults:

  # veomni optimizer config
  - ../optim@optim: veomni

  # veomni engine config
  - ../engine@veomni: veomni

  # critic config, inheriting from trainer/config/critic/critic.yaml
  - critic

  # apply the fields in the current yaml on top of the defaults above
  - _self_

# Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
_target_: verl.workers.config.VeOmniCriticConfig

strategy: veomni

# model config for the critic
model:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.trainer.config.BaseModelConfig

# seed for data loader; falls back to null when the actor does not define one
data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
# specify the default per-component configs
defaults:
  # <folder_name>@<field_name>.<field_name>: <yaml_file_name>
  # actor_rollout_ref.actor: trainer/config/actor/veomni_actor.yaml
  - actor@actor_rollout_ref.actor: veomni_actor
  # data: trainer/config/data/legacy_data.yaml
  - data@data: legacy_data
  # (Rule-based) Reward manager config.
  - reward_manager@reward_manager
  # Reference model config.
  # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True.
  - ref@actor_rollout_ref.ref: veomni_ref
  # Rollout model config.
  - rollout@actor_rollout_ref.rollout: rollout
  # Model config.
  - model@actor_rollout_ref.model: hf_model
  # Critic model config.
  - critic@critic: veomni_critic
  # Reward model config.
  - reward_model@reward_model: veomni_reward_loop
  # Rollout correction config.
  - algorithm@algorithm.rollout_correction: rollout_correction
  # apply the fields in the current yaml on top of the defaults above
  - _self_

actor_rollout_ref:
  hybrid_engine: True

  # seconds; default is 10 minutes for torch. Set a larger value for long-running
  # operations such as 32B or 72B models using veomni.
  nccl_timeout: 600

  model:
    override_config:
      model_config: {}
      moe_config:
        freeze_moe_router: False

    # Whether to use custom fused kernels (PostProcessing, for memory efficiency)
    use_fused_kernels: False

    trust_remote_code: False

    # Whether to remove padding tokens in inputs during training
    use_remove_padding: false

  rollout:
    quantization: null

    layer_name_map:
      qkv_layer_name: qkv
      gate_proj_layer_name: gate_up

custom_reward_function:
  path: null
  name: compute_score

algorithm:
  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.trainer.config.AlgoConfig
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  norm_adv_by_std_in_grpo: True
  use_kl_in_reward: False
  kl_penalty: kl  # how to estimate kl divergence
  kl_ctrl:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.trainer.config.KLControlConfig
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
  use_pf_ppo: False
  pf_ppo:
    reweight_method: pow  # ["pow", "max_min", "max_random"]
    weight_pow: 2.0

trainer:
  balance_batch: True
  total_epochs: 30
  total_training_steps: null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: ["console", "wandb"]
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  esi_redundant_time: 0

  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto  # or disable, or resume_path if resume_from_path is set
  resume_from_path: null
  del_local_ckpt_after_load: False
  val_before_train: True
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null
  # The timeout for ray worker group to wait for the register center to be ready
  ray_wait_register_center_timeout: 300
  device: cuda
  # Directory for logging rollout data; no dump if null
  rollout_data_dir: null

  # whether to use legacy worker implementation
  # mode: "auto", "enable", or "disable"
  use_legacy_worker_impl: auto

global_profiler:
  _target_: verl.utils.profiler.ProfilerConfig
  tool: null  # choose between nsys, npu, torch, torch_memory
  steps: null  # profile steps
  profile_continuous_steps: False
  save_path: "outputs/profile"  # profiler saving path
  # Specific tool configs, can use +profiler.tool_config.[tool].xxx to config
  global_tool_config:
    # nsys config
    nsys:
      # True for each task has its own database, False for all tasks in one training step share one database.
      discrete: False

      # controller Nvidia Nsight Systems Options. Must set when profile_steps is not None.
      ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
      ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
      controller_nsight_options:
        # Select the API(s) to be traced.
        trace: "cuda,nvtx,cublas,ucx"

        # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
        cuda-memory-usage: "true"

        # CUDA graphs will be traced as a whole
        cuda-graph-trace: "graph"

      # worker Nvidia Nsight Systems Options. Must set when profile_steps is not None.
      worker_nsight_options:
        # Select the API(s) to be traced.
        trace: "cuda,nvtx,cublas,ucx"

        # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
        cuda-memory-usage: "true"

        # CUDA graphs will be traced as a whole
        cuda-graph-trace: "graph"

        # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config.
        capture-range: "cudaProfilerApi"

        # Specify the desired behavior when a capture range ends.
        # In verl we need the torch.cuda.profiler.start/stop pair to repeat n times.
        # Valid values are "repeat-shutdown:n" or null.
        # For normal whole-step profiling, n = len(profile_steps);
        # but for discrete profiling, n = len(profile_steps) * Number(subtasks).
        # Or you can just leave it null and the program will use n = len(profile_steps) * 6;
        capture-range-end: null

        # Send signal to the target application's process group. We let the program exit by itself.
        kill: none

    # enable memory visualization for debugging memory usage
    torch_memory:
      # Maximum number of allocation entries to record
      trace_alloc_max_entries: 100_000
      # The depth of the call stack to capture for each allocation
      stack_depth: 32
      # 'alloc': records only allocation events || 'state': records memory state changes || 'all': records both.
      context: "all"
      # 'python': records Python stacks || 'cpp': records C++ stacks (available in some versions) || 'all': records both.
      stacks: "all"
      # devices, record_context etc.
      kw_args: {}

# configs for TransferQueue
transfer_queue:
  # Whether to enable transfer queue
  enable: False

ray_kwargs:
  ray_init:
    # `None` means using all CPUs, which might cause hang if limited in systems like SLURM.
    # Please set to a number allowed then.
    num_cpus: null
  timeline_json_file: null
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# veomni ref config, inheriting from trainer/config/ref/ref.yaml
defaults:
  # base ref config (trainer/config/ref/ref.yaml)
  - ref

  # veomni engine config
  - ../engine@veomni: veomni

  # apply the fields in the current yaml on top of the defaults above
  - _self_

# NOTE(review): the ref worker reuses the actor's dataclass — confirm this is intentional
_target_: verl.workers.config.VeOmniActorConfig

strategy: veomni

# veomni engine settings; each field mirrors the actor's value when set, otherwise falls
# back to the default given as the second argument of oc.select.
veomni:
  seed: ${oc.select:actor_rollout_ref.actor.veomni.seed,42}
  data_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.data_parallel_size,1}
  data_parallel_replicate_size: ${oc.select:actor_rollout_ref.actor.veomni.data_parallel_replicate_size,1}
  data_parallel_shard_size: ${oc.select:actor_rollout_ref.actor.veomni.data_parallel_shard_size,1}
  tensor_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.tensor_parallel_size,1}
  expert_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.expert_parallel_size,1}
  pipeline_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.pipeline_parallel_size,1}
  context_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.context_parallel_size,1}
  ulysses_parallel_size: ${oc.select:actor_rollout_ref.actor.veomni.ulysses_parallel_size,1}
  param_offload: ${oc.select:actor_rollout_ref.actor.veomni.param_offload,False}
  # the reference model only computes log-probs, never gradients
  forward_only: True
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# reward-loop config for the veomni backend, inheriting from veomni_reward_model
defaults:
  - veomni_reward_model
  # apply the fields in the current yaml on top of the defaults above
  - _self_

use_reward_loop: True
reward_manager: naive
enable: False

# Whether to deploy the model to a separate resource pool.
enable_resource_pool: False
n_gpus_per_node: 8
num_workers: 1
nnodes: 0

model:
  path: ~/models/FsfairX-LLaMA3-RM-v0.1
  external_lib: ${actor_rollout_ref.model.external_lib}
  trust_remote_code: False

rollout:
  _target_: verl.workers.config.RolloutConfig
  # rollout engine name; must be provided by the caller (??? is Hydra's mandatory marker)
  name: ???
  dtype: bfloat16
  gpu_memory_utilization: 0.5
  enforce_eager: true
  cudagraph_capture_sizes: null
  free_cache_engine: true
  data_parallel_size: 1
  expert_parallel_size: 1
  tensor_model_parallel_size: 2
  max_num_batched_tokens: 8192
  max_model_len: null
  max_num_seqs: 1024
  load_format: auto
  engine_kwargs: {}
  limit_images: null
  enable_chunked_prefill: true
  enable_prefix_caching: true
  disable_log_stats: true
  skip_tokenizer_init: false

  prompt_length: 2048
  response_length: 2048

0 commit comments

Comments
 (0)