# specify the default per-component configs
defaults:
  # <folder_name>@<field_name>.<field_name>: <yaml_file_name>
  # actor_rollout_ref.actor: trainer/config/actor/veomni_actor.yaml
  - actor@actor_rollout_ref.actor: veomni_actor
  # data: trainer/config/data/legacy_data.yaml
  - data@data: legacy_data
  # (Rule-based) Reward manager config.
  - reward_manager@reward_manager
  # load the reference default config, then apply the fields in the current yaml
  # Reference model config.
  # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True.
  - ref@actor_rollout_ref.ref: veomni_ref
  # Rollout model config.
  - rollout@actor_rollout_ref.rollout: rollout
  # Model config.
  - model@actor_rollout_ref.model: hf_model
  # Critic model config.
  - critic@critic: veomni_critic
  # Reward model config.
  - reward_model@reward_model: veomni_reward_loop
  # Rollout correction config.
  - algorithm@algorithm.rollout_correction: rollout_correction
  - _self_

actor_rollout_ref:
  hybrid_engine: True

  nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using veomni

  model:
    override_config:
      model_config: {}
      moe_config:
        freeze_moe_router: False

    use_fused_kernels: False # Whether to use custom fused kernels (PostProcessing, for memory efficiency)

    trust_remote_code: False

    # Whether to remove padding tokens in inputs during training
    use_remove_padding: false

    # LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning
    lora:
      # LoRA type: "lora", "vlm_lora", "canonical_lora", or "dora"
      type: lora

      # LoRA rank (dimension of the low-rank projection space). Set to 0 to disable LoRA
      rank: 0 # typical values: 8, 16, 32, 64

      # Weighting factor for the low-rank projection. Defaults to 32
      alpha: 32

      # Dropout rate for the low-rank projection. Defaults to 0.0
      dropout: 0.0

      # A list of module names to apply LoRA to.
      # For fused LoRA, defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'].
      # For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"]
      # - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention
      # - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention
      # - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP
      # - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP
      # Target modules can also contain wildcards. For example, you can specify
      # target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers
      target_modules:
        - linear_qkv
        - linear_proj
        - linear_fc1
        - linear_fc2

      # A list of module names not to apply LoRA to. It will match all nn.Linear & nn.Linear-adjacent modules whose name
      # does not match any string in exclude_modules. If used, will require target_modules to be empty list or None
      exclude_modules: []

      # Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre'
      dropout_position: pre

      # Initialization method for the low-rank matrix A. Defaults to "xavier".
      lora_A_init_method: xavier

      # Initialization method for the low-rank matrix B. Defaults to "zero".
      lora_B_init_method: zero

      # Enables the experimental All-to-All (A2A) communication strategy. Defaults to False
      a2a_experimental: False

      # Parameter data type for LoRA weights. Defaults to null, which will use the model's dtype.
      dtype: null

      # Path to pre-trained LoRA adapter weights (null to train from scratch)
      adapter_path: null

      # VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen.
      # For example, a common finetuning workload for multimodal models is to apply adapters to the language model and fully
      # finetune the vision model.
      freeze_vision_model: True
      freeze_vision_projection: True
      freeze_language_model: True

  rollout:
    quantization: null

    layer_name_map:
      qkv_layer_name: qkv
      gate_proj_layer_name: gate_up

custom_reward_function:
  path: null
  name: compute_score

algorithm:
  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.trainer.config.AlgoConfig
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  norm_adv_by_std_in_grpo: True
  use_kl_in_reward: False
  kl_penalty: kl # how to estimate kl divergence
  kl_ctrl:
    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.trainer.config.KLControlConfig
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
  use_pf_ppo: False
  pf_ppo:
    reweight_method: pow # ["pow", "max_min", "max_random"]
    weight_pow: 2.0

trainer:
  balance_batch: True
  total_epochs: 30
  total_training_steps: null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: ["console", "wandb"]
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  esi_redundant_time: 0

  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or disable or resume_path if resume_from_path is set
  resume_from_path: null
  del_local_ckpt_after_load: False
  val_before_train: True
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  max_actor_ckpt_to_keep: null
  max_critic_ckpt_to_keep: null
  # The timeout for ray worker group to wait for the register center to be ready
  ray_wait_register_center_timeout: 300
  device: cuda
  # Directory for logging rollout data; no dump if null
  rollout_data_dir: null

  # whether to use legacy worker implementation
  # mode: "auto", "enable", or "disable"
  use_legacy_worker_impl: auto

global_profiler:
  _target_: verl.utils.profiler.ProfilerConfig
  tool: null # choose between nsys, npu, torch, torch_memory
  steps: null # profile steps
  profile_continuous_steps: False
  save_path: "outputs/profile" # profiler saving path
  # Specific tool configs, can use +profiler.tool_config.[tool].xxx to config
  global_tool_config:
    # nsys config
    nsys:
      # True for each task has its own database, False for all tasks in one training step share one database.
      discrete: False

      # controller Nvidia Nsight Systems Options. Must set when profile_steps is not None.
      ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
      ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
      controller_nsight_options:
        # Select the API(s) to be traced.
        trace: "cuda,nvtx,cublas,ucx"

        # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
        cuda-memory-usage: "true"

        # CUDA graphs will be traced as a whole
        cuda-graph-trace: "graph"

      # worker Nvidia Nsight Systems Options. Must set when profile_steps is not None.
      worker_nsight_options:
        # Select the API(s) to be traced.
        trace: "cuda,nvtx,cublas,ucx"

        # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
        cuda-memory-usage: "true"

        # CUDA graphs will be traced as a whole
        cuda-graph-trace: "graph"

        # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config.
        capture-range: "cudaProfilerApi"

        # Specify the desired behavior when a capture range ends.
        # In verl we need the torch.cuda.profiler.start/stop pair to repeat n times.
        # valid values are "repeat-shutdown:n" or null.
        # For normal whole step profiling, n = len(profile_steps);
        # but for discrete profiling, n = len(profile_steps) * Number(subtasks).
        # Or you can just leave it null and the program will use n = len(profile_steps) * 6;
        capture-range-end: null

        # Send signal to the target application's process group. We let the program exit by itself.
        kill: none

    # enable memory visualization for debugging memory usage
    torch_memory:
      # Maximum number of allocation entries to record
      trace_alloc_max_entries: 100_000
      # The depth of the call stack to capture for each allocation
      stack_depth: 32
      # 'alloc': records only allocation events || 'state': records memory state changes || 'all': records both.
      context: "all"
      # 'python': records Python stacks || 'cpp': records C++ stacks (available in some versions) || 'all': records both.
      stacks: "all"
      # devices, record_context etc.
      kw_args: {}

# configs for TransferQueue
transfer_queue:
  # Whether to enable transfer queue
  enable: False

ray_kwargs:
  ray_init:
    num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then.
    timeline_json_file: null