|
3 | 3 | # |
4 | 4 | # Requirements: |
5 | 5 | # pip install --upgrade transformers==5.3.0 |
6 | | -# mbridge: https://github.com/ISEEKYAN/mbridge |
| 6 | +# mbridge: make sure PR https://github.com/ISEEKYAN/mbridge/pull/98 has been merged |
7 | 7 | # |
8 | 8 | # MTP (Multi-Token Prediction) notes: |
9 | 9 | # - actor_rollout_ref.model.mtp.enable=True enables MTP module |
10 | 10 | # - actor_rollout_ref.model.mtp.enable_train=True enables MTP training loss |
11 | 11 | # - actor_rollout_ref.model.mtp.enable_rollout=True enables speculative decoding in SGLang |
12 | 12 | # |
13 | 13 | # Example parallelism configs for Qwen3.5-35B-A3B: |
14 | | -# 8 GPUs (1 node): train_tp=4 train_pp=2 EP=4 gen_tp=8 |
15 | | -# 16 GPUs (2 nodes): train_tp=4 train_pp=4 EP=4 gen_tp=8 |
| 14 | +# 16 GPUs (2 nodes): train_tp=4 train_pp=2 EP=4 gen_tp=8 |
16 | 15 | # |
17 | 16 | # Run: |
18 | 17 | # NNODES_TRAIN=1 NNODES_ROLLOUT=1 bash grpo_qwen35_35b_megatron_async.sh |
|
115 | 114 |
|
116 | 115 | CHECKPOINT_CONTENTS=['model','hf_model','extra'] |
117 | 116 |
|
118 | | -python -X faulthandler -m verl.experimental.fully_async_policy.fully_async_main \ |
| 117 | +python -m verl.experimental.fully_async_policy.fully_async_main \ |
119 | 118 | --config-path=config \ |
120 | 119 | --config-name='fully_async_ppo_megatron_trainer.yaml' \ |
121 | 120 | data.train_files="${TRAIN_FILE}" \ |
@@ -224,8 +223,6 @@ python -X faulthandler -m verl.experimental.fully_async_policy.fully_async_main |
224 | 223 | actor_rollout_ref.rollout.multi_turn.max_tool_response_length=${max_prompt_length} \ |
225 | 224 | actor_rollout_ref.rollout.agent.num_workers=2 \ |
226 | 225 | actor_rollout_ref.rollout.disable_log_stats=False \ |
227 | | - actor_rollout_ref.rollout.prometheus.enable=True \ |
228 | | - actor_rollout_ref.rollout.prometheus.port=44398 \ |
229 | 226 | actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=1024 \ |
230 | 227 | +actor_rollout_ref.rollout.engine_kwargs.sglang.mamba_scheduler_strategy=no_buffer \ |
231 | 228 | +actor_rollout_ref.rollout.engine_kwargs.sglang.disable_radix_cache=True \ |
|
0 commit comments