diff --git a/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh b/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh
new file mode 100644
index 00000000000..b8122ed8bf4
--- /dev/null
+++ b/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh
@@ -0,0 +1,182 @@
+#!/bin/bash
+set -xeuo pipefail
+mkdir -p logs
+
+# Project Configuration
+project_name='GRPO-Qwen2.5-32B-BASE-SGLang'
+exp_name='GRPO-Qwen2.5-32B-BASE-FSDP-SGLang'
+
+# Required environment variables
+export HCCL_CONNECT_TIMEOUT=1500
+export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
+export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
+
+export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+# If each node has 16 NPUs, set ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+export DISABLE_L2_CACHE=1
+export TASK_QUEUE_ENABLE=1
+
+# Node Info
+NNODES=${NNODES:-2}
+NPUS_PER_NODE=${NPUS_PER_NODE:-8}
+
+# Model Weight Paths
+MODEL_PATH=Qwen/Qwen2.5-32B
+RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+
+# File System Paths
+TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/datasets/deepscaler/train.parquet"}
+TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/datasets/deepscaler/test.parquet"}
+
+# Sequence Length Configuration
+max_prompt_length=$((1024 * 2))
+max_response_length=$((1024 * 8))
+
+# Training Batch Configuration
+train_prompt_bsz=32
+train_prompt_mini_bsz=32
+n_resp_per_prompt=8
+
+# Algorithm Configuration
+adv_estimator=grpo
+use_kl_in_reward=False
+kl_coef=0.0
+use_kl_loss=True
+kl_loss_coef=0.001
+
+# Performance and Memory Management Configuration
+all_offload=True
+use_dynamic_bsz=False
+
+# SGLang Configuration
+gen_tp=4
+gen_sp=1
+gen_dp=1
+gen_ep=1
+gpu_memory_utilization=0.5
+
+# Data Configuration
+DATA_CONFIG=(
+    # File Paths
+    data.train_files="${TRAIN_FILE}"
+    data.val_files="${TEST_FILE}"
+    # Data Structure
+    data.prompt_key=prompt
+    # Batch and Length Configuration
+    data.train_batch_size=${train_prompt_bsz}
+    data.max_prompt_length=${max_prompt_length}
+    data.max_response_length=${max_response_length}
+    # Preprocessing
+    data.filter_overlong_prompts=False
+    data.truncation='left'
+)
+
+# Model Configuration
+MODEL_CONFIG=(
+    # Model Path
+    actor_rollout_ref.model.path="${MODEL_PATH}"
+    # Model Processing
+    actor_rollout_ref.model.use_remove_padding=True
+    actor_rollout_ref.model.enable_gradient_checkpointing=True
+)
+
+# Reinforcement Learning Algorithm Configuration
+ALGORITHM_CONFIG=(
+    # Advantage Estimation
+    algorithm.adv_estimator=${adv_estimator}
+    # KL Divergence Control
+    algorithm.use_kl_in_reward=${use_kl_in_reward}
+)
+
+# Actor Model Configuration
+ACTOR_CONFIG=(
+    # Core Runtime Settings
+    actor_rollout_ref.actor.use_torch_compile=False
+    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
+    # Loss Function Configuration
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
+    actor_rollout_ref.actor.entropy_coeff=0
+    # PPO Training Parameters
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
+    # Optimizer Settings
+    actor_rollout_ref.actor.optim.lr=1e-6
+    actor_rollout_ref.actor.fsdp_config.param_offload=${all_offload}
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${all_offload}
+)
+
+# Reference Model Configuration
+REF_CONFIG=(
+    # Core Runtime Settings
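+    # Note (assumption, not from the original script): torch.compile is kept
+    # off for the reference model, as it is for the actor above, presumably
+    # because torch.compile support on Ascend NPU is still limited.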
+    actor_rollout_ref.ref.use_torch_compile=False
+    # Log Probability Inference
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+    # Memory Optimization
+    actor_rollout_ref.ref.fsdp_config.param_offload=${all_offload}
+)
+
+# Rollout Configuration
+ROLLOUT_CONFIG=(
+    # Rollout Engine
+    actor_rollout_ref.rollout.name=sglang
+    actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend"
+    # Generation Parameters
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt}
+    actor_rollout_ref.rollout.top_p=1.0
+    actor_rollout_ref.rollout.top_k=-1
+    actor_rollout_ref.rollout.temperature=1.0
+    # Log Probability Inference
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+    # Memory Management
+    actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
+    actor_rollout_ref.rollout.data_parallel_size=${gen_dp}
+    actor_rollout_ref.rollout.expert_parallel_size=${gen_ep}
+    actor_rollout_ref.rollout.enable_chunked_prefill=False
+    actor_rollout_ref.rollout.multi_stage_wake_up=True
+    # Validation Generation
+    actor_rollout_ref.rollout.val_kwargs.n=1
+    actor_rollout_ref.rollout.val_kwargs.do_sample=True
+    actor_rollout_ref.rollout.val_kwargs.top_p=1.0
+    actor_rollout_ref.rollout.val_kwargs.top_k=-1
+    actor_rollout_ref.rollout.val_kwargs.temperature=1.0
+    actor_rollout_ref.nccl_timeout=1800
+)
+
+# Trainer Configuration
+TRAINER_CONFIG=(
+    trainer.logger='["console"]'
+    trainer.project_name="${project_name}"
+    trainer.experiment_name="${exp_name}"
+    trainer.nnodes="${NNODES}"
+    trainer.n_gpus_per_node="${NPUS_PER_NODE}"
+    trainer.total_epochs=5
+    trainer.val_before_train=False
+    trainer.test_freq=-1
+    trainer.save_freq=100
+    trainer.default_local_dir="${CKPTS_DIR}"
+    trainer.critic_warmup=0
+)
+
+# Main GRPO Training Command
+# Register the custom reward function for the DeepScaler dataset here
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_trainer.yaml' \
+    custom_reward_function.path=recipe/r1_ascend/deepscaler.py \
+    custom_reward_function.name=compute_score \
+    "${DATA_CONFIG[@]}" \
+    "${MODEL_CONFIG[@]}" \
+    "${ACTOR_CONFIG[@]}" \
+    "${REF_CONFIG[@]}" \
+    "${ROLLOUT_CONFIG[@]}" \
+    "${ALGORITHM_CONFIG[@]}" \
+    "${TRAINER_CONFIG[@]}" \
+    "$@" | tee logs/run_qwen2_5-32b_grpo_fsdp_sglang_npu.log
\ No newline at end of file
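
For reference, a minimal launch sketch for the script's default two-node setup. This is an assumption about usage, not part of the diff: the Ray port and the HEAD_NODE_IP placeholder are illustrative, and a Ray cluster must already span both nodes before the training command runs.

    # On the head node (port 6379 is an illustrative choice; any free port works):
    ray start --head --port=6379
    # On the second node, join the cluster (HEAD_NODE_IP is a placeholder):
    ray start --address="${HEAD_NODE_IP}:6379"
    # Then launch the training script from the head node:
    NNODES=2 NPUS_PER_NODE=8 bash examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh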