
Commit 7290aef
[ci] chore: migrate all rm related ci to reward loop (verl-project#4520)
### What does this PR do?

- Migrate all Reward-Model-related CI to Reward Loop (verified).
- Set the naive router as the default for the reward loop.

### Checklist Before Starting

- [ ] Search for similar PRs. Paste at least one query link here: ...
- [ ] Format the PR title as `[{modules}] {type}: {description}` (this will be checked by the CI).
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`.
  - If this PR involves multiple modules, separate them with `,`, e.g. `[megatron, fsdp, doc]`.
  - `{type}` is one of `feat`, `fix`, `refactor`, `chore`, `test`.
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), prepend `[BREAKING]` to the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results such as training curve plots or evaluation results.

### API and Usage Example

> Demonstrate how the API changes, if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review; otherwise the reviewer may deprioritize this PR.

- [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)
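At a glance, every affected CI script swaps the legacy FSDP reward-model flags for the reward-loop rollout configuration. Below is a minimal before/after sketch using only flags that appear in this commit's diffs; `COMMON_FLAGS` is a hypothetical stand-in for each script's unchanged data/actor/critic/trainer overrides, and exact prompt/response lengths and TP sizes vary per script.

```bash
COMMON_FLAGS=()  # hypothetical: fill with the script's shared overrides

# Before: legacy FSDP reward-model worker (removed by this PR)
python3 -m verl.trainer.main_ppo "${COMMON_FLAGS[@]}" \
    reward_model.enable=True \
    reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
    reward_model.model.fsdp_config.param_offload=True \
    reward_model.micro_batch_size_per_gpu=32

# After: reward loop, serving the RM through a vLLM rollout
python3 -m verl.trainer.main_ppo "${COMMON_FLAGS[@]}" \
    reward_model.enable=True \
    reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
    reward_model.use_reward_loop=True \
    reward_model.rollout.name=vllm \
    reward_model.rollout.gpu_memory_utilization=0.8 \
    reward_model.rollout.tensor_model_parallel_size=1 \
    reward_model.rollout.prompt_length=8192 \
    reward_model.rollout.response_length=4096 \
    reward_model.num_workers=8
```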
1 parent f7c90d6 commit 7290aef

24 files changed (+222, -102 lines)

examples/grpo_trainer/run_mistral13b_skyworkrm_hhrlhf.sh

Lines changed: 7 additions & 3 deletions
```diff
@@ -34,10 +34,14 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     reward_model.enable=True \
-    reward_model.model.fsdp_config.param_offload=True \
     reward_model.model.path=Skywork/Skywork-Reward-Llama-3.1-8B \
-    reward_model.model.input_tokenizer=mistralai/Mistral-Nemo-Instruct-2407 \
-    reward_model.micro_batch_size_per_gpu=4 \
+    reward_model.use_reward_loop=True \
+    reward_model.rollout.name=vllm \
+    reward_model.rollout.gpu_memory_utilization=0.8 \
+    reward_model.rollout.tensor_model_parallel_size=1 \
+    reward_model.rollout.prompt_length=8192 \
+    reward_model.rollout.response_length=4096 \
+    reward_model.num_workers=8 \
     algorithm.use_kl_in_reward=False \
     trainer.logger='["console","wandb"]' \
     trainer.val_before_train=False \
```

examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh

Lines changed: 7 additions & 3 deletions
```diff
@@ -25,10 +25,14 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat
     critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
     critic.ppo_micro_batch_size_per_gpu=4 \
     reward_model.enable=True \
-    reward_model.megatron.tensor_model_parallel_size=4 \
     reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \
-    reward_model.micro_batch_size_per_gpu=4 \
-    reward_model.param_offload=False \
+    reward_model.use_reward_loop=True \
+    reward_model.rollout.name=vllm \
+    reward_model.rollout.gpu_memory_utilization=0.8 \
+    reward_model.rollout.tensor_model_parallel_size=4 \
+    reward_model.rollout.prompt_length=256 \
+    reward_model.rollout.response_length=128 \
+    reward_model.num_workers=8 \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
     trainer.logger='["console","wandb"]' \
```

examples/ppo_trainer/run_qwen2-7b_rm.sh

Lines changed: 7 additions & 3 deletions
```diff
@@ -55,9 +55,13 @@ python3 -m verl.trainer.main_ppo \
     critic.model.fsdp_config.optimizer_offload=False \
     reward_model.enable=True \
     reward_model.model.path="$HOME/models/FsfairX-LLaMA3-RM-v0.1" \
-    reward_model.model.use_remove_padding=True \
-    reward_model.model.fsdp_config.param_offload=True \
-    reward_model.micro_batch_size_per_gpu=32 \
+    reward_model.use_reward_loop=True \
+    reward_model.rollout.name=vllm \
+    reward_model.rollout.gpu_memory_utilization=0.8 \
+    reward_model.rollout.tensor_model_parallel_size=1 \
+    reward_model.rollout.prompt_length=2048 \
+    reward_model.rollout.response_length=1024 \
+    reward_model.num_workers=8 \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
     trainer.logger='["console","wandb"]' \
```

examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh

Lines changed: 7 additions & 5 deletions
```diff
@@ -42,11 +42,13 @@ python3 -m verl.trainer.main_ppo \
     critic.model.fsdp_config.optimizer_offload=False \
     reward_model.enable=True \
     reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\
-    reward_model.model.use_remove_padding=True \
-    reward_model.model.fsdp_config.param_offload=True \
-    reward_model.micro_batch_size_per_gpu=32 \
-    reward_model.use_dynamic_bsz=True \
-    reward_model.forward_max_token_len_per_gpu=98304 \
+    reward_model.use_reward_loop=True \
+    reward_model.rollout.name=vllm \
+    reward_model.rollout.gpu_memory_utilization=0.8 \
+    reward_model.rollout.tensor_model_parallel_size=1 \
+    reward_model.rollout.prompt_length=8192 \
+    reward_model.rollout.response_length=4096 \
+    reward_model.num_workers=8 \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
     trainer.logger='["console","wandb"]' \
```

examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh

Lines changed: 8 additions & 6 deletions
```diff
@@ -45,12 +45,14 @@ python3 -m verl.trainer.main_ppo \
     critic.model.fsdp_config.param_offload=False \
     critic.model.fsdp_config.optimizer_offload=False \
     reward_model.enable=True \
-    reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\
-    reward_model.model.use_remove_padding=True \
-    reward_model.model.fsdp_config.param_offload=True \
-    reward_model.micro_batch_size_per_gpu=32 \
-    reward_model.use_dynamic_bsz=True \
-    reward_model.forward_max_token_len_per_gpu=98304 \
+    reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
+    reward_model.use_reward_loop=True \
+    reward_model.rollout.name=vllm \
+    reward_model.rollout.gpu_memory_utilization=0.8 \
+    reward_model.rollout.tensor_model_parallel_size=1 \
+    reward_model.rollout.prompt_length=8192 \
+    reward_model.rollout.response_length=4096 \
+    reward_model.num_workers=8 \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
     trainer.logger='["console","wandb"]' \
```

examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh

Lines changed: 7 additions & 8 deletions
```diff
@@ -55,14 +55,13 @@ python3 -m verl.trainer.main_ppo \
     critic.profiler.all_ranks=$PROFILE_RANKS_ALL \
     reward_model.enable=True \
     reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\
-    reward_model.model.use_remove_padding=True \
-    reward_model.model.fsdp_config.param_offload=True \
-    reward_model.micro_batch_size_per_gpu=32 \
-    reward_model.use_dynamic_bsz=True \
-    reward_model.forward_max_token_len_per_gpu=98304 \
-    reward_model.profiler.enable=True \
-    reward_model.profiler.ranks=$PROFILE_RANKS \
-    reward_model.profiler.all_ranks=$PROFILE_RANKS_ALL \
+    reward_model.use_reward_loop=True \
+    reward_model.rollout.name=vllm \
+    reward_model.rollout.gpu_memory_utilization=0.8 \
+    reward_model.rollout.tensor_model_parallel_size=1 \
+    reward_model.rollout.prompt_length=8192 \
+    reward_model.rollout.response_length=4096 \
+    reward_model.num_workers=8 \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
     trainer.logger='["console","wandb"]' \
```
Lines changed: 63 additions & 0 deletions
```diff
@@ -0,0 +1,63 @@
+# download datasets and models
+# python3 examples/data_preprocess/gsm8k.py
+# python3 examples/data_preprocess/math_dataset.py
+# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B
+# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
+
+gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+math_train_path=$HOME/data/math/train.parquet
+math_test_path=$HOME/data/math/test.parquet
+
+train_files="['$gsm8k_train_path', '$math_train_path']"
+test_files="['$gsm8k_test_path', '$math_test_path']"
+
+python3 -m verl.trainer.main_ppo \
+    algorithm.adv_estimator=gae \
+    data.train_files="$train_files" \
+    data.val_files="$test_files" \
+    data.train_batch_size=1024 \
+    data.max_prompt_length=1024 \
+    data.max_response_length=2048 \
+    data.filter_overlong_prompts=True \
+    data.truncation='error' \
+    data.return_raw_chat=True \
+    actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=False \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+    critic.optim.lr=1e-5 \
+    critic.model.use_remove_padding=True \
+    critic.optim.lr_warmup_steps_ratio=0.05 \
+    critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+    critic.model.enable_gradient_checkpointing=True \
+    critic.ppo_micro_batch_size_per_gpu=32 \
+    critic.model.fsdp_config.param_offload=False \
+    critic.model.fsdp_config.optimizer_offload=False \
+    reward_model.enable=True \
+    reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
+    reward_model.use_reward_loop=False \
+    reward_model.model.use_remove_padding=True \
+    reward_model.model.fsdp_config.param_offload=True \
+    reward_model.micro_batch_size_per_gpu=32 \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console","wandb"]' \
+    trainer.project_name='verl_test_qwen25_rm' \
+    trainer.val_before_train=True \
+    trainer.experiment_name='legacy_fsdp_reward_model' \
+    trainer.n_gpus_per_node=8 \
+    trainer.nnodes=1 \
+    trainer.save_freq=-1 \
+    trainer.test_freq=10 \
+    trainer.total_epochs=15 $@
```
Lines changed: 66 additions & 0 deletions
```diff
@@ -0,0 +1,66 @@
+# download datasets and models
+# python3 examples/data_preprocess/gsm8k.py
+# python3 examples/data_preprocess/math_dataset.py
+# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B
+# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
+
+gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+math_train_path=$HOME/data/math/train.parquet
+math_test_path=$HOME/data/math/test.parquet
+
+train_files="['$gsm8k_train_path', '$math_train_path']"
+test_files="['$gsm8k_test_path', '$math_test_path']"
+
+python3 -m verl.trainer.main_ppo \
+    algorithm.adv_estimator=gae \
+    data.train_files="$train_files" \
+    data.val_files="$test_files" \
+    data.train_batch_size=1024 \
+    data.max_prompt_length=1024 \
+    data.max_response_length=2048 \
+    data.filter_overlong_prompts=True \
+    data.truncation='error' \
+    data.return_raw_chat=True \
+    actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+    actor_rollout_ref.actor.use_kl_loss=False \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=False \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+    critic.optim.lr=1e-5 \
+    critic.model.use_remove_padding=True \
+    critic.optim.lr_warmup_steps_ratio=0.05 \
+    critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+    critic.model.enable_gradient_checkpointing=True \
+    critic.ppo_micro_batch_size_per_gpu=32 \
+    critic.model.fsdp_config.param_offload=False \
+    critic.model.fsdp_config.optimizer_offload=False \
+    reward_model.enable=True \
+    reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
+    reward_model.use_reward_loop=True \
+    reward_model.rollout.name=vllm \
+    reward_model.rollout.gpu_memory_utilization=0.8 \
+    reward_model.rollout.tensor_model_parallel_size=1 \
+    reward_model.rollout.prompt_length=4096 \
+    reward_model.rollout.response_length=4096 \
+    reward_model.num_workers=8 \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console","wandb"]' \
+    trainer.project_name='verl_test_qwen25_rm' \
+    trainer.val_before_train=False \
+    trainer.experiment_name='reward_loop_colocate_reward_model' \
+    trainer.n_gpus_per_node=8 \
+    trainer.nnodes=1 \
+    trainer.save_freq=-1 \
+    trainer.test_freq=10 \
+    trainer.total_epochs=15 $@
```
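The two new CI scripts above share the same PPO setup and differ mainly in the reward-model block (plus `trainer.val_before_train` and the experiment name), so CI exercises both code paths against the same workload. Below is a hedged sketch of just that toggle, with flag values copied from the two scripts; `COMMON_FLAGS` is a hypothetical stand-in for the shared data/actor/critic/trainer overrides.

```bash
COMMON_FLAGS=()  # hypothetical: fill with the shared overrides from either script

# Path 1: legacy FSDP reward-model worker
legacy_rm=(
    reward_model.use_reward_loop=False
    reward_model.model.use_remove_padding=True
    reward_model.model.fsdp_config.param_offload=True
    reward_model.micro_batch_size_per_gpu=32
)

# Path 2: reward loop with a colocated vLLM rollout
reward_loop_rm=(
    reward_model.use_reward_loop=True
    reward_model.rollout.name=vllm
    reward_model.rollout.gpu_memory_utilization=0.8
    reward_model.rollout.tensor_model_parallel_size=1
    reward_model.rollout.prompt_length=4096
    reward_model.rollout.response_length=4096
    reward_model.num_workers=8
)

python3 -m verl.trainer.main_ppo "${COMMON_FLAGS[@]}" "${legacy_rm[@]}"       # legacy path
python3 -m verl.trainer.main_ppo "${COMMON_FLAGS[@]}" "${reward_loop_rm[@]}"  # reward-loop path
```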

recipe/fapo/README.md

Lines changed: 9 additions & 0 deletions
````diff
@@ -78,3 +78,12 @@ bash recipe/fapo/run_fapo_32b.sh # 32b fapo model
 We implement RewardLoop to enable efficient and flexible reward computation.
 The core implementation can be found in `verl/experimental/reward/`.
 Refer to [this official document](https://verl.readthedocs.io/en/latest/advance/reward_loop.html) for more implementation details.
+
+```bibtex
+@article{ding2025fapo,
+  title={FAPO: Flawed-Aware Policy Optimization for Efficient and Reliable Reasoning},
+  author={Ding, Yuyang and Zhang, Chi and Li, Juntao and Lin, Haibin and Liu, Xin and Zhang, Min},
+  journal={arXiv preprint arXiv:2510.22543},
+  year={2025}
+}
+```
````

recipe/fapo/run_baseline_32b.sh

Lines changed: 0 additions & 5 deletions
```diff
@@ -53,15 +53,10 @@ offload=True
 gen_tp=4
 fsdp_size=32
 
-PROJECT_DIR="$(pwd)"
-CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config"
-
 ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
     --address "${RAY_ADDRESS}" \
     --working-dir "${WORKING_DIR}" \
     -- python3 -m verl.trainer.main_ppo \
-    --config-path $CONFIG_PATH \
-    --config-name rm_config.yaml \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
```
