
Commit aeccf99

[data] feat: TransferQueue - Unify the return of reward (verl-project#4902)
### What does this PR do?

Unify the return values of the reward function to make the logic clearer.

### Checklist Before Starting

- [x] Search for similar PRs. Paste at least one query link here: ...
- [x] Format the PR title as `[{modules}] {type}: {description}` (this will be checked by the CI).
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward`.
  - If this PR involves multiple modules, separate them with `,`, like `[megatron, fsdp, doc]`.
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`.
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### API and Usage Example

> Demonstrate how the API changes if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [x] Add / update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [x] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [x] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)
- [x] If your PR is related to the `recipe` submodule, please also update the reference to the submodule commit via `git submodule update --remote` or `cd recipe && git pull origin main`.

---

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
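Since the usage-example section above is left as a placeholder, a minimal sketch of the call-site change follows, inferred from the diff below. The `trainer` handle and batch objects are stand-ins for a `RayPPOTrainer` instance and its `DataProto` batches; only `_compute_or_extract_reward` and its keyword arguments come from the patch itself.

```python
# Old API: validation callers received a dict and unpacked it by hand.
result = trainer._compute_or_extract_reward(
    test_batch, reward_fn=trainer.val_reward_fn, return_dict=True
)
reward_tensor = result["reward_tensor"]
reward_extra_info = result.get("reward_extra_info", {})

# New API: validation and training both get the same (tensor, dict) tuple.
reward_tensor, reward_extra_info = trainer._compute_or_extract_reward(
    test_batch, reward_fn=trainer.val_reward_fn, reward_for_val=True
)
reward_tensor, reward_extra_infos_dict = trainer._compute_or_extract_reward(
    batch, reward_fn=trainer.reward_fn, reward_for_val=False
)

# REMAX baseline path is unchanged: sum_reward=True with reward_for_val=False
# returns only the summed 1D reward tensor.
baseline = trainer._compute_or_extract_reward(
    batch, reward_fn=trainer.reward_fn, reward_for_val=False, sum_reward=True
)
```

The dict-shaped return for validation is gone; `reward_for_val=True` now yields the same tuple shape as the training path, which is what lets `_validate` unpack the call directly.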
1 parent a3d6677 commit aeccf99

1 file changed

Lines changed: 20 additions & 30 deletions

File tree

verl/trainer/ppo/ray_trainer.py

```diff
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -538,9 +538,9 @@ def _compute_or_extract_reward(
         self,
         batch: DataProto,
         reward_fn=None,
-        return_dict: bool = False,
+        reward_for_val: bool = False,
         sum_reward: bool = False,
-    ) -> tuple[torch.Tensor, dict[str, Any]] | torch.Tensor | dict[str, Any]:
+    ) -> tuple[torch.Tensor, dict[str, Any]] | torch.Tensor:
         """
         Compute or extract reward from batch.
 
@@ -551,49 +551,39 @@ def _compute_or_extract_reward(
         Args:
             batch: DataProto containing the batch data
             reward_fn: Reward function to use if rm_scores doesn't exist (for training/validation)
-            return_dict: Whether to return dict format with reward_extra_info (for validation)
+            reward_for_val: Whether this is for validation
             sum_reward: Whether to sum reward tensor along last dimension (for REMAX baseline)
 
         Returns:
-            If return_dict=True: dict with "reward_tensor" and "reward_extra_info"
-            If return_dict=False and sum_reward=True: summed reward_tensor (1D tensor)
-            If return_dict=False and sum_reward=False: reward_tensor (2D tensor)
+            If reward_for_val=False and sum_reward=True: summed reward_tensor (1D tensor)
+            Otherwise: tuple of (reward_tensor, reward_extra_infos_dict)
         """
         # When rm_scores already exists, extract it directly (format conversion only)
         if "rm_scores" in batch.batch.keys():
             reward_tensor = batch.batch["rm_scores"]
             if sum_reward:
                 reward_tensor = reward_tensor.sum(dim=-1)
 
-            if return_dict:
-                # Extract reward_extra_info if available
-                reward_extra_keys = batch.meta_info.get("reward_extra_keys", [])
-                reward_extra_info = (
-                    {key: batch.non_tensor_batch[key] for key in reward_extra_keys} if reward_extra_keys else {}
-                )
-                return {"reward_tensor": reward_tensor, "reward_extra_info": reward_extra_info}
-            else:
-                # If sum_reward=True, only return tensor (for REMAX baseline)
-                if sum_reward:
-                    return reward_tensor
-                # Otherwise, return tuple with reward_extra_info (for training loop)
-                reward_extra_keys = batch.meta_info.get("reward_extra_keys", [])
-                reward_extra_infos_dict = (
-                    {key: batch.non_tensor_batch[key] for key in reward_extra_keys} if reward_extra_keys else {}
-                )
-                return reward_tensor, reward_extra_infos_dict
+            if not reward_for_val and sum_reward:
+                return reward_tensor
+
+            reward_extra_keys = batch.meta_info.get("reward_extra_keys", [])
+            reward_extra_infos_dict = (
+                {key: batch.non_tensor_batch[key] for key in reward_extra_keys} if reward_extra_keys else {}
+            )
+            return reward_tensor, reward_extra_infos_dict
 
         # Otherwise, compute reward using reward_fn
         if reward_fn is None:
             raise ValueError("reward_fn must be provided when rm_scores is not available.")
 
-        if return_dict:
+        if reward_for_val:
             result = reward_fn(batch, return_dict=True)
             reward_tensor = result["reward_tensor"]
             if sum_reward:
                 reward_tensor = reward_tensor.sum(dim=-1)
-            reward_extra_info = result.get("reward_extra_info", {})
-            return {"reward_tensor": reward_tensor, "reward_extra_info": reward_extra_info}
+            reward_extra_infos_dict = result.get("reward_extra_info", {})
+            return reward_tensor, reward_extra_infos_dict
         else:
             reward_tensor, reward_extra_infos_dict = compute_reward(batch, reward_fn)
             if sum_reward:
@@ -695,13 +685,13 @@ def _validate(self, merged: bool = False):
             sample_uids.extend(test_batch.non_tensor_batch["uid"])
 
             # evaluate using reward_function
-            result = self._compute_or_extract_reward(test_batch, reward_fn=self.val_reward_fn, return_dict=True)
-            reward_tensor = result["reward_tensor"]
+            reward_tensor, reward_extra_info = self._compute_or_extract_reward(
+                test_batch, reward_fn=self.val_reward_fn, reward_for_val=True
+            )
             scores = reward_tensor.sum(-1).cpu().tolist()
             sample_scores.extend(scores)
 
             reward_extra_infos_dict["reward"].extend(scores)
-            reward_extra_info = result.get("reward_extra_info", {})
             for key, values in reward_extra_info.items():
                 if key not in reward_extra_infos_dict:
                     reward_extra_infos_dict[key] = []
@@ -1525,7 +1515,7 @@ def fit(self):
                     )
                 else:
                     reward_tensor, reward_extra_infos_dict = self._compute_or_extract_reward(
-                        batch, reward_fn=self.reward_fn, return_dict=False
+                        batch, reward_fn=self.reward_fn, reward_for_val=False
                    )
 
                 # Operating Mode Selection:
```
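As a complement to the patch, here is a self-contained sketch of the two return shapes the new signature allows; the tensor values and the `acc` key are invented for illustration, and the extraction mirrors the `reward_extra_keys` logic in the diff above.

```python
import torch

# Stand-in for batch.batch["rm_scores"]: one reward per response token,
# shape (batch_size, response_length).
rm_scores = torch.zeros(4, 16)
rm_scores[:, -1] = torch.tensor([1.0, 0.0, 1.0, 0.5])  # terminal rewards

# Default path: 2D reward tensor plus the extra-info dict built from
# batch.meta_info["reward_extra_keys"] and batch.non_tensor_batch.
meta_info = {"reward_extra_keys": ["acc"]}
non_tensor_batch = {"acc": [1, 0, 1, 1]}
reward_extra_keys = meta_info.get("reward_extra_keys", [])
reward_extra_infos_dict = (
    {key: non_tensor_batch[key] for key in reward_extra_keys} if reward_extra_keys else {}
)
assert rm_scores.dim() == 2 and reward_extra_infos_dict == {"acc": [1, 0, 1, 1]}

# REMAX path (reward_for_val=False, sum_reward=True): summed 1D tensor only.
baseline = rm_scores.sum(dim=-1)
assert baseline.tolist() == [1.0, 0.0, 1.0, 0.5]
```

Collapsing the `return_dict` branch means every consumer except the REMAX baseline handles exactly one return shape, which is the simplification the PR title describes.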
