Commit 93f1831
[reward] feat: add compute_score timing metrics to agent loop (#5971)
### What does this PR do?

Add a `compute_score` timing metric to `AgentLoopMetrics` in the agent loop to track the time spent on reward score computation (`_compute_score`). This helps identify reward computation bottlenecks during training.

**Changes:**

1. Added a `compute_score: float = 0.0` field to `AgentLoopMetrics`.
2. Instrumented `_compute_score()` with `simple_timer` to measure reward computation time per sample.
3. Added `agent_loop/compute_score/min|max|mean` and `agent_loop/slowest/compute_score` to the `_performance_metrics` aggregation.

This follows the same pattern as the existing `generate_sequences` and `tool_calls` timing metrics.

### Checklist Before Starting

- [x] Search for similar PRs. Paste at least one query link here: https://github.com/volcengine/verl/pulls?q=is%3Apr+compute_score+metrics
- [x] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)

### Test

The change is backward-compatible:

- `AgentLoopMetrics.compute_score` defaults to `0.0`, so existing agent loops that do not use async reward will report `0.0` without breaking.
- When `reward_loop_worker_handles` is not `None`, `_compute_score` measures the full reward computation call and writes the elapsed time into `output.metrics.compute_score`.
- The `_performance_metrics` method safely aggregates `compute_score` from all samples, consistent with how `generate_sequences` and `tool_calls` are handled.

### API and Usage Example

No API changes.
The new metric is automatically reported in the training logs alongside existing metrics:

```
agent_loop/compute_score/min: 0.12
agent_loop/compute_score/max: 2.34
agent_loop/compute_score/mean: 0.78
agent_loop/slowest/compute_score: 2.34
```

### Design & Code Changes

- `verl/experimental/agent_loop/agent_loop.py`:
  - `AgentLoopMetrics`: added `compute_score` field
  - `AgentLoopWorker._compute_score()`: wrapped reward computation with `simple_timer`
  - `AgentLoopManager._performance_metrics()`: added min/max/mean/slowest aggregation for `compute_score`

### Checklist Before Submitting

- [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: The timing metric follows the exact same pattern as the existing `generate_sequences` and `tool_calls` metrics. Testing requires a GPU + reward model setup, which is covered by existing integration tests in `tests/experimental/reward_loop/`.
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ).
- [ ] If your PR is related to the `recipe` submodule, please also update the reference to the submodule commit via `git submodule update --remote` or `cd recipe && git pull origin main`.
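The `simple_timer` helper used for the instrumentation lives in `verl.utils.profiler`; its observed interface (a context manager that writes elapsed seconds into a caller-supplied dict under a given key) can be sketched minimally as follows. This is an illustrative re-implementation of the pattern, not verl's actual code, which may differ in detail:

```python
import time
from contextlib import contextmanager


@contextmanager
def simple_timer(name: str, timing: dict):
    """Record elapsed wall-clock seconds for the wrapped block into timing[name].

    Minimal sketch of the timing pattern this PR relies on; verl's real
    helper in verl.utils.profiler may differ in implementation detail.
    """
    start = time.perf_counter()
    try:
        yield
    finally:
        timing[name] = time.perf_counter() - start


# Usage mirroring the instrumented _compute_score():
timing = {}
with simple_timer("compute_score", timing):
    time.sleep(0.05)  # stand-in for the reward computation call
print(f"compute_score took {timing['compute_score']:.3f}s")
```

Because the elapsed time is recorded in the `finally` clause, the metric is populated even if the wrapped reward call raises.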
--------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent fba0939 commit 93f1831

File tree

1 file changed (+35, −25 lines)

verl/experimental/agent_loop/agent_loop.py

Lines changed: 35 additions & 25 deletions
```diff
@@ -40,6 +40,7 @@
 from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.dataset.rl_dataset import RLHFDataset, get_dataset_class
 from verl.utils.model import compute_position_id_with_mask
+from verl.utils.profiler import simple_timer
 from verl.utils.ray_utils import auto_await, get_event_loop
 from verl.utils.rollout_trace import (
     RolloutTraceConfig,
@@ -180,6 +181,7 @@ class AgentLoopMetrics(BaseModel):

     generate_sequences: float = 0.0
     tool_calls: float = 0.0
+    compute_score: float = 0.0
     num_preempted: int = -1  # -1 means not available


@@ -857,30 +859,33 @@ async def _compute_score(self, output, prompts, responses, attention_mask, input
         enable_async_reward = self.reward_loop_worker_handles is not None

         if output.reward_score is None and enable_async_reward:
-            batch = TensorDict(
-                {
-                    "prompts": prompts,  # [1, prompt_length]
-                    "responses": responses,  # [1, response_length]
-                    "attention_mask": attention_mask,  # [1, prompt_length + response_length]
-                    "input_ids": input_ids,  # [1, prompt_length + response_length]
-                    "position_ids": position_ids,
-                },
-                batch_size=1,
-            )
-            non_tensor_batch = {
-                **{k: np.array([v]) for k, v in kwargs.items()},
-                "__num_turns__": np.array([output.num_turns]),
-                "tool_extra_fields": np.array([output.extra_fields], dtype=object),
-            }
-
-            data = DataProto(
-                batch=batch,
-                non_tensor_batch=non_tensor_batch,
-            )
-            selected_reward_loop_worker_handle = random.choice(self.reward_loop_worker_handles)
-            result = await selected_reward_loop_worker_handle.compute_score.remote(data)
-            output.reward_score = result["reward_score"]
-            output.extra_fields["reward_extra_info"] = result["reward_extra_info"]
+            timing = {}
+            with simple_timer("compute_score", timing):
+                batch = TensorDict(
+                    {
+                        "prompts": prompts,  # [1, prompt_length]
+                        "responses": responses,  # [1, response_length]
+                        "attention_mask": attention_mask,  # [1, prompt_length + response_length]
+                        "input_ids": input_ids,  # [1, prompt_length + response_length]
+                        "position_ids": position_ids,
+                    },
+                    batch_size=1,
+                )
+                non_tensor_batch = {
+                    **{k: np.array([v]) for k, v in kwargs.items()},
+                    "__num_turns__": np.array([output.num_turns]),
+                    "tool_extra_fields": np.array([output.extra_fields], dtype=object),
+                }
+
+                data = DataProto(
+                    batch=batch,
+                    non_tensor_batch=non_tensor_batch,
+                )
+                selected_reward_loop_worker_handle = random.choice(self.reward_loop_worker_handles)
+                result = await selected_reward_loop_worker_handle.compute_score.remote(data)
+                output.reward_score = result["reward_score"]
+                output.extra_fields["reward_extra_info"] = result["reward_extra_info"]
+            output.metrics.compute_score = timing["compute_score"]

     async def _compute_teacher_logprobs(self, output: AgentLoopOutput, prompt_ids, response_ids, validate):
         """Compute teacher logprobs for single sample."""
@@ -1200,6 +1205,7 @@ def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: Data
         timing = {}
         t_generate_sequences = np.array([metric["generate_sequences"] for chunk in metrics for metric in chunk])
         t_tool_calls = np.array([metric["tool_calls"] for chunk in metrics for metric in chunk])
+        t_compute_score = np.array([metric["compute_score"] for chunk in metrics for metric in chunk])
         num_preempted = np.array([metric["num_preempted"] for chunk in metrics for metric in chunk])
         timing["agent_loop/num_preempted/min"] = num_preempted.min()
         timing["agent_loop/num_preempted/max"] = num_preempted.max()
@@ -1210,12 +1216,16 @@ def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: Data
         timing["agent_loop/tool_calls/min"] = t_tool_calls.min()
         timing["agent_loop/tool_calls/max"] = t_tool_calls.max()
         timing["agent_loop/tool_calls/mean"] = t_tool_calls.mean()
+        timing["agent_loop/compute_score/min"] = t_compute_score.min()
+        timing["agent_loop/compute_score/max"] = t_compute_score.max()
+        timing["agent_loop/compute_score/mean"] = t_compute_score.mean()

         # batch sequence generation is bounded by the slowest sample
-        slowest = np.argmax(t_generate_sequences + t_tool_calls)
+        slowest = np.argmax(t_generate_sequences + t_tool_calls + t_compute_score)
         prompt_length = output.batch["prompts"].shape[1]
         timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest]
         timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest]
+        timing["agent_loop/slowest/compute_score"] = t_compute_score[slowest]
         timing["agent_loop/slowest/num_preempted"] = num_preempted[slowest]

         if "attention_mask" in output.batch:
```
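The aggregation added to `_performance_metrics` can be exercised standalone. Below is a toy sketch of the min/max/mean/slowest computation over per-sample metric dicts; the nested list-of-lists mirrors the `metrics: list[list[dict]]` chunking in the diff, and the timing values are fabricated for illustration, not from a real run:

```python
import numpy as np

# Per-sample metrics as emitted by each agent loop worker chunk (toy values).
metrics = [
    [{"generate_sequences": 1.0, "tool_calls": 0.2, "compute_score": 0.5}],
    [{"generate_sequences": 2.0, "tool_calls": 0.1, "compute_score": 0.3},
     {"generate_sequences": 0.8, "tool_calls": 0.4, "compute_score": 2.0}],
]

# Flatten the chunked metrics into one array per phase.
t_generate = np.array([m["generate_sequences"] for chunk in metrics for m in chunk])
t_tool_calls = np.array([m["tool_calls"] for chunk in metrics for m in chunk])
t_compute_score = np.array([m["compute_score"] for chunk in metrics for m in chunk])

timing = {
    "agent_loop/compute_score/min": t_compute_score.min(),
    "agent_loop/compute_score/max": t_compute_score.max(),
    "agent_loop/compute_score/mean": t_compute_score.mean(),
}

# The batch is bounded by the slowest sample summed across all three phases,
# so compute_score now participates in picking the "slowest" index.
slowest = np.argmax(t_generate + t_tool_calls + t_compute_score)
timing["agent_loop/slowest/compute_score"] = t_compute_score[slowest]
```

Including `t_compute_score` in the `argmax` sum means a sample that generates quickly but has a slow reward call can now be surfaced as the batch bottleneck, which is the point of the new metric.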
