[dev] fix: validation metrics (volcengine#1374)

tongyx361 · web-flow · commit 1d14d57b6817 · 2025-05-04T09:06:53.000-07:00
### Checklist Before Starting - [x] Search for similar PR(s). ### What does this PR do? 1. Fix the bug where `metric` is not recorded when `n == 1`. 2. Remove `std@1`. 3. Add an assertion that fails when initial validation is run but `val_metrics` is empty. ### Additional Info. - **Issue Number**: none - **Training**: none - **Inference**: none ### Checklist Before Submitting - [x] Read the [Contribute Guide](https://github.com/volcengine/verl?tab=readme-ov-file#contribution-guide). - [x] Apply [pre-commit checks](https://github.com/volcengine/verl?tab=readme-ov-file#code-linting-and-formatting). - [x] Add `[BREAKING]` to the PR title if it breaks any API. - [x] Update the documentation about your changes in the [docs](https://github.com/volcengine/verl/tree/main/docs). - [x] Add CI test(s) if necessary.
diff --git a/recipe/dapo/src/dapo_ray_trainer.py b/recipe/dapo/src/dapo_ray_trainer.py
@@ -67,6 +67,7 @@ def fit(self):
         # currently, we only support validation using the reward_function.
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
             logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
diff --git a/recipe/prime/prime_ray_trainer.py b/recipe/prime/prime_ray_trainer.py
@@ -327,6 +327,7 @@ def fit(self):
         # currently, we only support validation using the reward_function.
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
             logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):
diff --git a/verl/trainer/ppo/metric_utils.py b/verl/trainer/ppo/metric_utils.py
@@ -208,45 +208,42 @@ def process_validation_metrics(data_sources: list[str], sample_inputs: list[str]
             for var_name, var_vals in var2vals.items():
                 if isinstance(var_vals[0], str):
                     continue
+
                 metric = {}
                 n_resps = len(var_vals)
                 metric[f"mean@{n_resps}"] = np.mean(var_vals)
-                metric[f"std@{n_resps}"] = np.std(var_vals)
-
-                ns = []
-                n = 2
-                while n < n_resps:
-                    ns.append(n)
-                    n *= 2
-                ns.append(n_resps)
 
-                # If there are multiple responses, we can compute the best/worst-of-N metrics
-                # If not, they are the same as the single response metrics
                 if n_resps > 1:
+                    # n = n_resps
+                    metric[f"std@{n_resps}"] = np.std(var_vals)
+
+                    metric[f"best@{n_resps}/mean"] = np.max(var_vals)
+                    metric[f"worst@{n_resps}/mean"] = np.min(var_vals)
+                    if var2vals.get("pred", None) is not None:
+                        vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
+                        metric[f"maj@{n_resps}/mean"] = calc_maj_val(vote_data, vote_key="pred", val_key="val")
+                    # 1 < n < n_resps
+                    ns = []
+                    n = 2
+                    while n < n_resps:
+                        ns.append(n)
+                        n *= 2
+
                     for n in ns:
-                        if n == n_resps:
-                            # Non-bootstrapped
-                            metric[f"best@{n}/mean"] = np.max(var_vals)
-                            metric[f"worst@{n}/mean"] = np.min(var_vals)
-                            if var2vals.get("pred", None) is not None:
-                                vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
-                                metric[f"maj@{n}/mean"] = calc_maj_val(vote_data, vote_key="pred", val_key="val")
-                        else:
-                            # Bootstrapped
-                            [(bon_mean, bon_std), (won_mean, won_std)] = bootstrap_metric(data=var_vals, subset_size=n, reduce_fns=[np.max, np.min], seed=seed)
-                            metric[f"best@{n}/mean"], metric[f"best@{n}/std"] = bon_mean, bon_std
-                            metric[f"worst@{n}/mean"], metric[f"worst@{n}/std"] = won_mean, won_std
-                            if var2vals.get("pred", None) is not None:
-                                vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
-                                [(maj_n_mean, maj_n_std)] = bootstrap_metric(
-                                    data=vote_data,
-                                    subset_size=n,
-                                    reduce_fns=[partial(calc_maj_val, vote_key="pred", val_key="val")],
-                                    seed=seed,
-                                )
-                                metric[f"maj@{n}/mean"], metric[f"maj@{n}/std"] = maj_n_mean, maj_n_std
-
-                    data_src2prompt2var2metric[data_source][prompt][var_name] = metric
+                        [(bon_mean, bon_std), (won_mean, won_std)] = bootstrap_metric(data=var_vals, subset_size=n, reduce_fns=[np.max, np.min], seed=seed)
+                        metric[f"best@{n}/mean"], metric[f"best@{n}/std"] = bon_mean, bon_std
+                        metric[f"worst@{n}/mean"], metric[f"worst@{n}/std"] = won_mean, won_std
+                        if var2vals.get("pred", None) is not None:
+                            vote_data = [{"val": val, "pred": pred} for val, pred in zip(var_vals, var2vals["pred"])]
+                            [(maj_n_mean, maj_n_std)] = bootstrap_metric(
+                                data=vote_data,
+                                subset_size=n,
+                                reduce_fns=[partial(calc_maj_val, vote_key="pred", val_key="val")],
+                                seed=seed,
+                            )
+                            metric[f"maj@{n}/mean"], metric[f"maj@{n}/std"] = maj_n_mean, maj_n_std
+
+                data_src2prompt2var2metric[data_source][prompt][var_name] = metric
 
     # Aggregate metrics across prompts
     data_src2var2metric2prompt_vals = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
@@ -869,6 +869,7 @@ def fit(self):
         # currently, we only support validation using the reward_function.
         if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
             val_metrics = self._validate()
+            assert val_metrics, f"{val_metrics=}"
             pprint(f"Initial validation metrics: {val_metrics}")
             logger.log(data=val_metrics, step=self.global_steps)
             if self.config.trainer.get("val_only", False):