
Commit a7d00a9

[python-package] standardize variable naming for evaluation results (#7163)
1 parent 80ed9f6 commit a7d00a9

8 files changed: 90 additions & 82 deletions


examples/python-guide/advanced_example.py

Lines changed: 2 additions & 2 deletions
@@ -148,7 +148,7 @@ def loglikelihood(preds, train_data):
 
 
 # self-defined eval metric
-# f(preds: array, train_data: Dataset) -> name: str, eval_result: float, is_higher_better: bool
+# f(preds: array, train_data: Dataset) -> metric_name: str, metric_value: float, maximize: bool
 # binary error
 # NOTE: when you do customized loss function, the default prediction value is margin
 # This may make built-in evaluation metric calculate wrong results
@@ -172,7 +172,7 @@ def binary_error(preds, train_data):
 
 
 # another self-defined eval metric
-# f(preds: array, train_data: Dataset) -> name: str, eval_result: float, is_higher_better: bool
+# f(preds: array, train_data: Dataset) -> metric_name: str, metric_value: float, maximize: bool
 # accuracy
 # NOTE: when you do customized loss function, the default prediction value is margin
 # This may make built-in evaluation metric calculate wrong results
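For context, a minimal sketch of a custom metric matching the signature documented above (the body is illustrative, not the exact code from this example file):

import numpy as np

def binary_error(preds, train_data):
    labels = train_data.get_label()
    # with a custom objective, preds are raw margins, so map them to probabilities first
    probs = 1.0 / (1.0 + np.exp(-preds))
    # returns (metric_name, metric_value, maximize); error is lower-better, hence False
    return "error", float(np.mean(labels != (probs > 0.5))), False

A callable like this is passed to lgb.train() through the feval parameter.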

examples/python-guide/sklearn_example.py

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@
 
 
 # self-defined eval metric
-# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
+# f(y_true: array, y_pred: array) -> metric_name: str, metric_value: float, maximize: bool
 # Root Mean Squared Logarithmic Error (RMSLE)
 def rmsle(y_true, y_pred):
     return "RMSLE", np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
@@ -48,7 +48,7 @@ def rmsle(y_true, y_pred):
 
 
 # another self-defined eval metric
-# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
+# f(y_true: array, y_pred: array) -> metric_name: str, metric_value: float, maximize: bool
 # Relative Absolute Error (RAE)
 def rae(y_true, y_pred):
     return "RAE", np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False
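In the scikit-learn interface these callables are passed through eval_metric rather than feval; a minimal usage sketch, assuming X_train, y_train, X_test, and y_test are already defined:

import lightgbm as lgb

gbm = lgb.LGBMRegressor(n_estimators=20, learning_rate=0.05)
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=[rmsle, rae],  # a single callable or a list of callables
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)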

python-package/lightgbm/basic.py

Lines changed: 28 additions & 28 deletions
@@ -4345,7 +4345,7 @@ def eval(
         feval : callable, list of callable, or None, optional (default=None)
             Customized evaluation function.
             Each evaluation function should accept two parameters: preds, eval_data,
-            and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+            and return (metric_name, metric_value, maximize) or list of such tuples.
 
                 preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                     The predicted values.
@@ -4354,17 +4354,17 @@ def eval(
                     e.g. they are raw margin instead of probability of positive class for binary task in this case.
                 eval_data : Dataset
                     A ``Dataset`` to evaluate.
-                eval_name : str
-                    The name of evaluation function (without whitespace).
-                eval_result : float
-                    The eval result.
-                is_higher_better : bool
-                    Is eval result higher better, e.g. AUC is ``is_higher_better``.
+                metric_name : str
+                    Unique identifier for the metric (e.g. "custom_adjusted_mse").
+                metric_value : float
+                    Value of the evaluation metric.
+                maximize : bool
+                    Are higher values better? e.g. ``True`` for AUC and ``False`` for binary error.
 
         Returns
         -------
         result : list
-            List with (dataset_name, eval_name, eval_result, is_higher_better) tuples.
+            List with (dataset_name, metric_name, metric_value, maximize) tuples.
         """
         if not isinstance(data, Dataset):
             raise TypeError("Can only eval for Dataset instance")
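For orientation, a minimal sketch of Booster.eval() with a custom feval, assuming X and y are an existing feature matrix and label array:

import numpy as np
import lightgbm as lgb

def custom_mae(preds, eval_data):
    # returns (metric_name, metric_value, maximize)
    return "custom_mae", float(np.mean(np.abs(preds - eval_data.get_label()))), False

train_data = lgb.Dataset(X, label=y)
booster = lgb.train({"objective": "regression"}, train_data, num_boost_round=10)

# a list of (dataset_name, metric_name, metric_value, maximize) tuples
print(booster.eval(train_data, name="train", feval=custom_mae))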
@@ -4394,7 +4394,7 @@ def eval_train(
         feval : callable, list of callable, or None, optional (default=None)
             Customized evaluation function.
             Each evaluation function should accept two parameters: preds, eval_data,
-            and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+            and return (metric_name, metric_value, maximize) or list of such tuples.
 
                 preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                     The predicted values.
@@ -4403,17 +4403,17 @@ def eval_train(
                     e.g. they are raw margin instead of probability of positive class for binary task in this case.
                 eval_data : Dataset
                     The training dataset.
-                eval_name : str
-                    The name of evaluation function (without whitespace).
-                eval_result : float
-                    The eval result.
-                is_higher_better : bool
-                    Is eval result higher better, e.g. AUC is ``is_higher_better``.
+                metric_name : str
+                    Unique identifier for the metric (e.g. "custom_adjusted_mse").
+                metric_value : float
+                    Value of the evaluation metric.
+                maximize : bool
+                    Are higher values better? e.g. ``True`` for AUC and ``False`` for binary error.
 
         Returns
         -------
         result : list
-            List with (train_dataset_name, eval_name, eval_result, is_higher_better) tuples.
+            List with (train_dataset_name, metric_name, metric_value, maximize) tuples.
         """
         return self.__inner_eval(data_name=self._train_data_name, data_idx=0, feval=feval)
 
@@ -4428,7 +4428,7 @@ def eval_valid(
         feval : callable, list of callable, or None, optional (default=None)
             Customized evaluation function.
             Each evaluation function should accept two parameters: preds, eval_data,
-            and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+            and return (metric_name, metric_value, maximize) or list of such tuples.
 
                 preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                     The predicted values.
@@ -4437,17 +4437,17 @@ def eval_valid(
                     e.g. they are raw margin instead of probability of positive class for binary task in this case.
                 eval_data : Dataset
                     The validation dataset.
-                eval_name : str
-                    The name of evaluation function (without whitespace).
-                eval_result : float
-                    The eval result.
-                is_higher_better : bool
-                    Is eval result higher better, e.g. AUC is ``is_higher_better``.
+                metric_name : str
+                    Unique identifier for the metric (e.g. "custom_adjusted_mse").
+                metric_value : float
+                    Value of the evaluation metric.
+                maximize : bool
+                    Are higher values better? e.g. ``True`` for AUC and ``False`` for binary error.
 
         Returns
         -------
         result : list
-            List with (validation_dataset_name, eval_name, eval_result, is_higher_better) tuples.
+            List with (validation_dataset_name, metric_name, metric_value, maximize) tuples.
         """
         return [
             item
@@ -5215,11 +5215,11 @@ def __inner_eval(
                     continue
                 feval_ret = eval_function(self.__inner_predict(data_idx=data_idx), cur_data)
                 if isinstance(feval_ret, list):
-                    for eval_name, val, is_higher_better in feval_ret:
-                        ret.append((data_name, eval_name, val, is_higher_better))
+                    for metric_name, metric_value, maximize in feval_ret:
+                        ret.append((data_name, metric_name, metric_value, maximize))
                 else:
-                    eval_name, val, is_higher_better = feval_ret
-                    ret.append((data_name, eval_name, val, is_higher_better))
+                    metric_name, metric_value, maximize = feval_ret
+                    ret.append((data_name, metric_name, metric_value, maximize))
         return ret
 
     def __inner_predict(self, *, data_idx: int) -> np.ndarray:
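The isinstance() branch in __inner_eval above is what allows one callable to report several metrics at once; a sketch of such a feval (names and metrics are illustrative):

import numpy as np

def regression_report(preds, eval_data):
    residuals = preds - eval_data.get_label()
    # a list of (metric_name, metric_value, maximize) tuples is also accepted
    return [
        ("custom_mae", float(np.mean(np.abs(residuals))), False),
        ("custom_mse", float(np.mean(residuals ** 2)), False),
    ]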

python-package/lightgbm/callback.py

Lines changed: 10 additions & 4 deletions
@@ -52,7 +52,7 @@ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) ->
         best_iteration : int
             The best iteration stopped.
             0-based... pass ``best_iteration=2`` to indicate that the third iteration was the best one.
-        best_score : list of (eval_name, metric_name, eval_result, is_higher_better) tuple or (eval_name, metric_name, eval_result, is_higher_better, stdv) tuple
+        best_score : list of (dataset_name, metric_name, metric_value, maximize) tuple or (dataset_name, metric_name, metric_value, maximize, metric_std_dev) tuple
             Scores for each metric, on each validation set, as of the best iteration.
         """
         super().__init__()
@@ -317,7 +317,7 @@ def _is_train_set(self, *, dataset_name: str, env: CallbackEnv) -> bool:
         if _is_using_cv(env) and dataset_name == "train":
             return True
 
-        # for lgb.train(), it's possible to pass the training data via valid_sets with any eval_name
+        # for lgb.train(), it's possible to pass the training data via valid_sets with any name
         if isinstance(env.model, Booster) and dataset_name == env.model._train_data_name:
             return True
 
@@ -397,7 +397,10 @@ def _final_iteration_check(self, *, env: CallbackEnv, metric_name: str, i: int)
             )
             if self.first_metric_only:
                 _log_info(f"Evaluated only: {metric_name}")
-            raise EarlyStopException(self.best_iter[i], self.best_score_list[i])
+            raise EarlyStopException(
+                best_iteration=self.best_iter[i],
+                best_score=self.best_score_list[i],
+            )
 
     def __call__(self, env: CallbackEnv) -> None:
         if env.iteration == env.begin_iteration:
@@ -437,7 +440,10 @@ def __call__(self, env: CallbackEnv) -> None:
                 _log_info(f"Early stopping, best iteration is:\n[{self.best_iter[i] + 1}]\t{eval_result_str}")
                 if self.first_metric_only:
                     _log_info(f"Evaluated only: {metric_name}")
-                raise EarlyStopException(self.best_iter[i], self.best_score_list[i])
+                raise EarlyStopException(
+                    best_iteration=self.best_iter[i],
+                    best_score=self.best_score_list[i],
+                )
             self._final_iteration_check(env=env, metric_name=metric_name, i=i)
 
 
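For reference, a sketch of how the keyword-only construction above looks to a caller catching the exception; the values here are purely illustrative:

from lightgbm.callback import EarlyStopException

try:
    raise EarlyStopException(
        best_iteration=9,  # 0-based, so the 10th iteration was the best one
        best_score=[("valid_0", "auc", 0.93, True)],
    )
except EarlyStopException as err:
    # best_score holds (dataset_name, metric_name, metric_value, maximize) tuples
    print(err.best_iteration, err.best_score)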

python-package/lightgbm/dask.py

Lines changed: 2 additions & 1 deletion
@@ -482,7 +482,8 @@ def _train(
         lightgbm estimator is not trained using any chunks of a particular eval set, its corresponding component
         of ``evals_result_`` and ``best_score_`` will be empty dictionaries.
     eval_names : list of str, or None, optional (default=None)
-        Names of eval_set.
+        Unique identifiers for each evaluation dataset.
+        Should be the same length as ``eval_set`` / ``eval_X``.
     eval_X : Dask Array or Dask DataFrame, tuple thereof or None, optional (default=None)
         Feature matrix or tuple thereof, e.g. ``(X_val0, X_val1)``, to use as validation sets.
     eval_y : Dask Array or Dask DataFrame or Dask Series, tuple thereof or None, optional (default=None)
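A minimal sketch of how these identifiers line up with the evaluation sets in the Dask interface, assuming dX, dy, dX_val, and dy_val are existing Dask collections:

import lightgbm as lgb

model = lgb.DaskLGBMRegressor(n_estimators=50)
model.fit(
    dX,
    dy,
    eval_set=[(dX_val, dy_val)],
    eval_names=["my_validation_data"],  # one name per entry in eval_set
)
# per-dataset results are keyed by the identifiers supplied above
print(model.evals_result_["my_validation_data"])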

python-package/lightgbm/engine.py

Lines changed: 20 additions & 20 deletions
@@ -135,7 +135,7 @@ def train(
     feval : callable, list of callable, or None, optional (default=None)
         Customized evaluation function.
         Each evaluation function should accept two parameters: preds, eval_data,
-        and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+        and return (metric_name, metric_value, maximize) or list of such tuples.
 
             preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                 The predicted values.
@@ -144,12 +144,12 @@ def train(
                 e.g. they are raw margin instead of probability of positive class for binary task in this case.
             eval_data : Dataset
                 A ``Dataset`` to evaluate.
-            eval_name : str
-                The name of evaluation function (without whitespaces).
-            eval_result : float
-                The eval result.
-            is_higher_better : bool
-                Is eval result higher better, e.g. AUC is ``is_higher_better``.
+            metric_name : str
+                Unique identifier for the metric (e.g. "custom_adjusted_mse").
+            metric_value : float
+                Value of the evaluation metric.
+            maximize : bool
+                Are higher values better? e.g. ``True`` for AUC and ``False`` for binary error.
 
         To ignore the default metric corresponding to the used objective,
         set the ``metric`` parameter to the string ``"None"`` in ``params``.
@@ -346,8 +346,8 @@ def train(
             evaluation_result_list = [item[:4] for item in earlyStopException.best_score]
             break
     booster.best_score = defaultdict(OrderedDict)
-    for dataset_name, eval_name, score, _ in evaluation_result_list:
-        booster.best_score[dataset_name][eval_name] = score
+    for dataset_name, metric_name, metric_value, _ in evaluation_result_list:
+        booster.best_score[dataset_name][metric_name] = metric_value
     if not keep_training_booster:
         booster.model_from_string(booster.model_to_string()).free_dataset()
     return booster
@@ -600,7 +600,7 @@ def _agg_cv_result(
     # build up 2 maps, of the form:
     #
     # OrderedDict{
-    #     (<dataset_name>, <metric_name>): <is_higher_better>
+    #     (<dataset_name>, <metric_name>): <maximize>
     # }
     #
     # OrderedDict{
@@ -610,16 +610,16 @@ def _agg_cv_result(
     metric_types: Dict[Tuple[str, str], bool] = OrderedDict()
     metric_values: Dict[Tuple[str, str], List[float]] = OrderedDict()
     for one_result in raw_results:
-        for dataset_name, metric_name, metric_value, is_higher_better in one_result:
+        for dataset_name, metric_name, metric_value, maximize in one_result:
             key = (dataset_name, metric_name)
-            metric_types[key] = is_higher_better
+            metric_types[key] = maximize
             metric_values.setdefault(key, [])
             metric_values[key].append(metric_value)
 
     # turn that into a list of tuples of the form:
     #
     # [
-    #     (<dataset_name>, <metric_name>, mean(<values>), <is_higher_better>, std_dev(<values>))
+    #     (<dataset_name>, <metric_name>, mean(<values>), <maximize>, std_dev(<values>))
     # ]
     return [(k[0], k[1], float(np.mean(v)), metric_types[k], float(np.std(v))) for k, v in metric_values.items()]
 
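Standalone, the aggregation above amounts to the following sketch over per-fold results (the input tuples are illustrative):

from collections import OrderedDict

import numpy as np

# one inner list per CV fold, each holding (dataset_name, metric_name, metric_value, maximize) tuples
raw_results = [
    [("valid", "l2", 0.40, False)],
    [("valid", "l2", 0.44, False)],
    [("valid", "l2", 0.42, False)],
]

metric_types = OrderedDict()   # (dataset_name, metric_name) -> maximize
metric_values = OrderedDict()  # (dataset_name, metric_name) -> values across folds
for one_result in raw_results:
    for dataset_name, metric_name, metric_value, maximize in one_result:
        key = (dataset_name, metric_name)
        metric_types[key] = maximize
        metric_values.setdefault(key, []).append(metric_value)

# one (dataset_name, metric_name, mean, maximize, std_dev) tuple per metric
print([(k[0], k[1], float(np.mean(v)), metric_types[k], float(np.std(v))) for k, v in metric_values.items()])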

@@ -670,7 +670,7 @@ def cv(
     feval : callable, list of callable, or None, optional (default=None)
         Customized evaluation function.
         Each evaluation function should accept two parameters: preds, eval_data,
-        and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+        and return (metric_name, metric_value, maximize) or list of such tuples.
 
             preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                 The predicted values.
@@ -679,12 +679,12 @@ def cv(
                 e.g. they are raw margin instead of probability of positive class for binary task in this case.
             eval_data : Dataset
                 A ``Dataset`` to evaluate.
-            eval_name : str
-                The name of evaluation function (without whitespace).
-            eval_result : float
-                The eval result.
-            is_higher_better : bool
-                Is eval result higher better, e.g. AUC is ``is_higher_better``.
+            metric_name : str
+                Unique identifier for the metric (e.g. "custom_adjusted_mse").
+            metric_value : float
+                Value of the evaluation metric.
+            maximize : bool
+                Are higher values better? e.g. ``True`` for AUC and ``False`` for binary error.
 
         To ignore the default metric corresponding to the used objective,
         set ``metrics`` to the string ``"None"``.
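Tying this together, a minimal sketch of cv() with only a custom metric, assuming train_data is an existing lgb.Dataset and custom_mae follows the documented signature:

import lightgbm as lgb

cv_results = lgb.cv(
    params={"objective": "regression"},
    train_set=train_data,
    num_boost_round=50,
    nfold=5,
    metrics="None",  # drop the default metric of the objective
    feval=custom_mae,
)
# each metric is reported as a -mean and a -stdv series across folds
print(sorted(cv_results.keys()))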
