
Commit a7d00a9

[python-package] standardize variable naming for evaluation results (#7163)
1 parent 80ed9f6 commit a7d00a9

8 files changed: 90 additions & 82 deletions


examples/python-guide/advanced_example.py

Lines changed: 2 additions & 2 deletions
@@ -148,7 +148,7 @@ def loglikelihood(preds, train_data):
 
 
 # self-defined eval metric
-# f(preds: array, train_data: Dataset) -> name: str, eval_result: float, is_higher_better: bool
+# f(preds: array, train_data: Dataset) -> metric_name: str, metric_value: float, maximize: bool
 # binary error
 # NOTE: when you do customized loss function, the default prediction value is margin
 # This may make built-in evaluation metric calculate wrong results
@@ -172,7 +172,7 @@ def binary_error(preds, train_data):
 
 
 # another self-defined eval metric
-# f(preds: array, train_data: Dataset) -> name: str, eval_result: float, is_higher_better: bool
+# f(preds: array, train_data: Dataset) -> metric_name: str, metric_value: float, maximize: bool
 # accuracy
 # NOTE: when you do customized loss function, the default prediction value is margin
 # This may make built-in evaluation metric calculate wrong results
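For context, a minimal sketch of a custom metric matching the signature documented above (the body is illustrative, not the exact code from this example file):

import numpy as np

def binary_error(preds, train_data):
    labels = train_data.get_label()
    # with a custom objective, preds are raw margins, so map them to probabilities first
    probs = 1.0 / (1.0 + np.exp(-preds))
    # returns (metric_name, metric_value, maximize); error is lower-better, hence False
    return "error", float(np.mean(labels != (probs > 0.5))), False

A callable like this is passed to lgb.train() through the feval parameter.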

examples/python-guide/sklearn_example.py

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@
 
 
 # self-defined eval metric
-# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
+# f(y_true: array, y_pred: array) -> metric_name: str, metric_value: float, maximize: bool
 # Root Mean Squared Logarithmic Error (RMSLE)
 def rmsle(y_true, y_pred):
     return "RMSLE", np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
@@ -48,7 +48,7 @@ def rmsle(y_true, y_pred):
 
 
 # another self-defined eval metric
-# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
+# f(y_true: array, y_pred: array) -> metric_name: str, metric_value: float, maximize: bool
 # Relative Absolute Error (RAE)
 def rae(y_true, y_pred):
     return "RAE", np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False
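In the scikit-learn interface these callables are passed through eval_metric rather than feval; a minimal usage sketch, assuming X_train, y_train, X_test, and y_test are already defined:

import lightgbm as lgb

gbm = lgb.LGBMRegressor(n_estimators=20, learning_rate=0.05)
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=[rmsle, rae],  # a single callable or a list of callables
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)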

python-package/lightgbm/basic.py

Lines changed: 28 additions & 28 deletions
@@ -4345,7 +4345,7 @@ def eval(
         feval : callable, list of callable, or None, optional (default=None)
             Customized evaluation function.
             Each evaluation function should accept two parameters: preds, eval_data,
-            and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+            and return (metric_name, metric_value, maximize) or list of such tuples.
 
                 preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                     The predicted values.
@@ -4354,17 +4354,17 @@ def eval(
                     e.g. they are raw margin instead of probability of positive class for binary task in this case.
                 eval_data : Dataset
                     A ``Dataset`` to evaluate.
-                eval_name : str
-                    The name of evaluation function (without whitespace).
-                eval_result : float
-                    The eval result.
-                is_higher_better : bool
-                    Is eval result higher better, e.g. AUC is ``is_higher_better``.
+                metric_name : str
+                    Unique identifier for the metric (e.g. "custom_adjusted_mse").
+                metric_value : float
+                    Value of the evaluation metric.
+                maximize : bool
+                    Are higher values better? e.g. ``True`` for AUC and ``False`` for binary error.
 
         Returns
         -------
         result : list
-            List with (dataset_name, eval_name, eval_result, is_higher_better) tuples.
+            List with (dataset_name, metric_name, metric_value, maximize) tuples.
         """
         if not isinstance(data, Dataset):
             raise TypeError("Can only eval for Dataset instance")
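For orientation, a minimal sketch of Booster.eval() with a custom feval, assuming X and y are an existing feature matrix and label array:

import numpy as np
import lightgbm as lgb

def custom_mae(preds, eval_data):
    # returns (metric_name, metric_value, maximize)
    return "custom_mae", float(np.mean(np.abs(preds - eval_data.get_label()))), False

train_data = lgb.Dataset(X, label=y)
booster = lgb.train({"objective": "regression"}, train_data, num_boost_round=10)

# a list of (dataset_name, metric_name, metric_value, maximize) tuples
print(booster.eval(train_data, name="train", feval=custom_mae))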
@@ -4394,7 +4394,7 @@ def eval_train(
         feval : callable, list of callable, or None, optional (default=None)
             Customized evaluation function.
             Each evaluation function should accept two parameters: preds, eval_data,
-            and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+            and return (metric_name, metric_value, maximize) or list of such tuples.
 
                 preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                     The predicted values.
@@ -4403,17 +4403,17 @@ def eval_train(
                     e.g. they are raw margin instead of probability of positive class for binary task in this case.
                 eval_data : Dataset
                     The training dataset.
-                eval_name : str
-                    The name of evaluation function (without whitespace).
-                eval_result : float
-                    The eval result.
-                is_higher_better : bool
-                    Is eval result higher better, e.g. AUC is ``is_higher_better``.
+                metric_name : str
+                    Unique identifier for the metric (e.g. "custom_adjusted_mse").
+                metric_value : float
+                    Value of the evaluation metric.
+                maximize : bool
+                    Are higher values better? e.g. ``True`` for AUC and ``False`` for binary error.
 
         Returns
         -------
         result : list
-            List with (train_dataset_name, eval_name, eval_result, is_higher_better) tuples.
+            List with (train_dataset_name, metric_name, metric_value, maximize) tuples.
         """
         return self.__inner_eval(data_name=self._train_data_name, data_idx=0, feval=feval)
 
@@ -4428,7 +4428,7 @@ def eval_valid(
         feval : callable, list of callable, or None, optional (default=None)
             Customized evaluation function.
             Each evaluation function should accept two parameters: preds, eval_data,
-            and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+            and return (metric_name, metric_value, maximize) or list of such tuples.
 
                 preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                     The predicted values.
@@ -4437,17 +4437,17 @@ def eval_valid(
                     e.g. they are raw margin instead of probability of positive class for binary task in this case.
                 eval_data : Dataset
                     The validation dataset.
-                eval_name : str
-                    The name of evaluation function (without whitespace).
-                eval_result : float
-                    The eval result.
-                is_higher_better : bool
-                    Is eval result higher better, e.g. AUC is ``is_higher_better``.
+                metric_name : str
+                    Unique identifier for the metric (e.g. "custom_adjusted_mse").
+                metric_value : float
+                    Value of the evaluation metric.
+                maximize : bool
+                    Are higher values better? e.g. ``True`` for AUC and ``False`` for binary error.
 
         Returns
         -------
         result : list
-            List with (validation_dataset_name, eval_name, eval_result, is_higher_better) tuples.
+            List with (validation_dataset_name, metric_name, metric_value, maximize) tuples.
         """
         return [
             item
@@ -5215,11 +5215,11 @@ def __inner_eval(
                     continue
                 feval_ret = eval_function(self.__inner_predict(data_idx=data_idx), cur_data)
                 if isinstance(feval_ret, list):
-                    for eval_name, val, is_higher_better in feval_ret:
-                        ret.append((data_name, eval_name, val, is_higher_better))
+                    for metric_name, metric_value, maximize in feval_ret:
+                        ret.append((data_name, metric_name, metric_value, maximize))
                 else:
-                    eval_name, val, is_higher_better = feval_ret
-                    ret.append((data_name, eval_name, val, is_higher_better))
+                    metric_name, metric_value, maximize = feval_ret
+                    ret.append((data_name, metric_name, metric_value, maximize))
         return ret
 
     def __inner_predict(self, *, data_idx: int) -> np.ndarray:
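The isinstance() branch in __inner_eval above is what allows one callable to report several metrics at once; a sketch of such a feval (names and metrics are illustrative):

import numpy as np

def regression_report(preds, eval_data):
    residuals = preds - eval_data.get_label()
    # a list of (metric_name, metric_value, maximize) tuples is also accepted
    return [
        ("custom_mae", float(np.mean(np.abs(residuals))), False),
        ("custom_mse", float(np.mean(residuals ** 2)), False),
    ]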

python-package/lightgbm/callback.py

Lines changed: 10 additions & 4 deletions
@@ -52,7 +52,7 @@ def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) ->
         best_iteration : int
             The best iteration stopped.
             0-based... pass ``best_iteration=2`` to indicate that the third iteration was the best one.
-        best_score : list of (eval_name, metric_name, eval_result, is_higher_better) tuple or (eval_name, metric_name, eval_result, is_higher_better, stdv) tuple
+        best_score : list of (dataset_name, metric_name, metric_value, maximize) tuple or (dataset_name, metric_name, metric_value, maximize, metric_std_dev) tuple
             Scores for each metric, on each validation set, as of the best iteration.
         """
         super().__init__()
@@ -317,7 +317,7 @@ def _is_train_set(self, *, dataset_name: str, env: CallbackEnv) -> bool:
         if _is_using_cv(env) and dataset_name == "train":
             return True
 
-        # for lgb.train(), it's possible to pass the training data via valid_sets with any eval_name
+        # for lgb.train(), it's possible to pass the training data via valid_sets with any name
         if isinstance(env.model, Booster) and dataset_name == env.model._train_data_name:
             return True
 
@@ -397,7 +397,10 @@ def _final_iteration_check(self, *, env: CallbackEnv, metric_name: str, i: int)
             )
             if self.first_metric_only:
                 _log_info(f"Evaluated only: {metric_name}")
-            raise EarlyStopException(self.best_iter[i], self.best_score_list[i])
+            raise EarlyStopException(
+                best_iteration=self.best_iter[i],
+                best_score=self.best_score_list[i],
+            )
 
     def __call__(self, env: CallbackEnv) -> None:
         if env.iteration == env.begin_iteration:
@@ -437,7 +440,10 @@ def __call__(self, env: CallbackEnv) -> None:
                 _log_info(f"Early stopping, best iteration is:\n[{self.best_iter[i] + 1}]\t{eval_result_str}")
                 if self.first_metric_only:
                     _log_info(f"Evaluated only: {metric_name}")
-                raise EarlyStopException(self.best_iter[i], self.best_score_list[i])
+                raise EarlyStopException(
+                    best_iteration=self.best_iter[i],
+                    best_score=self.best_score_list[i],
+                )
             self._final_iteration_check(env=env, metric_name=metric_name, i=i)
 
 
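For reference, a sketch of how the keyword-only construction above looks to a caller catching the exception; the values here are purely illustrative:

from lightgbm.callback import EarlyStopException

try:
    raise EarlyStopException(
        best_iteration=9,  # 0-based, so the 10th iteration was the best one
        best_score=[("valid_0", "auc", 0.93, True)],
    )
except EarlyStopException as err:
    # best_score holds (dataset_name, metric_name, metric_value, maximize) tuples
    print(err.best_iteration, err.best_score)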

python-package/lightgbm/dask.py

Lines changed: 2 additions & 1 deletion
@@ -482,7 +482,8 @@ def _train(
         lightgbm estimator is not trained using any chunks of a particular eval set, its corresponding component
         of ``evals_result_`` and ``best_score_`` will be empty dictionaries.
     eval_names : list of str, or None, optional (default=None)
-        Names of eval_set.
+        Unique identifiers for each evaluation dataset.
+        Should be the same length as ``eval_set`` / ``eval_X``.
     eval_X : Dask Array or Dask DataFrame, tuple thereof or None, optional (default=None)
         Feature matrix or tuple thereof, e.g. ``(X_val0, X_val1)``, to use as validation sets.
     eval_y : Dask Array or Dask DataFrame or Dask Series, tuple thereof or None, optional (default=None)
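A minimal sketch of how these identifiers line up with the evaluation sets in the Dask interface, assuming dX, dy, dX_val, and dy_val are existing Dask collections:

import lightgbm as lgb

model = lgb.DaskLGBMRegressor(n_estimators=50)
model.fit(
    dX,
    dy,
    eval_set=[(dX_val, dy_val)],
    eval_names=["my_validation_data"],  # one name per entry in eval_set
)
# per-dataset results are keyed by the identifiers supplied above
print(model.evals_result_["my_validation_data"])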

python-package/lightgbm/engine.py

Lines changed: 20 additions & 20 deletions
@@ -135,7 +135,7 @@ def train(
     feval : callable, list of callable, or None, optional (default=None)
         Customized evaluation function.
         Each evaluation function should accept two parameters: preds, eval_data,
-        and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+        and return (metric_name, metric_value, maximize) or list of such tuples.
 
             preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                 The predicted values.
@@ -144,12 +144,12 @@ def train(
                 e.g. they are raw margin instead of probability of positive class for binary task in this case.
             eval_data : Dataset
                 A ``Dataset`` to evaluate.
-            eval_name : str
-                The name of evaluation function (without whitespaces).
-            eval_result : float
-                The eval result.
-            is_higher_better : bool
-                Is eval result higher better, e.g. AUC is ``is_higher_better``.
+            metric_name : str
+                Unique identifier for the metric (e.g. "custom_adjusted_mse").
+            metric_value : float
+                Value of the evaluation metric.
+            maximize : bool
+                Are higher values better? e.g. ``True`` for AUC and ``False`` for binary error.
 
         To ignore the default metric corresponding to the used objective,
         set the ``metric`` parameter to the string ``"None"`` in ``params``.
@@ -346,8 +346,8 @@ def train(
             evaluation_result_list = [item[:4] for item in earlyStopException.best_score]
             break
     booster.best_score = defaultdict(OrderedDict)
-    for dataset_name, eval_name, score, _ in evaluation_result_list:
-        booster.best_score[dataset_name][eval_name] = score
+    for dataset_name, metric_name, metric_value, _ in evaluation_result_list:
+        booster.best_score[dataset_name][metric_name] = metric_value
     if not keep_training_booster:
         booster.model_from_string(booster.model_to_string()).free_dataset()
     return booster
@@ -600,7 +600,7 @@ def _agg_cv_result(
     # build up 2 maps, of the form:
     #
     # OrderedDict{
-    #     (<dataset_name>, <metric_name>): <is_higher_better>
+    #     (<dataset_name>, <metric_name>): <maximize>
     # }
     #
     # OrderedDict{
@@ -610,16 +610,16 @@ def _agg_cv_result(
     metric_types: Dict[Tuple[str, str], bool] = OrderedDict()
     metric_values: Dict[Tuple[str, str], List[float]] = OrderedDict()
     for one_result in raw_results:
-        for dataset_name, metric_name, metric_value, is_higher_better in one_result:
+        for dataset_name, metric_name, metric_value, maximize in one_result:
             key = (dataset_name, metric_name)
-            metric_types[key] = is_higher_better
+            metric_types[key] = maximize
             metric_values.setdefault(key, [])
             metric_values[key].append(metric_value)
 
     # turn that into a list of tuples of the form:
     #
     # [
-    #     (<dataset_name>, <metric_name>, mean(<values>), <is_higher_better>, std_dev(<values>))
+    #     (<dataset_name>, <metric_name>, mean(<values>), <maximize>, std_dev(<values>))
     # ]
     return [(k[0], k[1], float(np.mean(v)), metric_types[k], float(np.std(v))) for k, v in metric_values.items()]
 
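Standalone, the aggregation above amounts to the following sketch over per-fold results (the input tuples are illustrative):

from collections import OrderedDict

import numpy as np

# one inner list per CV fold, each holding (dataset_name, metric_name, metric_value, maximize) tuples
raw_results = [
    [("valid", "l2", 0.40, False)],
    [("valid", "l2", 0.44, False)],
    [("valid", "l2", 0.42, False)],
]

metric_types = OrderedDict()   # (dataset_name, metric_name) -> maximize
metric_values = OrderedDict()  # (dataset_name, metric_name) -> values across folds
for one_result in raw_results:
    for dataset_name, metric_name, metric_value, maximize in one_result:
        key = (dataset_name, metric_name)
        metric_types[key] = maximize
        metric_values.setdefault(key, []).append(metric_value)

# one (dataset_name, metric_name, mean, maximize, std_dev) tuple per metric
print([(k[0], k[1], float(np.mean(v)), metric_types[k], float(np.std(v))) for k, v in metric_values.items()])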

@@ -670,7 +670,7 @@ def cv(
     feval : callable, list of callable, or None, optional (default=None)
         Customized evaluation function.
         Each evaluation function should accept two parameters: preds, eval_data,
-        and return (eval_name, eval_result, is_higher_better) or list of such tuples.
+        and return (metric_name, metric_value, maximize) or list of such tuples.
 
             preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                 The predicted values.
@@ -679,12 +679,12 @@ def cv(
                 e.g. they are raw margin instead of probability of positive class for binary task in this case.
             eval_data : Dataset
                 A ``Dataset`` to evaluate.
-            eval_name : str
-                The name of evaluation function (without whitespace).
-            eval_result : float
-                The eval result.
-            is_higher_better : bool
-                Is eval result higher better, e.g. AUC is ``is_higher_better``.
+            metric_name : str
+                Unique identifier for the metric (e.g. "custom_adjusted_mse").
+            metric_value : float
+                Value of the evaluation metric.
+            maximize : bool
+                Are higher values better? e.g. ``True`` for AUC and ``False`` for binary error.
 
         To ignore the default metric corresponding to the used objective,
         set ``metrics`` to the string ``"None"``.
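Tying this together, a minimal sketch of cv() with only a custom metric, assuming train_data is an existing lgb.Dataset and custom_mae follows the documented signature:

import lightgbm as lgb

cv_results = lgb.cv(
    params={"objective": "regression"},
    train_set=train_data,
    num_boost_round=50,
    nfold=5,
    metrics="None",  # drop the default metric of the objective
    feval=custom_mae,
)
# each metric is reported as a -mean and a -stdv series across folds
print(sorted(cv_results.keys()))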
