Skip to content

Commit 9c84805

Browse files
authored
feat: Remove global (cross-worker) stopping criteria (#252)
1 parent 99174d6 commit 9c84805

File tree

15 files changed

+275
-523
lines changed

15 files changed

+275
-523
lines changed

docs/reference/evaluate_pipeline.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ neps.save_pipeline_results(
132132

133133
### 3.4 Common pitfalls
134134

135-
* When using async approach, one worker, may create as many trials as possible, of course that in `Slurm` or other workload managers it's impossible to overload the system because of limitations set for each user, but if you want to control resources used for optimization, it's crucial to set `max_evaluations_per_run` when calling `neps.run`.
135+
* When using the async approach, a single worker may create as many trials as possible. Under `Slurm` or other workload managers it is impossible to overload the system, because of the limits set for each user; however, if you want to control the resources used for optimization, it is crucial to set `evaluations_to_spend` when calling `neps.run`.
136136

137137
## 4  Extra injected arguments
138138

docs/reference/neps_run.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,7 @@ Any new workers that come online will automatically pick up work and work togeth
167167
evaluate_pipeline=...,
168168
pipeline_space=...,
169169
root_directory="some/path",
170-
evaluations_to_spend=100,
171-
max_evaluations_per_run=10, # (1)!
170+
evaluations_to_spend=100, # (1)!
172171
continue_until_max_evaluation_completed=True, # (2)!
173172
overwrite_root_directory=False, #!!!
174173
)
@@ -220,7 +219,7 @@ neps.run(
220219

221220
!!! note
222221

223-
Any runs that error will still count towards the total `evaluations_to_spend` or `max_evaluations_per_run`.
222+
Any runs that error will still count towards the total `evaluations_to_spend`.
224223

225224
### Re-running Failed Configurations
226225

neps/api.py

Lines changed: 18 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
logger = logging.getLogger(__name__)
2929

3030

31-
def run( # noqa: C901, D417, PLR0913
31+
def run( # noqa: D417, PLR0913
3232
evaluate_pipeline: Callable[..., EvaluatePipelineReturn] | str,
3333
pipeline_space: (
3434
Mapping[str, dict | str | int | float | Parameter]
@@ -39,11 +39,9 @@ def run( # noqa: C901, D417, PLR0913
3939
root_directory: str | Path = "neps_results",
4040
overwrite_root_directory: bool = False,
4141
evaluations_to_spend: int | None = None,
42-
max_evaluations_total: int | None = None,
4342
max_evaluations_per_run: int | None = None,
4443
continue_until_max_evaluation_completed: bool = False,
4544
cost_to_spend: int | float | None = None,
46-
max_cost_total: int | float | None = None,
4745
fidelities_to_spend: int | float | None = None,
4846
ignore_errors: bool = False,
4947
objective_value_on_error: float | None = None,
@@ -104,7 +102,6 @@ def evaluate_pipeline(some_parameter: float) -> float:
104102
},
105103
root_directory="usage_example",
106104
evaluations_to_spend=5,
107-
max_evaluations_per_run=10,
108105
)
109106
```
110107
@@ -200,18 +197,15 @@ def evaluate_pipeline(some_parameter: float) -> float:
200197
overwrite_root_directory: If true, delete the working directory at the start of
201198
the run. This is, e.g., useful when debugging a evaluate_pipeline function.
202199
203-
max_evaluations_per_run: Number of evaluations this specific call should do.
200+
evaluations_to_spend: Number of evaluations this specific call/worker should do.
204201
??? note "Limitation on Async mode"
205202
Currently, there is no specific number to control number of parallel evaluations running with
206203
the same worker, so in case you want to limit the number of parallel evaluations,
207-
it's crucial to limit the number of evaluations per run.
208-
209-
evaluations_to_spend: Number of evaluations after which to terminate.
210-
This is shared between all workers operating in the same `root_directory`.
204+
it's crucial to limit the `evaluations_to_spend` accordingly.
211205
212206
continue_until_max_evaluation_completed:
213-
If true, only stop after evaluations_to_spend have been completed.
214-
This is only relevant in the parallel setting.
207+
If true, stop only after evaluations_to_spend have fully completed. In other words,
208+
pipelines that are still running do not count toward the stopping criterion.
215209
216210
cost_to_spend: No new evaluations will start when this cost is exceeded. Requires
217211
returning a cost in the evaluate_pipeline function, e.g.,
@@ -423,54 +417,24 @@ def __call__(
423417
runtime to run your optimizer.
424418
425419
""" # noqa: E501
426-
if (
427-
evaluations_to_spend is None
428-
and max_evaluations_total is None
429-
and max_evaluations_per_run is None
430-
and cost_to_spend is None
431-
and max_cost_total is None
432-
and fidelities_to_spend is None
433-
):
434-
warnings.warn(
435-
"None of the following were set, this will run idefinitely until the worker"
436-
" process is stopped."
437-
f"\n * {evaluations_to_spend=}"
438-
f"\n * {max_evaluations_per_run=}"
439-
f"\n * {cost_to_spend=}"
440-
f"\n * {fidelities_to_spend}",
441-
UserWarning,
442-
stacklevel=2,
443-
)
444-
445-
if max_evaluations_total is not None:
446-
warnings.warn(
447-
"`max_evaluations_total` is deprecated and will be removed in"
448-
" a future release. Please use `evaluations_to_spend` instead.",
449-
DeprecationWarning,
450-
stacklevel=2,
451-
)
452-
evaluations_to_spend = max_evaluations_total
453-
454-
if max_cost_total is not None:
455-
warnings.warn(
456-
"`max_cost_total` is deprecated and will be removed in a future release. "
457-
"Please use `cost_to_spend` instead.",
458-
DeprecationWarning,
459-
stacklevel=2,
420+
if max_evaluations_per_run is not None:
421+
raise ValueError(
422+
"`max_evaluations_per_run` is deprecated, please use "
423+
"`evaluations_to_spend` for limiting the number of evaluations for this run.",
460424
)
461-
cost_to_spend = max_cost_total
462425

463-
criteria = {
426+
controling_params = {
464427
"evaluations_to_spend": evaluations_to_spend,
465-
"max_evaluations_per_run": max_evaluations_per_run,
466428
"cost_to_spend": cost_to_spend,
467429
"fidelities_to_spend": fidelities_to_spend,
468430
}
469-
set_criteria = [k for k, v in criteria.items() if v is not None]
470-
if len(set_criteria) > 1:
471-
raise ValueError(
472-
f"Multiple stopping criteria specified: {', '.join(set_criteria)}. "
473-
"Only one is allowed."
431+
if all(x is None for x in controling_params.values()):
432+
warnings.warn(
433+
"None of the following were set, this will run idefinitely until the worker"
434+
" process is stopped."
435+
f"{', '.join(list(controling_params.keys()))}.",
436+
UserWarning,
437+
stacklevel=2,
474438
)
475439

476440
logger.info(f"Starting neps.run using root directory {root_directory}")
@@ -491,13 +455,7 @@ def __call__(
491455

492456
is_multi_fidelity = _optimizer_info["name"] in multi_fidelity_optimizers
493457

494-
if is_multi_fidelity:
495-
if evaluations_to_spend is not None:
496-
raise ValueError(
497-
"`evaluations_to_spend` is not allowed for multi-fidelity optimizers. "
498-
"Only `fidelities_to_spend` or `cost_to_spend`"
499-
)
500-
elif fidelities_to_spend is not None:
458+
if not is_multi_fidelity and fidelities_to_spend is not None:
501459
raise ValueError(
502460
"`fidelities_to_spend` is not allowed for non-multi-fidelity optimizers."
503461
)
@@ -527,7 +485,6 @@ def __call__(
527485
fidelities_to_spend=fidelities_to_spend,
528486
optimization_dir=Path(root_directory),
529487
evaluations_to_spend=evaluations_to_spend,
530-
max_evaluations_for_worker=max_evaluations_per_run,
531488
continue_until_max_evaluation_completed=continue_until_max_evaluation_completed,
532489
objective_value_on_error=objective_value_on_error,
533490
cost_value_on_error=cost_value_on_error,

neps/runtime.py

Lines changed: 26 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -181,15 +181,6 @@ class DefaultWorker:
181181
worker_id: str
182182
"""The id of the worker."""
183183

184-
worker_cumulative_eval_count: int = 0
185-
"""The number of evaluations done by this worker."""
186-
187-
worker_cumulative_eval_cost: float = 0.0
188-
"""The cost of the evaluations done by this worker."""
189-
190-
worker_cumulative_evaluation_time_seconds: float = 0.0
191-
"""The time spent evaluating configurations by this worker."""
192-
193184
_GRACE: ClassVar = FS_SYNC_GRACE_BASE
194185

195186
@classmethod
@@ -251,46 +242,13 @@ def _check_worker_local_settings(
251242
raise WorkerRaiseError(msg) from error_from_this_worker
252243
return msg
253244

254-
if (
255-
self.settings.max_evaluations_for_worker is not None
256-
and self.worker_cumulative_eval_count
257-
>= self.settings.max_evaluations_for_worker
258-
):
259-
return (
260-
"Worker has reached the maximum number of evaluations it is allowed to do"
261-
f" as given by `{self.settings.max_evaluations_for_worker=}`."
262-
"\nTo allow more evaluations, increase this value or use a different"
263-
" stopping criterion."
264-
)
265-
266-
if (
267-
self.settings.max_cost_for_worker is not None
268-
and self.worker_cumulative_eval_cost >= self.settings.max_cost_for_worker
269-
):
270-
return (
271-
"Worker has reached the maximum cost it is allowed to spend"
272-
f" which is given by `{self.settings.max_cost_for_worker=}`."
273-
f" This worker has spend '{self.worker_cumulative_eval_cost}'."
274-
"\n To allow more evaluations, increase this value or use a different"
275-
" stopping criterion."
276-
)
277-
278-
if self.settings.max_wallclock_time_for_worker_seconds is not None and (
245+
if self.settings.max_wallclock_time_seconds is not None and (
279246
time.monotonic() - time_monotonic_start
280-
>= self.settings.max_wallclock_time_for_worker_seconds
247+
>= self.settings.max_wallclock_time_seconds
281248
):
282249
return (
283250
"Worker has reached the maximum wallclock time it is allowed to spend"
284-
f", given by `{self.settings.max_wallclock_time_for_worker_seconds=}`."
285-
)
286-
287-
if self.settings.max_evaluation_time_for_worker_seconds is not None and (
288-
self.worker_cumulative_evaluation_time_seconds
289-
>= self.settings.max_evaluation_time_for_worker_seconds
290-
):
291-
return (
292-
"Worker has reached the maximum evaluation time it is allowed to spend"
293-
f", given by `{self.settings.max_evaluation_time_for_worker_seconds=}`."
251+
f", given by `{self.settings.max_wallclock_time_seconds=}`."
294252
)
295253

296254
return False
@@ -328,23 +286,30 @@ def _check_global_stopping_criterion(
328286
self,
329287
trials: Mapping[str, Trial],
330288
) -> str | Literal[False]:
289+
# worker related stopping criterion
290+
worker_trials = {
291+
_id: trial
292+
for _id, trial in trials.items()
293+
if trial.metadata.evaluating_worker_id == self.worker_id
294+
}
331295
if self.settings.evaluations_to_spend is not None:
332296
if self.settings.include_in_progress_evaluations_towards_maximum:
333297
count = sum(
334298
1
335-
for _, trial in trials.items()
336-
if trial.metadata.state
337-
not in (Trial.State.PENDING, Trial.State.SUBMITTED)
299+
for _, trial in worker_trials.items()
300+
if trial.metadata.state != Trial.State.PENDING
338301
)
339302
else:
340303
# This indicates they have completed.
341-
count = sum(1 for _, trial in trials.items() if trial.report is not None)
304+
count = sum(
305+
1 for _, trial in worker_trials.items() if trial.report is not None
306+
)
342307

343308
if count >= self.settings.evaluations_to_spend:
344309
return (
345-
"The total number of evaluations has reached the maximum allowed of"
346-
f" `{self.settings.evaluations_to_spend=}`."
347-
" To allow more evaluations, increase this value or use a different"
310+
"Worker has reached the maximum number of evaluations it is allowed"
311+
f" to do as given by `{self.settings.evaluations_to_spend=}`."
312+
"\nTo allow more evaluations, increase this value or use a different"
348313
" stopping criterion."
349314
)
350315

@@ -354,7 +319,7 @@ def _check_global_stopping_criterion(
354319
fidelity_name = next(iter(self.optimizer.space.fidelities.keys()))
355320
count = sum(
356321
trial.config[fidelity_name]
357-
for _, trial in trials.items()
322+
for _, trial in worker_trials.items()
358323
if trial.report is not None and trial.config[fidelity_name] is not None
359324
)
360325
if count >= self.settings.fidelities_to_spend:
@@ -368,20 +333,22 @@ def _check_global_stopping_criterion(
368333
if self.settings.cost_to_spend is not None:
369334
cost = sum(
370335
trial.report.cost
371-
for _, trial in trials.items()
336+
for _, trial in worker_trials.items()
372337
if trial.report is not None and trial.report.cost is not None
373338
)
374339
if cost >= self.settings.cost_to_spend:
375340
return (
376-
f"The maximum cost `{self.settings.cost_to_spend=}` has been"
377-
" reached by all of the evaluated trials. To allow more evaluations,"
378-
" increase this value or use a different stopping criterion."
341+
"Worker has reached the maximum cost it is allowed to spend"
342+
f" which is given by `{self.settings.cost_to_spend=}`."
343+
f" This worker has spend '{cost}'."
344+
"\n To allow more evaluations, increase this value or use a different"
345+
" stopping criterion."
379346
)
380347

381348
if self.settings.max_evaluation_time_total_seconds is not None:
382349
time_spent = sum(
383350
trial.report.evaluation_duration
384-
for _, trial in trials.items()
351+
for _, trial in worker_trials.items()
385352
if trial.report is not None
386353
if trial.report.evaluation_duration is not None
387354
)
@@ -658,13 +625,6 @@ def run(self) -> None: # noqa: C901, PLR0912, PLR0915
658625
evaluation_fn=self.evaluation_fn,
659626
default_report_values=self.settings.default_report_values,
660627
)
661-
evaluation_duration = evaluated_trial.metadata.evaluation_duration
662-
assert (evaluation_duration is not None) | (report is None)
663-
self.worker_cumulative_evaluation_time_seconds += (
664-
evaluation_duration if evaluation_duration else 0
665-
)
666-
667-
self.worker_cumulative_eval_count += 1
668628

669629
if report is None:
670630
logger.info(
@@ -681,9 +641,6 @@ def run(self) -> None: # noqa: C901, PLR0912, PLR0915
681641
evaluated_trial.metadata.state,
682642
)
683643

684-
if report.cost is not None:
685-
self.worker_cumulative_eval_cost += report.cost
686-
687644
if report.err is not None:
688645
logger.error(
689646
f"Error during evaluation of '{evaluated_trial.id}'"
@@ -960,7 +917,6 @@ def _launch_runtime( # noqa: PLR0913
960917
overwrite_optimization_dir: bool,
961918
evaluations_to_spend: int | None,
962919
fidelities_to_spend: int | float | None,
963-
max_evaluations_for_worker: int | None,
964920
sample_batch_size: int | None,
965921
worker_id: str | None = None,
966922
) -> None:
@@ -1034,11 +990,8 @@ def _launch_runtime( # noqa: PLR0913
1034990
not continue_until_max_evaluation_completed
1035991
),
1036992
cost_to_spend=cost_to_spend,
1037-
max_evaluations_for_worker=max_evaluations_for_worker,
1038993
max_evaluation_time_total_seconds=None, # TODO: User can't specify yet
1039-
max_wallclock_time_for_worker_seconds=None, # TODO: User can't specify yet
1040-
max_evaluation_time_for_worker_seconds=None, # TODO: User can't specify yet
1041-
max_cost_for_worker=None, # TODO: User can't specify yet
994+
max_wallclock_time_seconds=None, # TODO: User can't specify yet
1042995
)
1043996

1044997
# HACK: Due to nfs file-systems, locking with the default `flock()` is not reliable.

0 commit comments

Comments
 (0)