Commit 9f4b855
[Fix] Refactor development reproducibility (#172)
* [Fix] pass random state to randomized algorithms
* [Fix] double instantiation of random state
* [Fix] flaky sample-configuration test
* [Fix] runtime warning
* [Fix] hardcoded budget
* [Fix] flake
* [Fix] try forked
* [Fix] try forked
* [Fix] budget
* [Fix] missing random_state in trainer
* [Fix] overwrite of random_state
* [Fix] fix seed in splits
* [Rebase]
* [Fix] update CV score after split-number change
* [Fix] CV split
1 parent fae72a4 commit 9f4b855

26 files changed: +225 −127 lines

.github/workflows/pytest.yml
Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ jobs:
       - name: Run tests
         run: |
           if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autoPyTorch --cov-report=xml'; fi
-          python -m pytest --durations=20 --timeout=600 --timeout-method=signal -v $codecov test
+          python -m pytest --forked --durations=20 --timeout=600 --timeout-method=signal -v $codecov test
       - name: Check for files left behind by test
         if: ${{ always() }}
         run: |
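Note: --forked comes from the pytest-forked plugin, which runs every test in its own forked subprocess so that interpreter-wide state (global seeds, environment tweaks) cannot leak between tests. A minimal sketch of the leakage this guards against, with hypothetical test names:

import numpy as np

def test_a():
    np.random.seed(0)  # mutates interpreter-wide state
    assert np.random.rand() < 1.0

def test_b():
    # Without process isolation this draw depends on whether test_a ran
    # first; with --forked each test starts from a fresh process.
    assert 0.0 <= np.random.rand() <= 1.0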

autoPyTorch/api/base_task.py
Lines changed: 2 additions & 1 deletion

@@ -1235,7 +1235,8 @@ def __del__(self) -> None:
         # When a multiprocessing work is done, the
         # objects are deleted. We don't want to delete run areas
         # until the estimator is deleted
-        self._backend.context.delete_directories(force=False)
+        if hasattr(self, '_backend'):
+            self._backend.context.delete_directories(force=False)

     @typing.no_type_check
     def get_incumbent_results(
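Note: __del__ runs even when __init__ aborts before all attributes are assigned, so touching self._backend unguarded can raise AttributeError during garbage collection. A standalone sketch of the failure mode (not autoPyTorch code):

class Estimator:
    def __init__(self, fail: bool = False):
        if fail:
            raise ValueError("aborted before _backend was assigned")
        self._backend = object()

    def __del__(self):
        # Without the hasattr guard this would raise AttributeError
        # whenever __init__ failed early.
        if hasattr(self, '_backend'):
            pass  # self._backend cleanup would go here

try:
    Estimator(fail=True)
except ValueError:
    pass  # the half-built instance is still collected and __del__ still runs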

autoPyTorch/datasets/base_dataset.py
Lines changed: 5 additions & 4 deletions

@@ -118,7 +118,7 @@ def __init__(
         self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors
         self.cross_validators: Dict[str, CrossValFunc] = {}
         self.holdout_validators: Dict[str, HoldOutFunc] = {}
-        self.rng = np.random.RandomState(seed=seed)
+        self.random_state = np.random.RandomState(seed=seed)
         self.shuffle = shuffle
         self.resampling_strategy = resampling_strategy
         self.resampling_strategy_args = resampling_strategy_args
@@ -205,7 +205,7 @@ def __len__(self) -> int:
         return self.train_tensors[0].shape[0]

     def _get_indices(self) -> np.ndarray:
-        return self.rng.permutation(len(self)) if self.shuffle else np.arange(len(self))
+        return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self))

     def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]:
         """
@@ -271,7 +271,7 @@ def create_cross_val_splits(
             # we need additional information about the data for stratification
             kwargs["stratify"] = self.train_tensors[-1]
         splits = self.cross_validators[cross_val_type.name](
-            num_splits, self._get_indices(), **kwargs)
+            self.random_state, num_splits, self._get_indices(), **kwargs)
         return splits

     def create_holdout_val_split(
@@ -305,7 +305,8 @@ def create_holdout_val_split(
         if holdout_val_type.is_stratified():
             # we need additional information about the data for stratification
             kwargs["stratify"] = self.train_tensors[-1]
-        train, val = self.holdout_validators[holdout_val_type.name](val_share, self._get_indices(), **kwargs)
+        train, val = self.holdout_validators[holdout_val_type.name](
+            self.random_state, val_share, self._get_indices(), **kwargs)
         return train, val

     def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]:
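Note: the renamed attribute holds one seeded np.random.RandomState shared by the shuffle and every split function, which makes the index permutation repeatable across runs with the same seed. A quick check of that property:

import numpy as np

a = np.random.RandomState(seed=42).permutation(10)
b = np.random.RandomState(seed=42).permutation(10)
assert (a == b).all()  # same seed, same shuffle, on every run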

autoPyTorch/datasets/resampling_strategy.py
Lines changed: 34 additions & 19 deletions

@@ -18,14 +18,16 @@
 # Use callback protocol as workaround, since callable with function fields count 'self' as argument
 class CrossValFunc(Protocol):
     def __call__(self,
+                 random_state: np.random.RandomState,
                  num_splits: int,
                  indices: np.ndarray,
                  stratify: Optional[Any]) -> List[Tuple[np.ndarray, np.ndarray]]:
         ...


 class HoldOutFunc(Protocol):
-    def __call__(self, val_share: float, indices: np.ndarray, stratify: Optional[Any]
+    def __call__(self, random_state: np.random.RandomState, val_share: float,
+                 indices: np.ndarray, stratify: Optional[Any]
                  ) -> Tuple[np.ndarray, np.ndarray]:
         ...

@@ -85,35 +87,42 @@ def is_stratified(self) -> bool:
         'val_share': 0.33,
     },
     CrossValTypes.k_fold_cross_validation: {
-        'num_splits': 3,
+        'num_splits': 5,
     },
     CrossValTypes.stratified_k_fold_cross_validation: {
-        'num_splits': 3,
+        'num_splits': 5,
     },
     CrossValTypes.shuffle_split_cross_validation: {
-        'num_splits': 3,
+        'num_splits': 5,
     },
     CrossValTypes.time_series_cross_validation: {
-        'num_splits': 3,
+        'num_splits': 5,
     },
 }  # type: Dict[Union[HoldoutValTypes, CrossValTypes], Dict[str, Any]]


 class HoldOutFuncs():
     @staticmethod
-    def holdout_validation(val_share: float,
+    def holdout_validation(random_state: np.random.RandomState,
+                           val_share: float,
                            indices: np.ndarray,
                            **kwargs: Any
                            ) -> Tuple[np.ndarray, np.ndarray]:
-        train, val = train_test_split(indices, test_size=val_share, shuffle=False)
+        shuffle = kwargs.get('shuffle', True)
+        train, val = train_test_split(indices, test_size=val_share,
+                                      shuffle=shuffle,
+                                      random_state=random_state if shuffle else None,
+                                      )
         return train, val

     @staticmethod
-    def stratified_holdout_validation(val_share: float,
+    def stratified_holdout_validation(random_state: np.random.RandomState,
+                                      val_share: float,
                                       indices: np.ndarray,
                                       **kwargs: Any
                                       ) -> Tuple[np.ndarray, np.ndarray]:
-        train, val = train_test_split(indices, test_size=val_share, shuffle=True, stratify=kwargs["stratify"])
+        train, val = train_test_split(indices, test_size=val_share, shuffle=True, stratify=kwargs["stratify"],
+                                      random_state=random_state)
         return train, val

     @classmethod
@@ -128,34 +137,38 @@ def get_holdout_validators(cls, *holdout_val_types: HoldoutValTypes) -> Dict[str

 class CrossValFuncs():
     @staticmethod
-    def shuffle_split_cross_validation(num_splits: int,
+    def shuffle_split_cross_validation(random_state: np.random.RandomState,
+                                       num_splits: int,
                                        indices: np.ndarray,
                                        **kwargs: Any
                                        ) -> List[Tuple[np.ndarray, np.ndarray]]:
-        cv = ShuffleSplit(n_splits=num_splits)
+        cv = ShuffleSplit(n_splits=num_splits, random_state=random_state)
         splits = list(cv.split(indices))
         return splits

     @staticmethod
-    def stratified_shuffle_split_cross_validation(num_splits: int,
+    def stratified_shuffle_split_cross_validation(random_state: np.random.RandomState,
+                                                  num_splits: int,
                                                   indices: np.ndarray,
                                                   **kwargs: Any
                                                   ) -> List[Tuple[np.ndarray, np.ndarray]]:
-        cv = StratifiedShuffleSplit(n_splits=num_splits)
+        cv = StratifiedShuffleSplit(n_splits=num_splits, random_state=random_state)
         splits = list(cv.split(indices, kwargs["stratify"]))
         return splits

     @staticmethod
-    def stratified_k_fold_cross_validation(num_splits: int,
+    def stratified_k_fold_cross_validation(random_state: np.random.RandomState,
+                                           num_splits: int,
                                            indices: np.ndarray,
                                            **kwargs: Any
                                            ) -> List[Tuple[np.ndarray, np.ndarray]]:
-        cv = StratifiedKFold(n_splits=num_splits)
+        cv = StratifiedKFold(n_splits=num_splits, random_state=random_state)
         splits = list(cv.split(indices, kwargs["stratify"]))
         return splits

     @staticmethod
-    def k_fold_cross_validation(num_splits: int,
+    def k_fold_cross_validation(random_state: np.random.RandomState,
+                                num_splits: int,
                                 indices: np.ndarray,
                                 **kwargs: Any
                                 ) -> List[Tuple[np.ndarray, np.ndarray]]:
@@ -169,12 +182,14 @@ def k_fold_cross_validation(num_splits: int,
         Returns:
             splits (List[Tuple[List, List]]): list of tuples of training and validation indices
         """
-        cv = KFold(n_splits=num_splits)
+        shuffle = kwargs.get('shuffle', True)
+        cv = KFold(n_splits=num_splits, random_state=random_state if shuffle else None, shuffle=shuffle)
         splits = list(cv.split(indices))
         return splits

     @staticmethod
-    def time_series_cross_validation(num_splits: int,
+    def time_series_cross_validation(random_state: np.random.RandomState,
+                                     num_splits: int,
                                      indices: np.ndarray,
                                      **kwargs: Any
                                      ) -> List[Tuple[np.ndarray, np.ndarray]]:
@@ -196,7 +211,7 @@ def time_series_cross_validation(num_splits: int,
             ([0, 1, 2], [3])]

         """
-        cv = TimeSeriesSplit(n_splits=num_splits)
+        cv = TimeSeriesSplit(n_splits=num_splits, random_state=random_state)
         splits = list(cv.split(indices))
         return splits
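Note: scikit-learn splitters only consume random_state when they shuffle, which is why the diff passes random_state if shuffle else None to train_test_split and KFold. A sketch of the reproducibility this buys:

import numpy as np
from sklearn.model_selection import KFold

indices = np.arange(20)
# shuffle=True is required for KFold to make use of a random_state
folds_a = list(KFold(n_splits=5, shuffle=True, random_state=1).split(indices))
folds_b = list(KFold(n_splits=5, shuffle=True, random_state=1).split(indices))
for (_, val_a), (_, val_b) in zip(folds_a, folds_b):
    assert (val_a == val_b).all()  # identical folds on every run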

autoPyTorch/evaluation/abstract_evaluator.py
Lines changed: 2 additions & 1 deletion

@@ -80,7 +80,8 @@ def __init__(self, config: str,
         self.random_state = random_state
         self.init_params = init_params
         self.pipeline = autoPyTorch.pipeline.traditional_tabular_classification.\
-            TraditionalTabularClassificationPipeline(dataset_properties=dataset_properties)
+            TraditionalTabularClassificationPipeline(dataset_properties=dataset_properties,
+                                                     random_state=self.random_state)
         configuration_space = self.pipeline.get_hyperparameter_search_space()
         default_configuration = configuration_space.get_default_configuration().get_dictionary()
         default_configuration['model_trainer:tabular_classifier:classifier'] = config

autoPyTorch/evaluation/tae.py
Lines changed: 9 additions & 2 deletions

@@ -4,6 +4,7 @@
 import logging
 import math
 import multiprocessing
+import os
 import time
 import traceback
 import typing
@@ -25,6 +26,7 @@
 from autoPyTorch.evaluation.utils import empty_queue, extract_learning_curve, read_queue
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
 from autoPyTorch.utils.backend import Backend
+from autoPyTorch.utils.common import replace_string_bool_to_bool
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
 from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger

@@ -144,7 +146,12 @@ def __init__(
         self.exclude = exclude
         self.disable_file_output = disable_file_output
         self.init_params = init_params
-        self.pipeline_config = pipeline_config
+        self.pipeline_config: typing.Dict[str, typing.Union[int, str, float]] = dict()
+        if pipeline_config is None:
+            pipeline_config = replace_string_bool_to_bool(json.load(open(
+                os.path.join(os.path.dirname(__file__), '../configs/default_pipeline_options.json'))))
+        self.pipeline_config.update(pipeline_config)
+
         self.budget_type = pipeline_config['budget_type'] if pipeline_config is not None else budget_type
         self.logger_port = logger_port
         if self.logger_port is None:
@@ -199,7 +206,7 @@ def run_wrapper(
             )
         else:
             if run_info.budget == 0:
-                run_info = run_info._replace(budget=100.0)
+                run_info = run_info._replace(budget=self.pipeline_config[self.budget_type])
             elif run_info.budget <= 0 or run_info.budget > 100:
                 raise ValueError('Illegal value for budget, must be >0 and <=100, but is %f' %
                                  run_info.budget)
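Note: with this change the evaluator falls back to the packaged default options when no pipeline_config is given, and a zero budget is replaced by the default for the configured budget_type instead of a hardcoded 100.0. A hedged stand-in for the helper involved (the real one lives in autoPyTorch.utils.common, and the JSON keys below are illustrative):

import json

def replace_string_bool_to_bool(config):
    # Illustrative stand-in: JSON configs may store booleans as "True"/"False" strings.
    return {k: v.lower() == 'true' if isinstance(v, str) and v.lower() in ('true', 'false') else v
            for k, v in config.items()}

defaults = json.loads('{"budget_type": "epochs", "epochs": 50, "use_tensorboard_logger": "True"}')
assert replace_string_bool_to_bool(defaults)['use_tensorboard_logger'] is True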

autoPyTorch/pipeline/base_pipeline.py
Lines changed: 4 additions & 4 deletions

@@ -70,6 +70,10 @@ def __init__(
         self.include = include if include is not None else {}
         self.exclude = exclude if exclude is not None else {}
         self.search_space_updates = search_space_updates
+        if random_state is None:
+            self.random_state = check_random_state(1)
+        else:
+            self.random_state = check_random_state(random_state)

         if steps is None:
             self.steps = self._get_pipeline_steps(dataset_properties)
@@ -98,10 +102,6 @@ def __init__(

         self.set_hyperparameters(self.config, init_params=init_params)

-        if random_state is None:
-            self.random_state = check_random_state(1)
-        else:
-            self.random_state = check_random_state(random_state)
         super().__init__(steps=self.steps)

         self._additional_run_info = {}  # type: Dict[str, str]
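Note: the block moves above step construction because the steps, and the components they instantiate, need self.random_state to exist already; assigning it afterwards left those components seeded by their own defaults. A minimal sketch of the ordering hazard, with hypothetical names:

from sklearn.utils import check_random_state

class SketchPipeline:
    def __init__(self, random_state=None):
        # Must resolve the generator *before* building steps ...
        self.random_state = check_random_state(random_state)
        self.steps = self._get_pipeline_steps()

    def _get_pipeline_steps(self):
        # ... because each component captures the pipeline-level generator here.
        return [('component', {'random_state': self.random_state})]

SketchPipeline(random_state=1)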

autoPyTorch/pipeline/components/base_component.py
Lines changed: 8 additions & 1 deletion

@@ -8,7 +8,10 @@

 from ConfigSpace.configuration_space import Configuration, ConfigurationSpace

+import numpy as np
+
 from sklearn.base import BaseEstimator
+from sklearn.utils import check_random_state

 from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdate
@@ -93,8 +96,12 @@ def add_component(self, obj: BaseEstimator) -> None:
 class autoPyTorchComponent(BaseEstimator):
     _required_properties: Optional[List[str]] = None

-    def __init__(self) -> None:
+    def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None:
         super().__init__()
+        if random_state is None:
+            self.random_state = check_random_state(1)
+        else:
+            self.random_state = check_random_state(random_state)
         self._fit_requirements: List[FitRequirement] = list()
         self._cs_updates: Dict[str, HyperparameterSearchSpaceUpdate] = dict()
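Note: sklearn.utils.check_random_state normalizes whatever the caller passed (None, an int seed, or an existing generator) into a np.random.RandomState, so every component ends up with a usable, shareable generator:

import numpy as np
from sklearn.utils import check_random_state

assert isinstance(check_random_state(1), np.random.RandomState)     # int seed -> new generator
assert isinstance(check_random_state(None), np.random.RandomState)  # None -> global generator
rs = np.random.RandomState(7)
assert check_random_state(rs) is rs                                 # generators pass through unchanged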

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py
Lines changed: 2 additions & 1 deletion

@@ -26,7 +26,8 @@ def __init__(self, target_dim: int = 128,

     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

-        self.preprocessor['numerical'] = sklearn.decomposition.TruncatedSVD(self.target_dim, algorithm="randomized")
+        self.preprocessor['numerical'] = sklearn.decomposition.TruncatedSVD(self.target_dim, algorithm="randomized",
+                                                                            random_state=self.random_state)

         return self
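Note: the "randomized" SVD solver is stochastic, so two fits of the same data can yield different components unless seeded. Pinning random_state makes the transform repeatable:

import numpy as np
from sklearn.decomposition import TruncatedSVD

X = np.random.RandomState(0).rand(50, 20)
c1 = TruncatedSVD(n_components=5, algorithm="randomized", random_state=3).fit(X).components_
c2 = TruncatedSVD(n_components=5, algorithm="randomized", random_state=3).fit(X).components_
assert np.allclose(c1, c2)  # seeded solver, identical components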

autoPyTorch/pipeline/components/setup/network_backbone/utils.py
Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,3 @@
-import random
 import typing
 import warnings

@@ -120,7 +119,8 @@ def shake_drop_get_bl(
     pl = 1 - ((block_index + 1) / num_blocks) * (1 - min_prob_no_shake)

     if not is_training:
-        bl = torch.tensor(1.0) if random.random() <= pl else torch.tensor(0.0)
+        # Move to torch.randn(1) for reproducibility
+        bl = torch.tensor(1.0) if torch.randn(1) <= pl else torch.tensor(0.0)
     if is_training:
         bl = torch.tensor(pl)
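Note: the stdlib random module is untouched by torch.manual_seed, so the old draw escaped autoPyTorch's torch-level seeding; sampling through torch puts shake-drop under the same seed as the rest of training. The property being relied on:

import torch

torch.manual_seed(42)
a = torch.randn(1)
torch.manual_seed(42)
b = torch.randn(1)
assert torch.equal(a, b)  # torch draws repeat under torch.manual_seed; random.random() would not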

autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py
Lines changed: 3 additions & 1 deletion

@@ -74,6 +74,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent:

         # instantiate model
         self.model = self.build_model(input_shape=input_shape,
+                                      logger_port=X['logger_port'],
                                       output_shape=output_shape)

         # train model
@@ -91,7 +92,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent:
         return self

     @abstractmethod
-    def build_model(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...]) -> BaseClassifier:
+    def build_model(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...],
+                    logger_port: int) -> BaseClassifier:
         """
         This method returns a pytorch model, that is dynamically built using
         a self.config that is model specific, and contains the additional
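Note: since build_model is abstract, every concrete model component must adopt the new logger_port parameter; fit() now threads X['logger_port'] through so the built classifier can log to the main process. A hypothetical subclass sketch (names are illustrative, not autoPyTorch code):

from typing import Tuple

class SketchClassifier:
    def __init__(self, logger_port: int):
        self.logger_port = logger_port  # stand-in for a BaseClassifier subclass

class SketchModelComponent:
    def build_model(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...],
                    logger_port: int) -> SketchClassifier:
        # Forward the port so the classifier logs to the named TCP logger.
        return SketchClassifier(logger_port=logger_port)

assert SketchModelComponent().build_model((10,), (2,), logger_port=9020).logger_port == 9020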

autoPyTorch/pipeline/components/setup/traditional_ml/classifier_models/base_classifier.py
Lines changed: 18 additions & 16 deletions

@@ -1,24 +1,37 @@
 import json
-import logging
+import logging.handlers
 import os as os
 from abc import abstractmethod
 from typing import Any, Dict, List, Optional

 import numpy as np

+from sklearn.utils import check_random_state
+
 from autoPyTorch.metrics import accuracy
+from autoPyTorch.utils.logging_ import get_named_client_logger


-class BaseClassifier():
+class BaseClassifier:
     """
     Base class for classifiers.
     """

-    def __init__(self, name: str = ''):
-
-        self.configure_logging()
+    def __init__(self, logger_port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
+                 random_state: Optional[np.random.RandomState] = None, name: str = ''):

         self.name = name
+        self.logger_port = logger_port
+        self.logger = get_named_client_logger(
+            name=name,
+            host='localhost',
+            port=logger_port,
+        )
+
+        if random_state is None:
+            self.random_state = check_random_state(1)
+        else:
+            self.random_state = check_random_state(random_state)
         self.config = self.get_config()

         self.categoricals: np.ndarray = np.array(())
@@ -28,17 +41,6 @@ def __init__(self, name: str = ''):

         self.metric = accuracy

-    def configure_logging(self) -> None:
-        """
-        Setup self.logger
-        """
-        self.logger = logging.getLogger(__name__)
-        self.logger.setLevel(logging.INFO)
-
-        ch = logging.StreamHandler()
-        ch.setLevel(logging.INFO)
-        self.logger.addHandler(ch)
-
     def get_config(self) -> Dict[str, Any]:
         """
         Load the parameters for the classifier model from ../classifier_configs/modelname.json.
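Note: swapping the ad-hoc StreamHandler for get_named_client_logger sends records from worker processes to a central TCP listener (default port logging.handlers.DEFAULT_TCP_LOGGING_PORT, i.e. 9020), matching how the rest of autoPyTorch logs. Usage as the diff wires it, assuming a listener is already running on that port:

import logging.handlers

from autoPyTorch.utils.logging_ import get_named_client_logger

logger = get_named_client_logger(
    name='my_classifier',
    host='localhost',
    port=logging.handlers.DEFAULT_TCP_LOGGING_PORT,
)
logger.info('records now flow to the central log listener')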
