
Commit 55c01b7

Create fit evaluator, no resampling strategy and fix bug for test statistics

Fix mypy and flake errors
Fix the check for X_test while making the test data loader
Fix bug in Lookahead hyperparameters where "Lookahead" was repeated in the hyperparameter name
Make passing tests in the API easier
Fix bug in the trainer's weighted-loss code for regression
1 parent 613eb62 commit 55c01b7

16 files changed: +338 / -14 lines

autoPyTorch/api/base_task.py

Lines changed: 1 addition & 0 deletions
@@ -1502,6 +1502,7 @@ def fit_pipeline(
             (BaseDataset):
                 Dataset created from the given tensors
         """
+        self.dataset_name = dataset.dataset_name
 
         if dataset is None:
             if (
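Note: the added assignment dereferences `dataset` one line before the existing `if dataset is None:` guard, so a `None` dataset would now raise `AttributeError` before reaching that guard. A minimal sketch of the guarded ordering, purely illustrative (the commit itself assigns unconditionally; `BaseDataset` is the type from the diff):

from typing import Optional

def resolve_dataset_name(dataset: Optional["BaseDataset"]) -> Optional[str]:
    # Sketch: read the name only when a dataset was actually passed,
    # mirroring the None-check that already follows in fit_pipeline.
    return dataset.dataset_name if dataset is not None else None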

autoPyTorch/api/tabular_classification.py

Lines changed: 8 additions & 0 deletions
@@ -91,9 +91,17 @@ def __init__(
         output_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
         delete_output_folder_after_terminate: bool = True,
+<<<<<<< HEAD
         include_components: Optional[Dict[str, Any]] = None,
         exclude_components: Optional[Dict[str, Any]] = None,
         resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
+=======
+        include_components: Optional[Dict] = None,
+        exclude_components: Optional[Dict] = None,
+        resampling_strategy: Union[CrossValTypes,
+                                   HoldoutValTypes,
+                                   NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         backend: Optional[Backend] = None,
         search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None

autoPyTorch/api/tabular_regression.py

Lines changed: 8 additions & 0 deletions
@@ -92,9 +92,17 @@ def __init__(
         output_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
         delete_output_folder_after_terminate: bool = True,
+<<<<<<< HEAD
         include_components: Optional[Dict[str, Any]] = None,
         exclude_components: Optional[Dict[str, Any]] = None,
         resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
+=======
+        include_components: Optional[Dict] = None,
+        exclude_components: Optional[Dict] = None,
+        resampling_strategy: Union[CrossValTypes,
+                                   HoldoutValTypes,
+                                   NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         backend: Optional[Backend] = None,
         search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
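Note: once the conflict markers are resolved, either side of the signature has the same practical effect: the tabular API classes now accept a no-resampling strategy. A hedged usage sketch (class and enum names as they appear in the diff; all other constructor arguments left at their defaults):

from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes

# Sketch: fit pipelines on all of the training data, holding out no
# validation split (the default remains HoldoutValTypes.holdout_validation).
api = TabularClassificationTask(
    resampling_strategy=NoResamplingStrategyTypes.no_resampling,
)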

autoPyTorch/datasets/base_dataset.py

Lines changed: 34 additions & 0 deletions
@@ -112,7 +112,13 @@ def __init__(
         dataset_name: Optional[str] = None,
         val_tensors: Optional[BaseDatasetInputType] = None,
         test_tensors: Optional[BaseDatasetInputType] = None,
+<<<<<<< HEAD
         resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
+=======
+        resampling_strategy: Union[CrossValTypes,
+                                   HoldoutValTypes,
+                                   NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         shuffle: Optional[bool] = True,
         seed: Optional[int] = 42,
@@ -129,7 +135,12 @@ def __init__(
                 validation data
             test_tensors (An optional tuple of objects that have a __len__ and a __getitem__ attribute):
                 test data
+<<<<<<< HEAD
             resampling_strategy (RESAMPLING_STRATEGIES: default=HoldoutValTypes.holdout_validation):
+=======
+            resampling_strategy (Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]),
+                (default=HoldoutValTypes.holdout_validation):
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
                 strategy to split the training data.
             resampling_strategy_args (Optional[Dict[str, Any]]): arguments
                 required for the chosen resampling strategy. If None, uses
@@ -151,10 +162,17 @@ def __init__(
         if not hasattr(train_tensors[0], 'shape'):
             type_check(train_tensors, val_tensors)
         self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors
+<<<<<<< HEAD
         self.cross_validators: Dict[str, CrossValFunc] = {}
         self.holdout_validators: Dict[str, HoldOutFunc] = {}
         self.no_resampling_validators: Dict[str, NoResamplingFunc] = {}
         self.random_state = np.random.RandomState(seed=seed)
+=======
+        self.cross_validators: Dict[str, CROSS_VAL_FN] = {}
+        self.holdout_validators: Dict[str, HOLDOUT_FN] = {}
+        self.no_resampling_validators: Dict[str, NO_RESAMPLING_FN] = {}
+        self.rng = np.random.RandomState(seed=seed)
+>>>>>>> Fix mypy and flake
         self.shuffle = shuffle
         self.resampling_strategy = resampling_strategy
         self.resampling_strategy_args = resampling_strategy_args
@@ -171,7 +189,11 @@ def __init__(
         # Make sure cross validation splits are created once
         self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes)
         self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes)
+<<<<<<< HEAD
         self.no_resampling_validators = NoResamplingFuncs.get_no_resampling_validators(*NoResamplingStrategyTypes)
+=======
+        self.no_resampling_validators = get_no_resampling_validators(*NoResamplingStrategyTypes)
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
 
         self.splits = self.get_splits_from_resampling_strategy()
 
@@ -272,8 +294,12 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[
                 )
             )
         elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes):
+<<<<<<< HEAD
             splits.append((self.no_resampling_validators[self.resampling_strategy.name](self.random_state,
                                                                                         self._get_indices()), None))
+=======
+            splits.append((self.no_resampling_validators[self.resampling_strategy.name](self._get_indices()), None))
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
         else:
             raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}")
         return splits
@@ -345,7 +371,11 @@ def create_holdout_val_split(
             self.random_state, val_share, self._get_indices(), **kwargs)
         return train, val
 
+<<<<<<< HEAD
     def get_dataset(self, split_id: int, train: bool) -> Dataset:
+=======
+    def get_dataset_for_training(self, split_id: int, train: bool) -> Dataset:
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
         """
         The above split methods employ the Subset to internally subsample the whole dataset.
@@ -360,6 +390,7 @@ def get_dataset(self, split_id: int, train: bool) -> Dataset:
             Dataset: the reduced dataset to be used for testing
         """
         # Subset creates a dataset. Splits is a (train_indices, test_indices) tuple
+<<<<<<< HEAD
         if split_id >= len(self.splits):  # old version: split_id > len(self.splits)
             raise IndexError(f"self.splits index out of range, got split_id={split_id}"
                              f" (>= num_splits={len(self.splits)})")
@@ -368,6 +399,9 @@ def get_dataset(self, split_id: int, train: bool) -> Dataset:
             raise ValueError("Specified fold (or subset) does not exist")
 
         return TransformSubset(self, indices, train=train)
+=======
+        return TransformSubset(self, self.splits[split_id][0], train=train)
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
 
     def replace_data(self, X_train: BaseDatasetInputType,
                      X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset':
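Note: whichever side of the conflict survives, the no-resampling branch appends a split whose validation half is `None`, unlike the holdout and cross-validation branches. A small sketch of the contract downstream code has to honour:

import numpy as np

# Sketch: shape of the split produced by the NoResamplingStrategyTypes branch.
indices = np.arange(100)
splits = [(indices.tolist(), None)]  # (train_indices, val_indices)

train_indices, val_indices = splits[0]
assert val_indices is None  # consumers must tolerate a missing val split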

autoPyTorch/datasets/resampling_strategy.py

Lines changed: 14 additions & 0 deletions
@@ -39,6 +39,11 @@ def __call__(self, random_state: np.random.RandomState, val_share: float,
         ...
 
 
+class NO_RESAMPLING_FN(Protocol):
+    def __call__(self, indices: np.ndarray) -> np.ndarray:
+        ...
+
+
 class CrossValTypes(IntEnum):
     """The type of cross validation
 
@@ -85,13 +90,22 @@ def is_stratified(self) -> bool:
 
 class NoResamplingStrategyTypes(IntEnum):
     no_resampling = 8
+<<<<<<< HEAD
 
     def is_stratified(self) -> bool:
         return False
 
 
 # TODO: replace it with another way
 ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]
+=======
+    shuffle_no_resampling = 9
+
+
+# TODO: replace it with another way
+RESAMPLING_STRATEGIES = [CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]
+
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
 
 DEFAULT_RESAMPLING_PARAMETERS: Dict[
     ResamplingStrategies,
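Note: the new `NO_RESAMPLING_FN` protocol takes only `indices`, while the HEAD-side call in `base_dataset.py` also passes `self.random_state`. A sketch of validators matching the HEAD-side convention; the shuffled variant is an assumption inferred from the `shuffle_no_resampling = 9` member in the other branch:

import numpy as np

def no_resampling(random_state: np.random.RandomState,
                  indices: np.ndarray) -> np.ndarray:
    # "No resampling": every index is training data; nothing is held out.
    return indices

def shuffle_no_resampling(random_state: np.random.RandomState,
                          indices: np.ndarray) -> np.ndarray:
    # Hypothetical shuffled variant: permute in place before returning.
    random_state.shuffle(indices)
    return indices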

autoPyTorch/evaluation/abstract_evaluator.py

Lines changed: 6 additions & 3 deletions
@@ -686,9 +686,9 @@ def _loss(self, y_true: np.ndarray, y_hat: np.ndarray) -> Dict[str, float]:
             y_true, y_hat, self.task_type, metrics)
 
     def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float],
-                  opt_pred: np.ndarray, valid_pred: Optional[np.ndarray],
-                  test_pred: Optional[np.ndarray], additional_run_info: Optional[Dict],
-                  file_output: bool, status: StatusType
+                  valid_pred: Optional[np.ndarray], test_pred: Optional[np.ndarray],
+                  additional_run_info: Optional[Dict], file_output: bool, status: StatusType,
+                  opt_pred: Optional[np.ndarray],
                   ) -> Optional[Tuple[float, float, int, Dict]]:
         """This function does everything necessary after the fitting is done:
@@ -730,6 +730,9 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float],
                 Additional run information, like train/test loss
         """
 
+        assert opt_pred is not None, "Cases where 'opt_pred' is None should be handled " \
+                                     "specifically with special child classes"
+
         self.duration = time.time() - self.starttime
 
         if file_output:
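Note: moving `opt_pred` from the third positional slot to the end silently re-binds any positional caller. A toy model of the hazard (parameter names follow the diff; the body is illustrative only):

from typing import List, Optional

def finish_up_new(loss: float, train_loss: float,
                  valid_pred: Optional[List[float]],
                  test_pred: Optional[List[float]],
                  opt_pred: Optional[List[float]] = None) -> List[float]:
    # mirrors the new assert: None is only valid for special child classes
    assert opt_pred is not None, "None opt_pred handled by special child classes"
    return opt_pred

# A caller written against the old order (loss, train_loss, opt_pred, ...)
# would now feed opt_pred into valid_pred; keyword arguments stay correct.
finish_up_new(0.1, 0.2, valid_pred=None, test_pred=None, opt_pred=[0.9])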

autoPyTorch/evaluation/train_evaluator.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,11 @@ def __init__(self, backend: Backend, queue: Queue,
151151
pipeline_config=pipeline_config,
152152
search_space_updates=search_space_updates
153153
)
154+
assert isinstance(self.datamanager.resampling_strategy, (CrossValTypes, HoldoutValTypes)),\
155+
"This Evaluator is used for HPO Search. " \
156+
"Val Split is required for HPO search. " \
157+
"Expected 'self.resampling_strategy' in" \
158+
" '(CrossValTypes, HoldoutValTypes)' got {}".format(self.datamanager.resampling_strategy)
154159

155160
if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)):
156161
raise ValueError(
@@ -408,7 +413,11 @@ def _predict(self, pipeline: BaseEstimator,
408413

409414

410415
# create closure for evaluating an algorithm
416+
<<<<<<< HEAD
411417
def eval_train_function(
418+
=======
419+
def eval_function(
420+
>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
412421
backend: Backend,
413422
queue: Queue,
414423
metric: autoPyTorchMetric,

autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py

Lines changed: 12 additions & 3 deletions
@@ -59,15 +59,15 @@ def __init__(self, batch_size: int = 64,
                 FitRequirement("Backend", (Backend,), user_defined=True, dataset_property=False),
                 FitRequirement("is_small_preprocess", (bool,), user_defined=True, dataset_property=True)])
 
-    def transform(self, X: np.ndarray) -> np.ndarray:
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         """The transform function calls the transform function of the
         underlying model and returns the transformed array.
 
         Args:
-            X (np.ndarray): input features
+            X (Dict[str, Any]): 'X' dictionary
 
         Returns:
-            np.ndarray: Transformed features
+            (Dict[str, Any]): the updated 'X' dictionary
         """
         X.update({'train_data_loader': self.train_data_loader,
                   'val_data_loader': self.val_data_loader,
@@ -107,7 +107,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader:
         # Overwrite the datamanager with the pre-processed data
         datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None)
 
+<<<<<<< HEAD
         train_dataset = datamanager.get_dataset(split_id=X['split_id'], train=True)
+=======
+        train_dataset = datamanager.get_dataset_for_training(split_id=X['split_id'], train=True)
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
 
         self.train_data_loader = torch.utils.data.DataLoader(
             train_dataset,
@@ -119,8 +123,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader:
             collate_fn=custom_collate_fn,
         )
 
+<<<<<<< HEAD
         if X.get('val_indices', None) is not None:
             val_dataset = datamanager.get_dataset(split_id=X['split_id'], train=False)
+=======
+        if X['val_indices'] is not None:
+            val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id'], train=False)
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
             self.val_data_loader = torch.utils.data.DataLoader(
                 val_dataset,
                 batch_size=min(self.batch_size, len(val_dataset)),
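Note: the two sides differ in how they probe for a validation split. The HEAD spelling tolerates a missing key, which is exactly what a no-resampling fit produces. A minimal sketch:

from typing import Any, Dict

X: Dict[str, Any] = {'split_id': 0}  # no 'val_indices' key at all

# HEAD branch: evaluates to False without raising.
if X.get('val_indices', None) is not None:
    pass

# The other branch's X['val_indices'] would raise KeyError here.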

autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ class AdversarialTrainer(BaseTrainerComponent):
     def __init__(
         self,
         epsilon: float,
-        weighted_loss: int = 1,
+        weighted_loss: int = 0,
         random_state: Optional[np.random.RandomState] = None,
         use_stochastic_weight_averaging: bool = False,
         use_snapshot_ensemble: bool = False,

autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 
 
 class StandardTrainer(BaseTrainerComponent):
-    def __init__(self, weighted_loss: bool = False,
+    def __init__(self, weighted_loss: int = 0,
                  use_stochastic_weight_averaging: bool = False,
                  use_snapshot_ensemble: bool = False,
                  se_lastk: int = 3,

autoPyTorch/pipeline/components/training/trainer/__init__.py

Lines changed: 20 additions & 2 deletions
@@ -382,7 +382,11 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
 
         val_loss, val_metrics, test_loss, test_metrics = None, {}, None, {}
         if self.eval_valid_each_epoch(X):
+<<<<<<< HEAD
             if X['val_data_loader']:
+=======
+            if 'val_data_loader' in X and X['val_data_loader']:
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
                 val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
             if 'test_data_loader' in X and X['test_data_loader']:
                 test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer)
@@ -436,10 +440,17 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
 
         # wrap up -- add score if not evaluating every epoch
         if not self.eval_valid_each_epoch(X):
+<<<<<<< HEAD
             if X['val_data_loader']:
                 val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
             if 'test_data_loader' in X and X['val_data_loader']:
                 test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer)
+=======
+            if 'val_data_loader' in X and X['val_data_loader']:
+                val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
+            if 'test_data_loader' in X and X['test_data_loader']:
+                test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'])
+>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
             self.run_summary.add_performance(
                 epoch=epoch,
                 start_time=start_time,
@@ -652,8 +663,15 @@ def _get_search_space_updates(self, prefix: Optional[str] = None) -> Dict[str, H
 
         # iterate over all search space updates of this node and filter out the ones that have the given prefix
         for key in updates.keys():
-            if key.startswith(Lookahead.__name__):
-                result[key[len(Lookahead.__name__) + 1:]] = updates[key]
+            if Lookahead.__name__ in key:
+                # need to also remove the Lookahead prefix from the hyperparameter name
+                new_update = HyperparameterSearchSpace(
+                    updates[key].hyperparameter.replace('{}:'.format(Lookahead.__name__), ''),
+                    value_range=updates[key].value_range,
+                    default_value=updates[key].default_value,
+                    log=updates[key].log
+                )
+                result[key.replace('{}:'.format(Lookahead.__name__), '')] = new_update
             else:
                 result[key] = updates[key]
         return result
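Note: this hunk is the "lookahead was repeated for the hyperparameter name" fix from the commit message: the old code stripped the prefix from the dict key but left it inside the stored hyperparameter name. A toy reproduction (`la_steps` is an illustrative hyperparameter name, not taken from the diff):

prefix = 'Lookahead'
key = name = 'Lookahead:la_steps'

# old behaviour: the key loses the prefix, the stored name keeps it
old_key, old_name = key[len(prefix) + 1:], name    # 'la_steps', 'Lookahead:la_steps'

# new behaviour: both are rewritten consistently
new_key = key.replace('{}:'.format(prefix), '')    # 'la_steps'
new_name = name.replace('{}:'.format(prefix), '')  # 'la_steps'
assert new_key == new_name == 'la_steps'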

autoPyTorch/pipeline/components/training/trainer/base_trainer.py

Lines changed: 2 additions & 2 deletions
@@ -206,7 +206,7 @@ class BaseTrainerComponent(autoPyTorchTrainingComponent):
     """
     Base class for training
     Args:
-        weighted_loss (int, default=1): In case of classification, whether to weight
+        weighted_loss (int, default=0): In case of classification, whether to weight
             the loss function according to the distribution of classes in the target
         use_stochastic_weight_averaging (bool, default=True): whether to use stochastic
             weight averaging. Stochastic weight averaging is a simple average of
@@ -221,7 +221,7 @@ class BaseTrainerComponent(autoPyTorchTrainingComponent):
         random_state:
         **lookahead_config:
     """
-    def __init__(self, weighted_loss: int = 1,
+    def __init__(self, weighted_loss: int = 0,
                  use_stochastic_weight_averaging: bool = True,
                  use_snapshot_ensemble: bool = True,
                  se_lastk: int = 3,
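Note: per the docstring, `weighted_loss` weights the classification loss by the class distribution of the target, which has no analogue for regression; flipping the default to 0 matches the "fix bug in trainer weighted loss code for regression" item in the commit message. A hedged sketch of what the weighting means (illustrative helper, not the library's implementation):

import numpy as np

def class_weights(y: np.ndarray) -> np.ndarray:
    # inverse-frequency weights; a perfectly balanced target yields all 1.0
    counts = np.bincount(y)
    return counts.sum() / (len(counts) * counts)

print(class_weights(np.array([0, 0, 0, 1])))  # -> [0.667, 2.0]: rare class up-weighted
# Regression targets have no classes to count, hence the safer default of 0.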

autoPyTorch/pipeline/components/training/trainer/cutout_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
class CutOut:
1919
def __init__(self, patch_ratio: float,
2020
cutout_prob: float,
21-
weighted_loss: int = 1,
21+
weighted_loss: int = 0,
2222
random_state: Optional[np.random.RandomState] = None,
2323
use_stochastic_weight_averaging: bool = False,
2424
use_snapshot_ensemble: bool = False,

autoPyTorch/pipeline/components/training/trainer/mixup_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
class MixUp:
1919
def __init__(self, alpha: float,
20-
weighted_loss: int = 1,
20+
weighted_loss: int = 0,
2121
random_state: Optional[np.random.RandomState] = None,
2222
use_stochastic_weight_averaging: bool = False,
2323
use_snapshot_ensemble: bool = False,
