
Commit 1a1918c

Merge branch 'development' into feature-return_best
2 parents 36576f0 + 94df1e3 commit 1a1918c

32 files changed (668 additions, 298 deletions)


.github/workflows/pytest.yml

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ jobs:
       - name: Run tests
         run: |
           if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autoPyTorch --cov-report=xml'; fi
-          python -m pytest --durations=20 --timeout=600 --timeout-method=signal -v $codecov test
+          python -m pytest --forked --durations=20 --timeout=600 --timeout-method=signal -v $codecov test
       - name: Check for files left behind by test
         if: ${{ always() }}
         run: |
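
The only functional change here is the new --forked flag, provided by the pytest-forked plugin, which runs each test in its own forked subprocess so a test that crashes the interpreter no longer takes down the whole session. A minimal illustration (a hypothetical test file, not part of this commit), run with python -m pytest --forked test_isolation_sketch.py:

# test_isolation_sketch.py -- illustrative only; requires the pytest-forked plugin.
import os


def test_hard_crash() -> None:
    # Exits the current process abruptly; without --forked this would end the
    # entire pytest run, with --forked only this forked child dies.
    os._exit(1)


def test_still_runs() -> None:
    assert 1 + 1 == 2  # executed in a fresh fork, unaffected by the crash above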

autoPyTorch/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.0.3"
+__version__ = "0.1.0"

autoPyTorch/api/base_task.py

Lines changed: 17 additions & 13 deletions
@@ -10,7 +10,6 @@
 import time
 import typing
 import unittest.mock
-import uuid
 import warnings
 from abc import abstractmethod
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
@@ -569,6 +568,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
         assert self._dask_client is not None
 
         self._logger.info("Starting to create traditional classifier predictions.")
+        starttime = time.time()
 
         # Initialise run history for the traditional classifiers
         run_history = RunHistory()
@@ -643,7 +643,9 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
                     additional_info.pop('pipeline_configuration')
                     run_history.add(config=configuration, cost=cost,
                                     time=runtime, status=status, seed=self.seed,
-                                    origin=origin, additional_info=additional_info)
+                                    additional_info=additional_info,
+                                    starttime=starttime, endtime=starttime + runtime,
+                                    origin=origin)
                 else:
                     if additional_info.get('exitcode') == -6:
                         self._logger.error(
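
For context, a minimal sketch of what the updated call records, assuming the SMAC3 RunHistory API exercised above (the configuration, cost, and runtime values below are placeholders): each traditional-classifier run is now stamped with an explicit starttime/endtime pair derived from a single time.time() taken before the loop.

# Illustrative only: record one run with explicit wall-clock bounds.
import time

from ConfigSpace import ConfigurationSpace
from smac.runhistory.runhistory import RunHistory
from smac.tae import StatusType

configuration = ConfigurationSpace().sample_configuration()  # placeholder configuration

run_history = RunHistory()
starttime = time.time()  # taken once, before the traditional classifiers are evaluated
runtime = 12.3           # hypothetical fit duration in seconds

run_history.add(config=configuration, cost=0.25,
                time=runtime, status=StatusType.SUCCESS, seed=1,
                additional_info={},
                starttime=starttime, endtime=starttime + runtime)
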
@@ -780,13 +782,15 @@ def _search(
                              ":{}".format(self.task_type, dataset.task_type))
 
         # Initialise information needed for the experiment
-        experiment_task_name = 'runSearch'
+        experiment_task_name: str = 'runSearch'
         dataset_requirements = get_dataset_requirements(
             info=self._get_required_dataset_properties(dataset))
         self._dataset_requirements = dataset_requirements
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
         self._stopwatch.start_task(experiment_task_name)
         self.dataset_name = dataset.dataset_name
+        assert self.dataset_name is not None
+
         if self._logger is None:
             self._logger = self._get_logger(self.dataset_name)
         self._all_supported_metrics = all_supported_metrics
@@ -895,7 +899,7 @@ def _search(
             start_time=time.time(),
             time_left_for_ensembles=time_left_for_ensembles,
             backend=copy.deepcopy(self._backend),
-            dataset_name=dataset.dataset_name,
+            dataset_name=str(dataset.dataset_name),
             output_type=STRING_TO_OUTPUT_TYPES[dataset.output_type],
             task_type=STRING_TO_TASK_TYPES[self.task_type],
             metrics=[self._metric],
@@ -914,7 +918,7 @@ def _search(
             self._stopwatch.stop_task(ensemble_task_name)
 
         # ==> Run SMAC
-        smac_task_name = 'runSMAC'
+        smac_task_name: str = 'runSMAC'
         self._stopwatch.start_task(smac_task_name)
         elapsed_time = self._stopwatch.wall_elapsed(experiment_task_name)
         time_left_for_smac = max(0, total_walltime_limit - elapsed_time)
@@ -926,7 +930,7 @@ def _search(
 
             _proc_smac = AutoMLSMBO(
                 config_space=self.search_space,
-                dataset_name=dataset.dataset_name,
+                dataset_name=str(dataset.dataset_name),
                 backend=self._backend,
                 total_walltime_limit=total_walltime_limit,
                 func_eval_time_limit_secs=func_eval_time_limit_secs,
@@ -1036,11 +1040,11 @@ def refit(
         Returns:
             self
         """
-        if self.dataset_name is None:
-            self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        self.dataset_name = dataset.dataset_name
 
         if self._logger is None:
-            self._logger = self._get_logger(self.dataset_name)
+            self._logger = self._get_logger(str(self.dataset_name))
 
         dataset_requirements = get_dataset_requirements(
             info=self._get_required_dataset_properties(dataset))
@@ -1106,11 +1110,10 @@ def fit(self,
         Returns:
             (BasePipeline): fitted pipeline
         """
-        if self.dataset_name is None:
-            self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+        self.dataset_name = dataset.dataset_name
 
         if self._logger is None:
-            self._logger = self._get_logger(self.dataset_name)
+            self._logger = self._get_logger(str(self.dataset_name))
 
         # get dataset properties
         dataset_requirements = get_dataset_requirements(
@@ -1236,7 +1239,8 @@ def __del__(self) -> None:
             # When a multiprocessing work is done, the
             # objects are deleted. We don't want to delete run areas
             # until the estimator is deleted
-            self._backend.context.delete_directories(force=False)
+            if hasattr(self, '_backend'):
+                self._backend.context.delete_directories(force=False)
 
     def get_incumbent_results(
         self,
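
The hasattr guard matters because Python still calls __del__ on a partially constructed object: if __init__ raises before _backend is assigned, an unguarded access would raise AttributeError during teardown. A rough standalone illustration (not the project's code):

class Task:
    def __init__(self, fail: bool = False) -> None:
        if fail:
            raise ValueError("failed before _backend was assigned")
        self._backend = object()

    def __del__(self) -> None:
        # Without this guard, deleting a half-built instance raises AttributeError.
        if hasattr(self, '_backend'):
            pass  # safe place to clean up temporary run directories


try:
    Task(fail=True)  # __del__ still runs for the partially initialised object
except ValueError:
    pass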

autoPyTorch/datasets/base_dataset.py

Lines changed: 33 additions & 27 deletions
@@ -1,3 +1,5 @@
+import os
+import uuid
 from abc import ABCMeta
 from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast
 
@@ -13,18 +15,17 @@
 
 from autoPyTorch.constants import CLASSIFICATION_OUTPUTS, STRING_TO_OUTPUT_TYPES
 from autoPyTorch.datasets.resampling_strategy import (
-    CROSS_VAL_FN,
+    CrossValFunc,
+    CrossValFuncs,
     CrossValTypes,
     DEFAULT_RESAMPLING_PARAMETERS,
-    HOLDOUT_FN,
-    HoldoutValTypes,
-    get_cross_validators,
-    get_holdout_validators,
-    is_stratified,
+    HoldOutFunc,
+    HoldOutFuncs,
+    HoldoutValTypes
 )
-from autoPyTorch.utils.common import FitRequirement, hash_array_or_matrix
+from autoPyTorch.utils.common import FitRequirement
 
-BaseDatasetType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
+BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
 
 
 def check_valid_data(data: Any) -> None:
@@ -33,7 +34,8 @@ def check_valid_data(data: Any) -> None:
             'The specified Data for Dataset must have both __getitem__ and __len__ attribute.')
 
 
-def type_check(train_tensors: BaseDatasetType, val_tensors: Optional[BaseDatasetType] = None) -> None:
+def type_check(train_tensors: BaseDatasetInputType,
+               val_tensors: Optional[BaseDatasetInputType] = None) -> None:
     """To avoid unexpected behavior, we use loops over indices."""
     for i in range(len(train_tensors)):
         check_valid_data(train_tensors[i])
@@ -49,8 +51,8 @@ class TransformSubset(Subset):
     we require different transformation for each data point.
     This class helps to take the subset of the dataset
     with either training or validation transformation.
-
-    We achieve so by adding a train flag to the pytorch subset
+    The TransformSubset allows to add train flags
+    while indexing the main dataset towards this goal.
 
     Attributes:
         dataset (BaseDataset/Dataset): Dataset to sample the subset
@@ -71,10 +73,10 @@ def __getitem__(self, idx: int) -> np.ndarray:
 class BaseDataset(Dataset, metaclass=ABCMeta):
     def __init__(
         self,
-        train_tensors: BaseDatasetType,
+        train_tensors: BaseDatasetInputType,
         dataset_name: Optional[str] = None,
-        val_tensors: Optional[BaseDatasetType] = None,
-        test_tensors: Optional[BaseDatasetType] = None,
+        val_tensors: Optional[BaseDatasetInputType] = None,
+        test_tensors: Optional[BaseDatasetInputType] = None,
         resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         shuffle: Optional[bool] = True,
@@ -106,15 +108,17 @@ def __init__(
             val_transforms (Optional[torchvision.transforms.Compose]):
                 Additional Transforms to be applied to the validation/test data
         """
-        self.dataset_name = dataset_name if dataset_name is not None \
-            else hash_array_or_matrix(train_tensors[0])
+        self.dataset_name = dataset_name
+
+        if self.dataset_name is None:
+            self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
 
         if not hasattr(train_tensors[0], 'shape'):
             type_check(train_tensors, val_tensors)
         self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors
-        self.cross_validators: Dict[str, CROSS_VAL_FN] = {}
-        self.holdout_validators: Dict[str, HOLDOUT_FN] = {}
-        self.rng = np.random.RandomState(seed=seed)
+        self.cross_validators: Dict[str, CrossValFunc] = {}
+        self.holdout_validators: Dict[str, HoldOutFunc] = {}
+        self.random_state = np.random.RandomState(seed=seed)
         self.shuffle = shuffle
         self.resampling_strategy = resampling_strategy
         self.resampling_strategy_args = resampling_strategy_args
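
Dataset naming now falls back to a UUID instead of hashing the training data (hash_array_or_matrix was dropped from the imports above). A small illustration of the fallback; the printed value is of course machine- and process-dependent:

import os
import uuid

dataset_name = None  # i.e. the caller did not pass a dataset_name
if dataset_name is None:
    # uuid1 combines host and timestamp information with the process id as the
    # clock sequence, so concurrent runs on one machine get distinct names.
    dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))

print(dataset_name)
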
@@ -134,8 +138,8 @@ def __init__(
             self.is_small_preprocess = True
 
         # Make sure cross validation splits are created once
-        self.cross_validators = get_cross_validators(*CrossValTypes)
-        self.holdout_validators = get_holdout_validators(*HoldoutValTypes)
+        self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes)
+        self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes)
         self.splits = self.get_splits_from_resampling_strategy()
 
         # We also need to be able to transform the data, be it for pre-processing
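
The former free functions get_cross_validators/get_holdout_validators are now grouped on CrossValFuncs/HoldOutFuncs, and (as the split-creation hunks below show) every validator receives the dataset's random_state as its first argument. A rough sketch of the pattern; the method body and the HoldOutFunc alias here are simplified assumptions, not the project's actual implementation:

from typing import Any, Callable, Dict, Tuple

import numpy as np
from sklearn.model_selection import train_test_split

# Assumed call shape: (random_state, val_share, indices, **kwargs) -> (train_idx, val_idx)
HoldOutFunc = Callable[..., Tuple[np.ndarray, np.ndarray]]


class HoldOutFuncs:
    @staticmethod
    def holdout_validation(random_state: np.random.RandomState, val_share: float,
                           indices: np.ndarray, **kwargs: Any) -> Tuple[np.ndarray, np.ndarray]:
        train, val = train_test_split(indices, test_size=val_share,
                                      shuffle=True, random_state=random_state)
        return train, val

    @classmethod
    def get_holdout_validators(cls, *holdout_val_types) -> Dict[str, HoldOutFunc]:
        # Map each requested enum member to the staticmethod of the same name, which
        # is how self.holdout_validators[holdout_val_type.name](...) resolves later.
        return {val_type.name: getattr(cls, val_type.name) for val_type in holdout_val_types}

CrossValFuncs.get_cross_validators presumably follows the same shape for the k-fold variants.
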
@@ -201,7 +205,7 @@ def __len__(self) -> int:
         return self.train_tensors[0].shape[0]
 
     def _get_indices(self) -> np.ndarray:
-        return self.rng.permutation(len(self)) if self.shuffle else np.arange(len(self))
+        return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self))
 
     def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]:
         """
@@ -263,11 +267,11 @@ def create_cross_val_splits(
         if not isinstance(cross_val_type, CrossValTypes):
             raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.')
         kwargs = {}
-        if is_stratified(cross_val_type):
+        if cross_val_type.is_stratified():
             # we need additional information about the data for stratification
             kwargs["stratify"] = self.train_tensors[-1]
         splits = self.cross_validators[cross_val_type.name](
-            num_splits, self._get_indices(), **kwargs)
+            self.random_state, num_splits, self._get_indices(), **kwargs)
         return splits
 
     def create_holdout_val_split(
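
Stratification is likewise queried on the enum itself (cross_val_type.is_stratified()) rather than through the removed free function. A small sketch of what such an enum method can look like; the member values and the name-prefix convention are assumptions for illustration:

from enum import Enum


class CrossValTypes(Enum):
    k_fold_cross_validation = 1
    stratified_k_fold_cross_validation = 2

    def is_stratified(self) -> bool:
        # Assumed convention: stratified variants share a common name prefix.
        return self.name.startswith('stratified')


assert CrossValTypes.stratified_k_fold_cross_validation.is_stratified()
assert not CrossValTypes.k_fold_cross_validation.is_stratified()
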
@@ -298,10 +302,11 @@ def create_holdout_val_split(
         if not isinstance(holdout_val_type, HoldoutValTypes):
             raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.')
         kwargs = {}
-        if is_stratified(holdout_val_type):
+        if holdout_val_type.is_stratified():
             # we need additional information about the data for stratification
             kwargs["stratify"] = self.train_tensors[-1]
-        train, val = self.holdout_validators[holdout_val_type.name](val_share, self._get_indices(), **kwargs)
+        train, val = self.holdout_validators[holdout_val_type.name](
+            self.random_state, val_share, self._get_indices(), **kwargs)
         return train, val
 
     def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]:
@@ -321,7 +326,8 @@ def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]:
         return (TransformSubset(self, self.splits[split_id][0], train=True),
                 TransformSubset(self, self.splits[split_id][1], train=False))
 
-    def replace_data(self, X_train: BaseDatasetType, X_test: Optional[BaseDatasetType]) -> 'BaseDataset':
+    def replace_data(self, X_train: BaseDatasetInputType,
+                     X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset':
         """
         To speed up the training of small dataset, early pre-processing of the data
         can be made on the fly by the pipeline.
