
Commit 1a1918c

Merge branch 'development' into feature-return_best
2 parents 36576f0 + 94df1e3 commit 1a1918c

32 files changed (668 additions, 298 deletions)


.github/workflows/pytest.yml

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ jobs:
       - name: Run tests
         run: |
           if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autoPyTorch --cov-report=xml'; fi
-          python -m pytest --durations=20 --timeout=600 --timeout-method=signal -v $codecov test
+          python -m pytest --forked --durations=20 --timeout=600 --timeout-method=signal -v $codecov test
       - name: Check for files left behind by test
         if: ${{ always() }}
         run: |
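
The only functional change here is the new --forked flag, provided by the pytest-forked plugin, which runs each test in its own forked subprocess so a test that crashes the interpreter no longer takes down the whole session. A minimal illustration (a hypothetical test file, not part of this commit), run with python -m pytest --forked test_isolation_sketch.py:

# test_isolation_sketch.py -- illustrative only; requires the pytest-forked plugin.
import os


def test_hard_crash() -> None:
    # Exits the current process abruptly; without --forked this would end the
    # entire pytest run, with --forked only this forked child dies.
    os._exit(1)


def test_still_runs() -> None:
    assert 1 + 1 == 2  # executed in a fresh fork, unaffected by the crash above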

autoPyTorch/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.0.3"
+__version__ = "0.1.0"

autoPyTorch/api/base_task.py

Lines changed: 17 additions & 13 deletions
@@ -10,7 +10,6 @@
 import time
 import typing
 import unittest.mock
-import uuid
 import warnings
 from abc import abstractmethod
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
@@ -569,6 +568,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
         assert self._dask_client is not None
 
         self._logger.info("Starting to create traditional classifier predictions.")
+        starttime = time.time()
 
         # Initialise run history for the traditional classifiers
         run_history = RunHistory()
@@ -643,7 +643,9 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
                     additional_info.pop('pipeline_configuration')
                     run_history.add(config=configuration, cost=cost,
                                     time=runtime, status=status, seed=self.seed,
-                                    origin=origin, additional_info=additional_info)
+                                    additional_info=additional_info,
+                                    starttime=starttime, endtime=starttime + runtime,
+                                    origin=origin)
                 else:
                     if additional_info.get('exitcode') == -6:
                         self._logger.error(
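
For context, a minimal sketch of what the updated call records, assuming the SMAC3 RunHistory API exercised above (the configuration, cost, and runtime values below are placeholders): each traditional-classifier run is now stamped with an explicit starttime/endtime pair derived from a single time.time() taken before the loop.

# Illustrative only: record one run with explicit wall-clock bounds.
import time

from ConfigSpace import ConfigurationSpace
from smac.runhistory.runhistory import RunHistory
from smac.tae import StatusType

configuration = ConfigurationSpace().sample_configuration()  # placeholder configuration

run_history = RunHistory()
starttime = time.time()  # taken once, before the traditional classifiers are evaluated
runtime = 12.3           # hypothetical fit duration in seconds

run_history.add(config=configuration, cost=0.25,
                time=runtime, status=StatusType.SUCCESS, seed=1,
                additional_info={},
                starttime=starttime, endtime=starttime + runtime)
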
@@ -780,13 +782,15 @@ def _search(
                              ":{}".format(self.task_type, dataset.task_type))
 
         # Initialise information needed for the experiment
-        experiment_task_name = 'runSearch'
+        experiment_task_name: str = 'runSearch'
         dataset_requirements = get_dataset_requirements(
             info=self._get_required_dataset_properties(dataset))
         self._dataset_requirements = dataset_requirements
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
         self._stopwatch.start_task(experiment_task_name)
         self.dataset_name = dataset.dataset_name
+        assert self.dataset_name is not None
+
         if self._logger is None:
             self._logger = self._get_logger(self.dataset_name)
         self._all_supported_metrics = all_supported_metrics
@@ -895,7 +899,7 @@ def _search(
             start_time=time.time(),
             time_left_for_ensembles=time_left_for_ensembles,
             backend=copy.deepcopy(self._backend),
-            dataset_name=dataset.dataset_name,
+            dataset_name=str(dataset.dataset_name),
             output_type=STRING_TO_OUTPUT_TYPES[dataset.output_type],
             task_type=STRING_TO_TASK_TYPES[self.task_type],
             metrics=[self._metric],
@@ -914,7 +918,7 @@ def _search(
             self._stopwatch.stop_task(ensemble_task_name)
 
         # ==> Run SMAC
-        smac_task_name = 'runSMAC'
+        smac_task_name: str = 'runSMAC'
         self._stopwatch.start_task(smac_task_name)
         elapsed_time = self._stopwatch.wall_elapsed(experiment_task_name)
         time_left_for_smac = max(0, total_walltime_limit - elapsed_time)
@@ -926,7 +930,7 @@ def _search(
 
             _proc_smac = AutoMLSMBO(
                 config_space=self.search_space,
-                dataset_name=dataset.dataset_name,
+                dataset_name=str(dataset.dataset_name),
                 backend=self._backend,
                 total_walltime_limit=total_walltime_limit,
                 func_eval_time_limit_secs=func_eval_time_limit_secs,
@@ -1036,11 +1040,11 @@ def refit(
         Returns:
             self
         """
-        if self.dataset_name is None:
-            self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        self.dataset_name = dataset.dataset_name
 
         if self._logger is None:
-            self._logger = self._get_logger(self.dataset_name)
+            self._logger = self._get_logger(str(self.dataset_name))
 
         dataset_requirements = get_dataset_requirements(
             info=self._get_required_dataset_properties(dataset))
@@ -1106,11 +1110,10 @@ def fit(self,
         Returns:
             (BasePipeline): fitted pipeline
         """
-        if self.dataset_name is None:
-            self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+        self.dataset_name = dataset.dataset_name
 
         if self._logger is None:
-            self._logger = self._get_logger(self.dataset_name)
+            self._logger = self._get_logger(str(self.dataset_name))
 
         # get dataset properties
         dataset_requirements = get_dataset_requirements(
@@ -1236,7 +1239,8 @@ def __del__(self) -> None:
             # When a multiprocessing work is done, the
             # objects are deleted. We don't want to delete run areas
             # until the estimator is deleted
-            self._backend.context.delete_directories(force=False)
+            if hasattr(self, '_backend'):
+                self._backend.context.delete_directories(force=False)
 
     def get_incumbent_results(
         self,
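
The hasattr guard matters because Python still calls __del__ on a partially constructed object: if __init__ raises before _backend is assigned, an unguarded access would raise AttributeError during teardown. A rough standalone illustration (not the project's code):

class Task:
    def __init__(self, fail: bool = False) -> None:
        if fail:
            raise ValueError("failed before _backend was assigned")
        self._backend = object()

    def __del__(self) -> None:
        # Without this guard, deleting a half-built instance raises AttributeError.
        if hasattr(self, '_backend'):
            pass  # safe place to clean up temporary run directories


try:
    Task(fail=True)  # __del__ still runs for the partially initialised object
except ValueError:
    pass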

autoPyTorch/datasets/base_dataset.py

Lines changed: 33 additions & 27 deletions
@@ -1,3 +1,5 @@
+import os
+import uuid
 from abc import ABCMeta
 from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast
 
@@ -13,18 +15,17 @@
 
 from autoPyTorch.constants import CLASSIFICATION_OUTPUTS, STRING_TO_OUTPUT_TYPES
 from autoPyTorch.datasets.resampling_strategy import (
-    CROSS_VAL_FN,
+    CrossValFunc,
+    CrossValFuncs,
     CrossValTypes,
     DEFAULT_RESAMPLING_PARAMETERS,
-    HOLDOUT_FN,
-    HoldoutValTypes,
-    get_cross_validators,
-    get_holdout_validators,
-    is_stratified,
+    HoldOutFunc,
+    HoldOutFuncs,
+    HoldoutValTypes
 )
-from autoPyTorch.utils.common import FitRequirement, hash_array_or_matrix
+from autoPyTorch.utils.common import FitRequirement
 
-BaseDatasetType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
+BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
 
 
 def check_valid_data(data: Any) -> None:
@@ -33,7 +34,8 @@ def check_valid_data(data: Any) -> None:
             'The specified Data for Dataset must have both __getitem__ and __len__ attribute.')
 
 
-def type_check(train_tensors: BaseDatasetType, val_tensors: Optional[BaseDatasetType] = None) -> None:
+def type_check(train_tensors: BaseDatasetInputType,
+               val_tensors: Optional[BaseDatasetInputType] = None) -> None:
     """To avoid unexpected behavior, we use loops over indices."""
     for i in range(len(train_tensors)):
         check_valid_data(train_tensors[i])
@@ -49,8 +51,8 @@ class TransformSubset(Subset):
     we require different transformation for each data point.
     This class helps to take the subset of the dataset
     with either training or validation transformation.
-
-    We achieve so by adding a train flag to the pytorch subset
+    The TransformSubset allows to add train flags
+    while indexing the main dataset towards this goal.
 
     Attributes:
         dataset (BaseDataset/Dataset): Dataset to sample the subset
@@ -71,10 +73,10 @@ def __getitem__(self, idx: int) -> np.ndarray:
 class BaseDataset(Dataset, metaclass=ABCMeta):
     def __init__(
         self,
-        train_tensors: BaseDatasetType,
+        train_tensors: BaseDatasetInputType,
         dataset_name: Optional[str] = None,
-        val_tensors: Optional[BaseDatasetType] = None,
-        test_tensors: Optional[BaseDatasetType] = None,
+        val_tensors: Optional[BaseDatasetInputType] = None,
+        test_tensors: Optional[BaseDatasetInputType] = None,
         resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         shuffle: Optional[bool] = True,
@@ -106,15 +108,17 @@ def __init__(
             val_transforms (Optional[torchvision.transforms.Compose]):
                 Additional Transforms to be applied to the validation/test data
         """
-        self.dataset_name = dataset_name if dataset_name is not None \
-            else hash_array_or_matrix(train_tensors[0])
+        self.dataset_name = dataset_name
+
+        if self.dataset_name is None:
+            self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
 
         if not hasattr(train_tensors[0], 'shape'):
             type_check(train_tensors, val_tensors)
         self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors
-        self.cross_validators: Dict[str, CROSS_VAL_FN] = {}
-        self.holdout_validators: Dict[str, HOLDOUT_FN] = {}
-        self.rng = np.random.RandomState(seed=seed)
+        self.cross_validators: Dict[str, CrossValFunc] = {}
+        self.holdout_validators: Dict[str, HoldOutFunc] = {}
+        self.random_state = np.random.RandomState(seed=seed)
         self.shuffle = shuffle
         self.resampling_strategy = resampling_strategy
         self.resampling_strategy_args = resampling_strategy_args
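
Dataset naming now falls back to a UUID instead of hashing the training data (hash_array_or_matrix was dropped from the imports above). A small illustration of the fallback; the printed value is of course machine- and process-dependent:

import os
import uuid

dataset_name = None  # i.e. the caller did not pass a dataset_name
if dataset_name is None:
    # uuid1 combines host and timestamp information with the process id as the
    # clock sequence, so concurrent runs on one machine get distinct names.
    dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))

print(dataset_name)
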
@@ -134,8 +138,8 @@ def __init__(
             self.is_small_preprocess = True
 
         # Make sure cross validation splits are created once
-        self.cross_validators = get_cross_validators(*CrossValTypes)
-        self.holdout_validators = get_holdout_validators(*HoldoutValTypes)
+        self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes)
+        self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes)
         self.splits = self.get_splits_from_resampling_strategy()
 
         # We also need to be able to transform the data, be it for pre-processing
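
The former free functions get_cross_validators/get_holdout_validators are now grouped on CrossValFuncs/HoldOutFuncs, and (as the split-creation hunks below show) every validator receives the dataset's random_state as its first argument. A rough sketch of the pattern; the method body and the HoldOutFunc alias here are simplified assumptions, not the project's actual implementation:

from typing import Any, Callable, Dict, Tuple

import numpy as np
from sklearn.model_selection import train_test_split

# Assumed call shape: (random_state, val_share, indices, **kwargs) -> (train_idx, val_idx)
HoldOutFunc = Callable[..., Tuple[np.ndarray, np.ndarray]]


class HoldOutFuncs:
    @staticmethod
    def holdout_validation(random_state: np.random.RandomState, val_share: float,
                           indices: np.ndarray, **kwargs: Any) -> Tuple[np.ndarray, np.ndarray]:
        train, val = train_test_split(indices, test_size=val_share,
                                      shuffle=True, random_state=random_state)
        return train, val

    @classmethod
    def get_holdout_validators(cls, *holdout_val_types) -> Dict[str, HoldOutFunc]:
        # Map each requested enum member to the staticmethod of the same name, which
        # is how self.holdout_validators[holdout_val_type.name](...) resolves later.
        return {val_type.name: getattr(cls, val_type.name) for val_type in holdout_val_types}

CrossValFuncs.get_cross_validators presumably follows the same shape for the k-fold variants.
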
@@ -201,7 +205,7 @@ def __len__(self) -> int:
         return self.train_tensors[0].shape[0]
 
     def _get_indices(self) -> np.ndarray:
-        return self.rng.permutation(len(self)) if self.shuffle else np.arange(len(self))
+        return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self))
 
     def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]:
         """
@@ -263,11 +267,11 @@ def create_cross_val_splits(
         if not isinstance(cross_val_type, CrossValTypes):
             raise NotImplementedError(f'The selected `cross_val_type` "{cross_val_type}" is not implemented.')
         kwargs = {}
-        if is_stratified(cross_val_type):
+        if cross_val_type.is_stratified():
             # we need additional information about the data for stratification
             kwargs["stratify"] = self.train_tensors[-1]
         splits = self.cross_validators[cross_val_type.name](
-            num_splits, self._get_indices(), **kwargs)
+            self.random_state, num_splits, self._get_indices(), **kwargs)
         return splits
 
     def create_holdout_val_split(
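
Stratification is likewise queried on the enum itself (cross_val_type.is_stratified()) rather than through the removed free function. A small sketch of what such an enum method can look like; the member values and the name-prefix convention are assumptions for illustration:

from enum import Enum


class CrossValTypes(Enum):
    k_fold_cross_validation = 1
    stratified_k_fold_cross_validation = 2

    def is_stratified(self) -> bool:
        # Assumed convention: stratified variants share a common name prefix.
        return self.name.startswith('stratified')


assert CrossValTypes.stratified_k_fold_cross_validation.is_stratified()
assert not CrossValTypes.k_fold_cross_validation.is_stratified()
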
@@ -298,10 +302,11 @@ def create_holdout_val_split(
         if not isinstance(holdout_val_type, HoldoutValTypes):
             raise NotImplementedError(f'The specified `holdout_val_type` "{holdout_val_type}" is not supported.')
         kwargs = {}
-        if is_stratified(holdout_val_type):
+        if holdout_val_type.is_stratified():
             # we need additional information about the data for stratification
             kwargs["stratify"] = self.train_tensors[-1]
-        train, val = self.holdout_validators[holdout_val_type.name](val_share, self._get_indices(), **kwargs)
+        train, val = self.holdout_validators[holdout_val_type.name](
+            self.random_state, val_share, self._get_indices(), **kwargs)
         return train, val
 
     def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]:
@@ -321,7 +326,8 @@ def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]:
         return (TransformSubset(self, self.splits[split_id][0], train=True),
                 TransformSubset(self, self.splits[split_id][1], train=False))
 
-    def replace_data(self, X_train: BaseDatasetType, X_test: Optional[BaseDatasetType]) -> 'BaseDataset':
+    def replace_data(self, X_train: BaseDatasetInputType,
+                     X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset':
         """
         To speed up the training of small dataset, early pre-processing of the data
         can be made on the fly by the pipeline.
