automl
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 10 additions & 6 deletions
diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
Lines changed: 11 additions & 8 deletions
diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py
Lines changed: 0 additions & 1 deletion
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
Lines changed: 0 additions & 1 deletion
diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py
Lines changed: 3 additions & 2 deletions
diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
Lines changed: 9 additions & 7 deletions
diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py
Lines changed: 3 additions & 2 deletions
diff --git a/autoPyTorch/ensemble/ensemble_builder.py b/autoPyTorch/ensemble/ensemble_builder.py
Lines changed: 12 additions & 20 deletions
diff --git a/autoPyTorch/ensemble/ensemble_selection.py b/autoPyTorch/ensemble/ensemble_selection.py
Lines changed: 3 additions & 3 deletions
diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
Lines changed: 4 additions & 8 deletions
diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py
Lines changed: 6 additions & 3 deletions
diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py
Lines changed: 1 addition & 2 deletions
@@ -3,21 +3,25 @@ repos:
     rev: v0.761
     hooks:
       - id: mypy
-        args: [--show-error-codes]
-        name: mypy AutoPyTorch
+        args: [--show-error-codes,
+               --warn-redundant-casts,
+               --warn-return-any,
+               --warn-unreachable,
+        ]
         files: autoPyTorch/.*
+        exclude: autoPyTorch/ensemble/
   - repo: https://gitlab.com/pycqa/flake8
     rev: 3.8.3
     hooks:
       - id: flake8
-        name: flake8 AutoPyTorch
-        files: autoPyTorch/.*
         additional_dependencies:
           - flake8-print==3.1.4
           - flake8-import-order
+        name: flake8 autoPyTorch
+        files: autoPyTorch/.*
       - id: flake8
-        name: flake8 tests
-        files: test/.*
         additional_dependencies:
           - flake8-print==3.1.4
           - flake8-import-order
+        name: flake8 test
+        files: test/.*
@@ -12,11 +12,12 @@
 import unittest.mock
 import warnings
 from abc import abstractmethod
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
 
 import dask
+import dask.distributed
 
 import joblib
 
@@ -38,7 +39,6 @@
 from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes
 from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
-from autoPyTorch.ensemble.ensemble_selection import EnsembleSelection
 from autoPyTorch.ensemble.singlebest_ensemble import SingleBest
 from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings
 from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash
@@ -198,7 +198,7 @@ def __init__(
         # examples. Nevertheless, multi-process runs
         # have spawn as requirement to reduce the
         # possibility of a deadlock
-        self._dask_client = None
+        self._dask_client: Optional[dask.distributed.Client] = None
         self._multiprocessing_context = 'forkserver'
         if self.n_jobs == 1:
             self._multiprocessing_context = 'fork'
@@ -711,7 +711,8 @@ def _search(
         precision: int = 32,
         disable_file_output: List = [],
         load_models: bool = True,
-        portfolio_selection: Optional[str] = None
+        portfolio_selection: Optional[str] = None,
+        dask_client: Optional[dask.distributed.Client] = None
     ) -> 'BaseTask':
         """
         Search for the best pipeline configuration for the given dataset.
@@ -857,10 +858,11 @@ def _search(
         # If no dask client was provided, we create one, so that we can
         # start an ensemble process in parallel to the SMBO optimization
         if (
-            self._dask_client is None and (self.ensemble_size > 0 or self.n_jobs is not None and self.n_jobs > 1)
+            dask_client is None and (self.ensemble_size > 0 or self.n_jobs > 1)
         ):
             self._create_dask_client()
         else:
+            self._dask_client = dask_client
             self._is_dask_client_internally_created = False
 
         # Handle time resource allocation
@@ -1206,7 +1208,6 @@ def predict(
 
         # Mypy assert
         assert self.ensemble_ is not None, "Load models should error out if no ensemble"
-        self.ensemble_ = cast(Union[SingleBest, EnsembleSelection], self.ensemble_)
 
         if isinstance(self.resampling_strategy, HoldoutValTypes):
             models = self.models_
@@ -1315,15 +1316,17 @@ def get_models_with_weights(self) -> List:
             self._load_models()
 
         assert self.ensemble_ is not None
-        return self.ensemble_.get_models_with_weights(self.models_)
+        models_with_weights: List[Tuple[float, BasePipeline]] = self.ensemble_.get_models_with_weights(self.models_)
+        return models_with_weights
 
     def show_models(self) -> str:
         df = []
         for weight, model in self.get_models_with_weights():
             representation = model.get_pipeline_representation()
             representation.update({'Weight': weight})
             df.append(representation)
-        return pd.DataFrame(df).to_markdown()
+        models_markdown: str = pd.DataFrame(df).to_markdown()
+        return models_markdown
 
     def _print_debug_info_to_log(self) -> None:
         """
 
@@ -95,7 +95,6 @@ def fit(
                                      np.shape(y_test)
                                  ))
             if isinstance(y_train, pd.DataFrame):
-                y_train = typing.cast(pd.DataFrame, y_train)
                 y_test = typing.cast(pd.DataFrame, y_test)
                 if y_train.columns.tolist() != y_test.columns.tolist():
                     raise ValueError(
 
@@ -145,7 +145,6 @@ def transform(
             X = self.numpy_array_to_pandas(X)
 
         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
-            X = typing.cast(pd.DataFrame, X)
             if np.any(pd.isnull(X)):
                 for column in X.columns:
                     if X[column].isna().all():
 
@@ -194,8 +194,9 @@ def _check_data(
                 A set of features whose dimensionality and data type is going to be checked
         """
 
-        if not isinstance(
-                y, (np.ndarray, pd.DataFrame, list, pd.Series)) and not scipy.sparse.issparse(y):
+        if not isinstance(y, (np.ndarray, pd.DataFrame,
+                              typing.List, pd.Series)) \
+                and not scipy.sparse.issparse(y):  # type: ignore[misc]
             raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
                              " pd.Series, sparse data and Python Lists as targets, yet, "
                              "the provided input is of type {}".format(
 
@@ -26,6 +26,7 @@
 from autoPyTorch.utils.common import FitRequirement
 
 BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
+BaseDatasetPropertiesType = Union[int, float, str, List, bool]
 
 
 def check_valid_data(data: Any) -> None:
@@ -125,7 +126,6 @@ def __init__(
         self.task_type: Optional[str] = None
         self.issparse: bool = issparse(self.train_tensors[0])
         self.input_shape: Tuple[int] = self.train_tensors[0].shape[1:]
-
         if len(self.train_tensors) == 2 and self.train_tensors[1] is not None:
             self.output_type: str = type_of_target(self.train_tensors[1])
 
@@ -205,7 +205,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]:
         return X, Y
 
     def __len__(self) -> int:
-        return self.train_tensors[0].shape[0]
+        return int(self.train_tensors[0].shape[0])
 
     def _get_indices(self) -> np.ndarray:
         return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self))
@@ -349,7 +349,9 @@ def replace_data(self, X_train: BaseDatasetInputType,
             self.test_tensors = (X_test, self.test_tensors[1])
         return self
 
-    def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]:
+    def get_dataset_properties(
+        self, dataset_requirements: List[FitRequirement]
+    ) -> Dict[str, BaseDatasetPropertiesType]:
         """
         Gets the dataset properties required in the fit dictionary.
         This depends on the components that are active in the
@@ -364,7 +366,7 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) ->
                 <https://github.com/automl/Auto-PyTorch/blob/refactor_development/autoPyTorch/utils/pipeline.py#L25>`
 
         Returns:
-            dataset_properties (Dict[str, Any]):
+            dataset_properties (Dict[str, BaseDatasetPropertiesType]):
                 Dict of the dataset properties.
         """
         dataset_properties = dict()
@@ -376,11 +378,11 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) ->
         dataset_properties.update(self.get_required_dataset_info())
         return dataset_properties
 
-    def get_required_dataset_info(self) -> Dict[str, Any]:
+    def get_required_dataset_info(self) -> Dict[str, BaseDatasetPropertiesType]:
         """
         Returns a dictionary containing required dataset
         properties to instantiate a pipeline.
         """
-        info = {'output_type': self.output_type,
-                'issparse': self.issparse}
+        info: Dict[str, BaseDatasetPropertiesType] = {'output_type': self.output_type,
+                                                      'issparse': self.issparse}
         return info
@@ -17,7 +17,7 @@
     TASK_TYPES_TO_STRING,
 )
 from autoPyTorch.data.base_validator import BaseInputValidator
-from autoPyTorch.datasets.base_dataset import BaseDataset
+from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
     CrossValTypes,
     HoldoutValTypes,
@@ -98,7 +98,7 @@ def __init__(self,
         if STRING_TO_TASK_TYPES[self.task_type] in CLASSIFICATION_TASKS:
             self.num_classes: int = len(np.unique(self.train_tensors[1]))
 
-    def get_required_dataset_info(self) -> Dict[str, Any]:
+    def get_required_dataset_info(self) -> Dict[str, BaseDatasetPropertiesType]:
         """
         Returns a dictionary containing required dataset
         properties to instantiate a pipeline.
@@ -120,6 +120,7 @@ def get_required_dataset_info(self) -> Dict[str, Any]:
                 <https://github.com/automl/Auto-PyTorch/blob/refactor_development/autoPyTorch/constants.py>`
         """
         info = super().get_required_dataset_info()
+        assert self.task_type is not None, "Expected value for task type but got None"
         info.update({
             'numerical_columns': self.numerical_columns,
             'categorical_columns': self.categorical_columns,
 
@@ -91,21 +91,21 @@ def __init__(
                 Both wrt to validation predictions
                 If performance_range_threshold > 0, might return less models
             max_models_on_disc: Union[float, int]
-            Defines the maximum number of models that are kept in the disc.
-            If int, it must be greater or equal than 1, and dictates the max number of
-            models to keep.
-            If float, it will be interpreted as the max megabytes allowed of disc space. That
-            is, if the number of ensemble candidates require more disc space than this float
-            value, the worst models will be deleted to keep within this budget.
-            Models and predictions of the worst-performing models will be deleted then.
-            If None, the feature is disabled.
-            It defines an upper bound on the models that can be used in the ensemble.
+                Defines the maximum number of models that are kept in the disc.
+                If int, it must be greater or equal than 1, and dictates the max number of
+                models to keep.
+                If float, it will be interpreted as the max megabytes allowed of disc space. That
+                is, if the number of ensemble candidates require more disc space than this float
+                value, the worst models will be deleted to keep within this budget.
+                Models and predictions of the worst-performing models will be deleted then.
+                If None, the feature is disabled.
+                It defines an upper bound on the models that can be used in the ensemble.
             seed: int
                 random seed
             max_iterations: int
                 maximal number of iterations to run this script
                 (default None --> deactivated)
-            precision: [16,32,64,128]
+            precision (int): [16,32,64,128]
                 precision of floats to read the predictions
             memory_limit: Optional[int]
                 memory limit in mb. If ``None``, no memory limit is enforced.
@@ -324,7 +324,7 @@ def fit_and_return_ensemble(
            It defines an upper bound on the models that can be used in the ensemble.
         seed: int
             random seed
-        precision: [16,32,64,128]
+        precision (int): [16,32,64,128]
             precision of floats to read the predictions
         memory_limit: Optional[int]
             memory limit in mb. If ``None``, no memory limit is enforced.
@@ -1506,15 +1506,7 @@ def _delete_excess_models(self, selected_keys: List[str]) -> None:
                 )
 
     def _read_np_fn(self, path: str) -> np.ndarray:
-
-        # Support for string precision
-        if isinstance(self.precision, str):
-            precision = int(self.precision)
-            self.logger.warning("Interpreted str-precision as {}".format(
-                precision
-            ))
-        else:
-            precision = self.precision
+        precision = self.precision
 
         if path.endswith("gz"):
             fp = gzip.open(path, 'rb')
 
@@ -149,9 +149,9 @@ def _fit(
             if len(predictions) == 1:
                 break
 
-        self.indices_ = order
-        self.trajectory_ = trajectory
-        self.train_loss_ = trajectory[-1]
+        self.indices_: List[int] = order
+        self.trajectory_: List[float] = trajectory
+        self.train_loss_: float = trajectory[-1]
 
     def _calculate_weights(self) -> None:
         """
 
@@ -31,7 +31,7 @@
     STRING_TO_TASK_TYPES,
     TABULAR_TASKS,
 )
-from autoPyTorch.datasets.base_dataset import BaseDataset
+from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType
 from autoPyTorch.evaluation.utils import (
     VotingRegressorWrapper,
     convert_multioutput_multiclass_to_multilabel
@@ -63,7 +63,7 @@ class MyTraditionalTabularClassificationPipeline(BaseEstimator):
     learning model, and is the final object that is stored for inference.
 
     Attributes:
-        dataset_properties (Dict[str, Any]):
+        dataset_properties (Dict[str, BaseDatasetPropertiesType]):
             A dictionary containing dataset specific information
         random_state (Optional[np.random.RandomState]):
             Object that contains a seed and allows for reproducible results
@@ -73,8 +73,8 @@ class MyTraditionalTabularClassificationPipeline(BaseEstimator):
     """
 
     def __init__(self, config: str,
-                 dataset_properties: Dict[str, Any],
-                 random_state: Optional[np.random.RandomState] = None,
+                 dataset_properties: Dict[str, BaseDatasetPropertiesType],
+                 random_state: Optional[Union[int, np.random.RandomState]] = None,
                  init_params: Optional[Dict] = None):
         self.config = config
         self.dataset_properties = dataset_properties
@@ -197,8 +197,6 @@ class DummyClassificationPipeline(DummyClassifier):
     worst performing model. In case of failure, at least this model will be fitted.
 
     Attributes:
-        dataset_properties (Dict[str, Any]):
-            A dictionary containing dataset specific information
         random_state (Optional[Union[int, np.random.RandomState]]):
             Object that contains a seed and allows for reproducible results
         init_params  (Optional[Dict]):
@@ -262,8 +260,6 @@ class DummyRegressionPipeline(DummyRegressor):
     worst performing model. In case of failure, at least this model will be fitted.
 
     Attributes:
-        dataset_properties (Dict[str, Any]):
-            A dictionary containing dataset specific information
         random_state (Optional[Union[int, np.random.RandomState]]):
             Object that contains a seed and allows for reproducible results
         init_params  (Optional[Dict]):
 
@@ -36,7 +36,7 @@ def fit_predict_try_except_decorator(
         ta: typing.Callable,
         queue: multiprocessing.Queue, cost_for_crash: float, **kwargs: typing.Any) -> None:
     try:
-        return ta(queue=queue, **kwargs)
+        ta(queue=queue, **kwargs)
     except Exception as e:
         if isinstance(e, (MemoryError, pynisher.TimeoutException)):
             # Re-raise the memory error to let the pynisher handle that correctly
@@ -147,13 +147,15 @@ def __init__(
         self.exclude = exclude
         self.disable_file_output = disable_file_output
         self.init_params = init_params
+
+        self.budget_type = pipeline_config['budget_type'] if pipeline_config is not None else budget_type
+
         self.pipeline_config: typing.Dict[str, typing.Union[int, str, float]] = dict()
         if pipeline_config is None:
             pipeline_config = replace_string_bool_to_bool(json.load(open(
                 os.path.join(os.path.dirname(__file__), '../configs/default_pipeline_options.json'))))
         self.pipeline_config.update(pipeline_config)
 
-        self.budget_type = pipeline_config['budget_type'] if pipeline_config is not None else budget_type
         self.logger_port = logger_port
         if self.logger_port is None:
             self.logger: typing.Union[logging.Logger, PicklableClientLogger] = logging.getLogger("TAE")
@@ -237,7 +239,8 @@ def run_wrapper(
             run_info = run_info._replace(cutoff=int(np.ceil(run_info.cutoff)))
 
         self.logger.info("Starting to evaluate configuration %s" % run_info.config.config_id)
-        return super().run_wrapper(run_info=run_info)
+        run_info, run_value = super().run_wrapper(run_info=run_info)
+        return run_info, run_value
 
     def run(
             self,
 
@@ -213,12 +213,11 @@ def __init__(self,
 
         self.search_space_updates = search_space_updates
 
-        dataset_name_ = "" if dataset_name is None else dataset_name
         if logger_port is None:
             self.logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
         else:
             self.logger_port = logger_port
-        logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, ":" + dataset_name_)
+        logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, ":" + self.dataset_name)
         self.logger = get_named_client_logger(name=logger_name,
                                               port=self.logger_port)
         self.logger.info("initialised {}".format(self.__class__.__name__))