diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index 48f395609..e9344e213 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -101,7 +101,10 @@ class BaseTask:
     Base class for the tasks that serve as API to the pipelines.
     Args:
         seed (int), (default=1): seed to be used for reproducibility.
-        n_jobs (int), (default=1): number of consecutive processes to spawn.
+        n_jobs (int), (default=1):
+            number of consecutive processes to spawn.
+        n_threads (int), (default=1):
+            number of threads to use for each process.
         logging_config (Optional[Dict]): specifies configuration
             for logging, if None, it is loaded from the logging.yaml
         ensemble_size (int), (default=50): Number of models added to the ensemble built by
@@ -138,6 +141,7 @@ def __init__(
         self,
         seed: int = 1,
         n_jobs: int = 1,
+        n_threads: int = 1,
         logging_config: Optional[Dict] = None,
         ensemble_size: int = 50,
         ensemble_nbest: int = 50,
@@ -158,6 +162,7 @@ def __init__(
     ) -> None:
         self.seed = seed
         self.n_jobs = n_jobs
+        self.n_threads = n_threads
         self.ensemble_size = ensemble_size
         self.ensemble_nbest = ensemble_nbest
         self.max_models_on_disc = max_models_on_disc
@@ -189,6 +194,9 @@ def __init__(
         self.trajectory: Optional[List] = None
         self.dataset_name: Optional[str] = None
         self.cv_models_: Dict = {}
+        self.precision: Optional[int] = None
+        self.opt_metric: Optional[str] = None
+        self.dataset: Optional[BaseDataset] = None
 
         # By default try to use the TCP logging port or get a new port
         self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
@@ -394,6 +402,7 @@ def _clean_logger(self) -> None:
             self.logging_server.join(timeout=5)
             self.logging_server.terminate()
             del self.stop_logging_server
+            self._logger = None
 
     def _create_dask_client(self) -> None:
         """
@@ -408,7 +417,7 @@ def _create_dask_client(self) -> None:
             dask.distributed.LocalCluster(
                 n_workers=self.n_jobs,
                 processes=True,
-                threads_per_worker=1,
+                threads_per_worker=self.n_threads,
                 # We use the temporal directory to save the
                 # dask workers, because deleting workers
                 # more time than deleting backend directories
@@ -488,6 +497,23 @@ def _load_models(self) -> bool:
 
         return True
 
+    def _cleanup(self) -> None:
+        """
+        Closes the different servers created during api search.
+        Returns:
+                None
+        """
+        if hasattr(self, '_logger') and self._logger is not None:
+            self._logger.info("Closing the dask infrastructure")
+            self._close_dask_client()
+            self._logger.info("Finished closing the dask infrastructure")
+
+            # Clean up the logger
+            self._logger.info("Starting to clean up the logger")
+            self._clean_logger()
+        else:
+            self._close_dask_client()
+
     def _load_best_individual_model(self) -> SingleBest:
         """
         In case of failure during ensemble building,
@@ -725,6 +751,38 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs:
                               save_external=True)
         return
 
+    def run_traditional_ml(
+        self,
+        current_task_name: str,
+        runtime_limit: int,
+        func_eval_time_limit_secs: int
+    ) -> None:
+        """
+        This function can be used to run the suite of traditional machine
+        learning models during the current task (for e.g, ensemble fit, search)
+
+        Args:
+            current_task_name (str): name of the current task,
+            runtime_limit (int): time limit for fitting traditional models,
+            func_eval_time_limit_secs (int): Time limit
+                for a single call to the machine learning model.
+                Model fitting will be terminated if the machine
+                learning algorithm runs over the time limit.
+        """
+        assert self._logger is not None  # for mypy compliancy
+        if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
+            self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
+        else:
+            traditional_task_name = 'runTraditional'
+            self._stopwatch.start_task(traditional_task_name)
+            elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
+            time_for_traditional = int(runtime_limit - elapsed_time)
+            self._do_traditional_prediction(
+                func_eval_time_limit_secs=func_eval_time_limit_secs,
+                time_left=time_for_traditional,
+            )
+            self._stopwatch.stop_task(traditional_task_name)
+
     def _search(
         self,
         optimize_metric: str,
@@ -819,8 +877,10 @@ def _search(
 
         if self.task_type != dataset.task_type:
             raise ValueError("Incompatible dataset entered for current task,"
-                             "expected dataset to have task type :{} got "
+                             "expected dataset to have task type :{} but got "
                              ":{}".format(self.task_type, dataset.task_type))
+        if precision not in [16, 32, 64]:
+            raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision))
 
         # Initialise information needed for the experiment
         experiment_task_name: str = 'runSearch'
@@ -866,7 +926,7 @@ def _search(
         # If no dask client was provided, we create one, so that we can
         # start a ensemble process in parallel to smbo optimize
         if (
-            self._dask_client is None and (self.ensemble_size > 0 or self.n_jobs is not None and self.n_jobs > 1)
+            self._dask_client is None and (self.ensemble_size > 0 or self.n_jobs > 1)
         ):
             self._create_dask_client()
         else:
@@ -895,31 +955,25 @@ def _search(
             )
 
         # ============> Run dummy predictions
-        dummy_task_name = 'runDummy'
-        self._stopwatch.start_task(dummy_task_name)
-        self._do_dummy_prediction()
-        self._stopwatch.stop_task(dummy_task_name)
+        # We only want to run dummy predictions in case we want to build an ensemble
+        if self.ensemble_size > 0:
+            dummy_task_name = 'runDummy'
+            self._stopwatch.start_task(dummy_task_name)
+            self._do_dummy_prediction()
+            self._stopwatch.stop_task(dummy_task_name)
 
         # ============> Run traditional ml
-
-        if enable_traditional_pipeline:
-            if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
-                self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
-            else:
-                traditional_task_name = 'runTraditional'
-                self._stopwatch.start_task(traditional_task_name)
-                elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
-                # We want time for at least 1 Neural network in SMAC
-                time_for_traditional = int(
-                    self._time_for_task - elapsed_time - func_eval_time_limit_secs
-                )
-                self._do_traditional_prediction(
-                    func_eval_time_limit_secs=func_eval_time_limit_secs,
-                    time_left=time_for_traditional,
-                )
-                self._stopwatch.stop_task(traditional_task_name)
+        # We only want to run traditional predictions in case we want to build an ensemble
+        # We want time for at least 1 Neural network in SMAC
+        if enable_traditional_pipeline and self.ensemble_size > 0:
+            traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs)
+            self.run_traditional_ml(current_task_name=self.dataset_name,
+                                    runtime_limit=traditional_runtime_limit,
+                                    func_eval_time_limit_secs=func_eval_time_limit_secs)
 
         # ============> Starting ensemble
+        self.precision = precision
+        self.opt_metric = optimize_metric
         elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name)
         time_left_for_ensembles = max(0, total_walltime_limit - elapsed_time)
         proc_ensemble = None
@@ -936,26 +990,12 @@ def _search(
             self._logger.info("Starting ensemble")
             ensemble_task_name = 'ensemble'
             self._stopwatch.start_task(ensemble_task_name)
-            proc_ensemble = EnsembleBuilderManager(
-                start_time=time.time(),
-                time_left_for_ensembles=time_left_for_ensembles,
-                backend=copy.deepcopy(self._backend),
-                dataset_name=str(dataset.dataset_name),
-                output_type=STRING_TO_OUTPUT_TYPES[dataset.output_type],
-                task_type=STRING_TO_TASK_TYPES[self.task_type],
-                metrics=[self._metric],
-                opt_metric=optimize_metric,
-                ensemble_size=self.ensemble_size,
-                ensemble_nbest=self.ensemble_nbest,
-                max_models_on_disc=self.max_models_on_disc,
-                seed=self.seed,
-                max_iterations=None,
-                read_at_most=sys.maxsize,
-                ensemble_memory_limit=self._memory_limit,
-                random_state=self.seed,
-                precision=precision,
-                logger_port=self._logger_port,
-            )
+            proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles,
+                                                        ensemble_size=self.ensemble_size,
+                                                        ensemble_nbest=self.ensemble_nbest,
+                                                        precision=precision,
+                                                        optimize_metric=self.opt_metric
+                                                        )
             self._stopwatch.stop_task(ensemble_task_name)
 
         # ==> Run SMAC
@@ -1035,18 +1075,12 @@ def _search(
                 pd.DataFrame(self.ensemble_performance_history).to_json(
                     os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
 
-        self._logger.info("Closing the dask infrastructure")
-        self._close_dask_client()
-        self._logger.info("Finished closing the dask infrastructure")
-
         if load_models:
             self._logger.info("Loading models...")
             self._load_models()
             self._logger.info("Finished loading models...")
 
-        # Clean up the logger
-        self._logger.info("Starting to clean up the logger")
-        self._clean_logger()
+        self._cleanup()
 
         return self
 
@@ -1241,7 +1275,7 @@ def fit_pipeline(self,
         dataset_requirements = get_dataset_requirements(
             info=self._get_required_dataset_properties(dataset))
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
-        self._backend.save_datamanager(dataset)
+        self._backend.replace_datamanager(dataset)
 
         if self._logger is None:
             self._logger = self._get_logger(dataset.dataset_name)
@@ -1333,6 +1367,207 @@ def fit_pipeline(self,
 
         return fitted_pipeline, run_info, run_value, dataset
 
+    def fit_ensemble(
+            self,
+            optimize_metric: Optional[str] = None,
+            precision: Optional[int] = None,
+            ensemble_nbest: int = 50,
+            ensemble_size: int = 50,
+            load_models: bool = True,
+            time_for_task: int = 100,
+            func_eval_time_limit_secs: int = 50,
+            enable_traditional_pipeline: bool = True,
+    ) -> 'BaseTask':
+        """
+        Enables post-hoc fitting of the ensemble after the `search()`
+        method is finished. This method creates an ensemble using all
+        the models stored on disk during the smbo run.
+
+        Args:
+            optimize_metric (str): name of the metric that is used to
+                evaluate a pipeline. if not specified, value passed to search will be used
+            precision (int), (default=32): Numeric precision used when loading
+                ensemble data. Can be either 16, 32 or 64.
+            ensemble_nbest (Optional[int]):
+                only consider the ensemble_nbest models to build the ensemble.
+                If None, uses the value stored in class attribute `ensemble_nbest`.
+            ensemble_size (int) (default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            enable_traditional_pipeline (bool), (default=True):
+                We fit traditional machine learning algorithms
+                (LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM)
+                prior building PyTorch Neural Networks. You can disable this
+                feature by turning this flag to False. All machine learning
+                algorithms that are fitted during search() are considered for
+                ensemble building.
+            load_models (bool), (default=True): Whether to load the
+                models after fitting AutoPyTorch.
+            time_for_task (int), (default=100): Time limit
+                in seconds for the search of appropriate models.
+                By increasing this value, autopytorch has a higher
+                chance of finding better models.
+            func_eval_time_limit_secs (int), (default=None): Time limit
+                for a single call to the machine learning model.
+                Model fitting will be terminated if the machine
+                learning algorithm runs over the time limit. Set
+                this value high enough so that typical machine
+                learning algorithms can be fit on the training
+                data.
+                When set to None, this time will automatically be set to
+                total_walltime_limit // 2 to allow enough time to fit
+                at least 2 individual machine learning algorithms.
+                Set to np.inf in case no time limit is desired.
+
+        Returns:
+            self
+        """
+        # Make sure that input is valid
+        if self.dataset is None or self.opt_metric is None:
+            raise ValueError("fit_ensemble() can only be called after `search()`. "
+                             "Please call the `search()` method of {} prior to "
+                             "fit_ensemble().".format(self.__class__.__name__))
+
+        if precision not in [16, 32, 64]:
+            raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision))
+
+        if self._logger is None:
+            self._logger = self._get_logger(self.dataset.dataset_name)
+
+        # Create a client if needed
+        if self._dask_client is None:
+            self._create_dask_client()
+        else:
+            self._is_dask_client_internally_created = False
+
+        ensemble_fit_task_name = 'EnsembleFit'
+        self._stopwatch.start_task(ensemble_fit_task_name)
+        if enable_traditional_pipeline:
+            if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_for_task:
+                self._logger.warning(
+                    'Time limit for a single run is higher than total time '
+                    'limit. Capping the limit for a single run to the total '
+                    'time given to Ensemble fit (%f)' % time_for_task
+                )
+                func_eval_time_limit_secs = time_for_task
+
+            # Make sure that at least 2 models are created for the ensemble process
+            num_models = time_for_task // func_eval_time_limit_secs
+            if num_models < 2:
+                func_eval_time_limit_secs = time_for_task // 2
+                self._logger.warning(
+                    "Capping the func_eval_time_limit_secs to {} to have "
+                    "time for at least 2 models to ensemble.".format(
+                        func_eval_time_limit_secs
+                    )
+                )
+        # ============> Run Dummy predictions
+        dummy_task_name = 'runDummy'
+        self._stopwatch.start_task(dummy_task_name)
+        self._do_dummy_prediction()
+        self._stopwatch.stop_task(dummy_task_name)
+
+        # ============> Run traditional ml
+        if enable_traditional_pipeline:
+            self.run_traditional_ml(current_task_name=ensemble_fit_task_name,
+                                    runtime_limit=time_for_task,
+                                    func_eval_time_limit_secs=func_eval_time_limit_secs)
+
+        elapsed_time = self._stopwatch.wall_elapsed(ensemble_fit_task_name)
+        time_left_for_ensemble = int(time_for_task - elapsed_time)
+        manager = self._init_ensemble_builder(
+            time_left_for_ensembles=time_left_for_ensemble,
+            optimize_metric=self.opt_metric if optimize_metric is None else optimize_metric,
+            precision=self.precision if precision is None else precision,
+            ensemble_size=ensemble_size,
+            ensemble_nbest=ensemble_nbest,
+        )
+
+        manager.build_ensemble(self._dask_client)
+        future = manager.futures.pop()
+        result = future.result()
+        if result is None:
+            raise ValueError("Errors occurred while building the ensemble - please"
+                             " check the log file and command line output for error messages.")
+        self.ensemble_performance_history, _, _, _ = result
+
+        if load_models:
+            self._load_models()
+
+        self._stopwatch.stop_task(ensemble_fit_task_name)
+
+        self._cleanup()
+
+        return self
+
+    def _init_ensemble_builder(
+            self,
+            time_left_for_ensembles: float,
+            optimize_metric: str,
+            ensemble_nbest: int,
+            ensemble_size: int,
+            precision: int = 32,
+    ) -> EnsembleBuilderManager:
+        """
+        Initializes an `EnsembleBuilderManager`.
+        Args:
+            time_left_for_ensembles (float):
+                Time (in seconds) allocated to building the ensemble
+            optimize_metric (str):
+                Name of the metric to optimize the ensemble.
+            ensemble_nbest (int):
+                only consider the ensemble_nbest models to build the ensemble.
+            ensemble_size (int):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            precision (int), (default=32): Numeric precision used when loading
+                ensemble data. Can be either 16, 32 or 64.
+
+        Returns:
+            EnsembleBuilderManager
+        """
+        if self._logger is None:
+            raise ValueError("logger should be initialized to fit ensemble")
+        if self.dataset is None:
+            raise ValueError("ensemble can only be initialised after or during `search()`. "
+                             "Please call the `search()` method of {}.".format(self.__class__.__name__))
+
+        self._logger.info("Starting ensemble")
+        ensemble_task_name = 'ensemble'
+        self._stopwatch.start_task(ensemble_task_name)
+
+        # Use the current thread to start the ensemble builder process
+        # The function ensemble_builder_process will internally create a ensemble
+        # builder in the provide dask client
+        required_dataset_properties = {'task_type': self.task_type,
+                                       'output_type': self.dataset.output_type}
+        proc_ensemble = EnsembleBuilderManager(
+            start_time=time.time(),
+            time_left_for_ensembles=time_left_for_ensembles,
+            backend=copy.deepcopy(self._backend),
+            dataset_name=str(self.dataset.dataset_name),
+            output_type=STRING_TO_OUTPUT_TYPES[self.dataset.output_type],
+            task_type=STRING_TO_TASK_TYPES[self.task_type],
+            metrics=[self._metric] if self._metric is not None else get_metrics(
+                dataset_properties=required_dataset_properties, names=[optimize_metric]),
+            opt_metric=optimize_metric,
+            ensemble_size=ensemble_size,
+            ensemble_nbest=ensemble_nbest,
+            max_models_on_disc=self.max_models_on_disc,
+            seed=self.seed,
+            max_iterations=None,
+            read_at_most=sys.maxsize,
+            ensemble_memory_limit=self._memory_limit,
+            random_state=self.seed,
+            precision=precision,
+            logger_port=self._logger_port,
+        )
+        self._stopwatch.stop_task(ensemble_task_name)
+
+        return proc_ensemble
+
     def predict(
         self,
         X_test: np.ndarray,
@@ -1382,7 +1617,7 @@ def predict(
 
         predictions = self.ensemble_.predict(all_predictions)
 
-        self._clean_logger()
+        self._cleanup()
 
         return predictions
 
@@ -1419,10 +1654,7 @@ def __getstate__(self) -> Dict[str, Any]:
         return self.__dict__
 
     def __del__(self) -> None:
-        # Clean up the logger
-        self._clean_logger()
-
-        self._close_dask_client()
+        self._cleanup()
 
         # When a multiprocessing work is done, the
         # objects are deleted. We don't want to delete run areas
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
index 209853c6e..b261f3b0f 100644
--- a/autoPyTorch/api/tabular_classification.py
+++ b/autoPyTorch/api/tabular_classification.py
@@ -27,11 +27,14 @@
 class TabularClassificationTask(BaseTask):
     """
     Tabular Classification API to the pipelines.
+
     Args:
         seed (int):
             seed to be used for reproducibility.
         n_jobs (int), (default=1):
             number of consecutive processes to spawn.
+        n_threads (int), (default=1):
+            number of threads to use for each process.
         logging_config (Optional[Dict]):
             specifies configuration for logging, if None, it is loaded from the logging.yaml
         ensemble_size (int), (default=50):
@@ -63,6 +66,7 @@ def __init__(
         self,
         seed: int = 1,
         n_jobs: int = 1,
+        n_threads: int = 1,
         logging_config: Optional[Dict] = None,
         ensemble_size: int = 50,
         ensemble_nbest: int = 50,
@@ -83,6 +87,7 @@ def __init__(
         super().__init__(
             seed=seed,
             n_jobs=n_jobs,
+            n_threads=n_threads,
             logging_config=logging_config,
             ensemble_size=ensemble_size,
             ensemble_nbest=ensemble_nbest,
@@ -277,6 +282,8 @@ def search(
                          y_test=y_test,
                          dataset_name=dataset_name)
 
+        if self.dataset is None:
+            raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py
index ed7e1c4f6..cbaaa9099 100644
--- a/autoPyTorch/api/tabular_regression.py
+++ b/autoPyTorch/api/tabular_regression.py
@@ -27,9 +27,13 @@
 class TabularRegressionTask(BaseTask):
     """
     Tabular Regression API to the pipelines.
+
     Args:
         seed (int): seed to be used for reproducibility.
-        n_jobs (int), (default=1): number of consecutive processes to spawn.
+        n_jobs (int), (default=1):
+            number of consecutive processes to spawn.
+        n_threads (int), (default=1):
+            number of threads to use for each process.
         logging_config (Optional[Dict]): specifies configuration
             for logging, if None, it is loaded from the logging.yaml
         ensemble_size (int), (default=50): Number of models added to the ensemble built by
@@ -50,11 +54,11 @@ class TabularRegressionTask(BaseTask):
             Otherwise specifies set of components not to use. Incompatible with include
             components
     """
-
     def __init__(
             self,
             seed: int = 1,
             n_jobs: int = 1,
+            n_threads: int = 1,
             logging_config: Optional[Dict] = None,
             ensemble_size: int = 50,
             ensemble_nbest: int = 50,
@@ -75,6 +79,7 @@ def __init__(
         super().__init__(
             seed=seed,
             n_jobs=n_jobs,
+            n_threads=n_threads,
             logging_config=logging_config,
             ensemble_size=ensemble_size,
             ensemble_nbest=ensemble_nbest,
@@ -263,6 +268,8 @@ def search(
                          y_test=y_test,
                          dataset_name=dataset_name)
 
+        if self.dataset is None:
+            raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index 0106a3aa8..700af1050 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -1,5 +1,5 @@
 import logging
-import typing
+from typing import List, Optional, Set, Tuple, Union
 
 import numpy as np
 
@@ -12,8 +12,8 @@
 from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
-SUPPORTED_FEAT_TYPES = typing.Union[
-    typing.List,
+SUPPORTED_FEAT_TYPES = Union[
+    List,
     pd.DataFrame,
     np.ndarray,
     scipy.sparse.bsr_matrix,
@@ -29,66 +29,64 @@
 class BaseFeatureValidator(BaseEstimator):
     """
     A class to pre-process features. In this regards, the format of the data is checked,
-    and if applicable, features are encoded
+    and if applicable, features are encoded.
+
     Attributes:
         feat_type (List[str]):
             List of the column types found by this estimator during fit.
         data_type (str):
             Class name of the data type provided during fit.
-        encoder (typing.Optional[BaseEstimator])
+        encoder (Optional[BaseEstimator])
             Host a encoder object if the data requires transformation (for example,
-            if provided a categorical column in a pandas DataFrame)
-        enc_columns (typing.List[str])
-            List of columns that were encoded.
+            if provided a categorical column in a pandas DataFrame).
     """
-    def __init__(self,
-                 logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
-                                                      ]] = None,
-                 ) -> None:
+    def __init__(
+        self,
+        logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None,
+    ) -> None:
         # Register types to detect unsupported data format changes
-        self.feat_type = None  # type: typing.Optional[typing.List[str]]
-        self.data_type = None  # type: typing.Optional[type]
-        self.dtypes = []  # type: typing.List[str]
-        self.column_order = []  # type: typing.List[str]
+        self.feat_type: Optional[List[str]] = None
+        self.data_type: Optional[type] = None
+        self.dtypes: List[str] = []
+        self.column_order: List[str] = []
 
-        self.encoder = None  # type: typing.Optional[BaseEstimator]
-        self.enc_columns = []  # type: typing.List[str]
+        self.column_transformer: Optional[BaseEstimator] = None
 
-        self.logger: typing.Union[
+        self.logger: Union[
             PicklableClientLogger, logging.Logger
         ] = logger if logger is not None else logging.getLogger(__name__)
 
         # Required for dataset properties
-        self.num_features = None  # type: typing.Optional[int]
-        self.categories = []  # type: typing.List[typing.List[int]]
-        self.categorical_columns: typing.List[int] = []
-        self.numerical_columns: typing.List[int] = []
-        # column identifiers may be integers or strings
-        self.null_columns: typing.Set[str] = set()
+        self.num_features: Optional[int] = None
+        self.categories: List[List[int]] = []
+        self.categorical_columns: List[int] = []
+        self.numerical_columns: List[int] = []
+
+        self.all_nan_columns: Optional[Set[Union[int, str]]] = None
 
         self._is_fitted = False
 
     def fit(
         self,
         X_train: SUPPORTED_FEAT_TYPES,
-        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
+        X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
     ) -> BaseEstimator:
         """
         Validates and fit a categorical encoder (if needed) to the features.
         The supported data types are List, numpy arrays and pandas DataFrames.
         CSR sparse data types are also supported
 
-        Arguments:
+        Args:
             X_train (SUPPORTED_FEAT_TYPES):
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
-            X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
+            X_test (Optional[SUPPORTED_FEAT_TYPES]):
                 A hold out set of data used for checking
         """
 
         # If a list was provided, it will be converted to pandas
         if isinstance(X_train, list):
-            X_train, X_test = self.list_to_dataframe(X_train, X_test)
+            X_train, X_test = self.list_to_pandas(X_train, X_test)
 
         self._check_data(X_train)
 
@@ -114,7 +112,7 @@ def _fit(
         X: SUPPORTED_FEAT_TYPES,
     ) -> BaseEstimator:
         """
-        Arguments:
+        Args:
             X (SUPPORTED_FEAT_TYPES):
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
@@ -122,6 +120,7 @@ def _fit(
             self:
                 The fitted base estimator
         """
+
         raise NotImplementedError()
 
     def _check_data(
@@ -131,11 +130,12 @@ def _check_data(
         """
         Feature dimensionality and data type checks
 
-        Arguments:
+        Args:
             X (SUPPORTED_FEAT_TYPES):
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
         """
+
         raise NotImplementedError()
 
     def transform(
@@ -143,7 +143,7 @@ def transform(
         X: SUPPORTED_FEAT_TYPES,
     ) -> np.ndarray:
         """
-        Arguments:
+        Args:
             X_train (SUPPORTED_FEAT_TYPES):
                 A set of features, whose categorical features are going to be
                 transformed
@@ -152,4 +152,30 @@ def transform(
             np.ndarray:
                 The transformed array
         """
+
+        raise NotImplementedError()
+
+    def list_to_pandas(
+        self,
+        X_train: SUPPORTED_FEAT_TYPES,
+        X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
+        """
+        Converts a list to a pandas DataFrame. In this process, column types are inferred.
+
+        If test data is provided, we proactively match it to train data
+
+        Args:
+            X_train (SUPPORTED_FEAT_TYPES):
+                A set of features that are going to be validated (type and dimensionality
+                checks) and a encoder fitted in the case the data needs encoding
+            X_test (Optional[SUPPORTED_FEAT_TYPES]):
+                A hold out set of data used for checking
+        Returns:
+            pd.DataFrame:
+                transformed train data from list to pandas DataFrame
+            pd.DataFrame:
+                transformed test data from list to pandas DataFrame
+        """
+
         raise NotImplementedError()
diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py
index dba9c19e3..f191e985b 100644
--- a/autoPyTorch/data/base_target_validator.py
+++ b/autoPyTorch/data/base_target_validator.py
@@ -1,5 +1,5 @@
 import logging
-import typing
+from typing import List, Optional, Union, cast
 
 import numpy as np
 
@@ -12,8 +12,8 @@
 from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
-SUPPORTED_TARGET_TYPES = typing.Union[
-    typing.List,
+SUPPORTED_TARGET_TYPES = Union[
+    List,
     pd.Series,
     pd.DataFrame,
     np.ndarray,
@@ -35,48 +35,50 @@ class BaseTargetValidator(BaseEstimator):
         is_classification (bool):
             A bool that indicates if the validator should operate in classification mode.
             During classification, the targets are encoded.
-        encoder (typing.Optional[BaseEstimator]):
+        encoder (Optional[BaseEstimator]):
             Host a encoder object if the data requires transformation (for example,
             if provided a categorical column in a pandas DataFrame)
-        enc_columns (typing.List[str])
+        enc_columns (List[str])
             List of columns that where encoded
     """
     def __init__(self,
                  is_classification: bool = False,
-                 logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
-                                                      ]] = None,
+                 logger: Optional[Union[PicklableClientLogger,
+                                        logging.Logger
+                                        ]
+                                  ] = None,
                  ) -> None:
         self.is_classification = is_classification
 
-        self.data_type = None  # type: typing.Optional[type]
+        self.data_type: Optional[type] = None
 
-        self.encoder = None  # type: typing.Optional[BaseEstimator]
+        self.encoder: Optional[BaseEstimator] = None
 
-        self.out_dimensionality = None  # type: typing.Optional[int]
-        self.type_of_target = None  # type: typing.Optional[str]
+        self.out_dimensionality: Optional[int] = None
+        self.type_of_target: Optional[str] = None
 
-        self.logger: typing.Union[
+        self.logger: Union[
             PicklableClientLogger, logging.Logger
         ] = logger if logger is not None else logging.getLogger(__name__)
 
         # Store the dtype for remapping to correct type
-        self.dtype = None  # type: typing.Optional[type]
+        self.dtype: Optional[type] = None
 
         self._is_fitted = False
 
     def fit(
         self,
         y_train: SUPPORTED_TARGET_TYPES,
-        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
+        y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
     ) -> BaseEstimator:
         """
         Validates and fit a categorical encoder (if needed) to the targets
         The supported data types are List, numpy arrays and pandas DataFrames.
 
-        Arguments:
+        Args:
             y_train (SUPPORTED_TARGET_TYPES)
                 A set of targets set aside for training
-            y_test (typing.Union[SUPPORTED_TARGET_TYPES])
+            y_test (Union[SUPPORTED_TARGET_TYPES])
                 A hold out set of data used of the targets. It is also used to fit the
                 categories of the encoder.
         """
@@ -95,8 +97,8 @@ def fit(
                                      np.shape(y_test)
                                  ))
             if isinstance(y_train, pd.DataFrame):
-                y_train = typing.cast(pd.DataFrame, y_train)
-                y_test = typing.cast(pd.DataFrame, y_test)
+                y_train = cast(pd.DataFrame, y_train)
+                y_test = cast(pd.DataFrame, y_test)
                 if y_train.columns.tolist() != y_test.columns.tolist():
                     raise ValueError(
                         "Train and test targets must both have the same columns, yet "
@@ -127,24 +129,24 @@ def fit(
     def _fit(
         self,
         y_train: SUPPORTED_TARGET_TYPES,
-        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
+        y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
     ) -> BaseEstimator:
         """
-        Arguments:
+        Args:
             y_train (SUPPORTED_TARGET_TYPES)
                 The labels of the current task. They are going to be encoded in case
                 of classification
-            y_test (typing.Optional[SUPPORTED_TARGET_TYPES])
+            y_test (Optional[SUPPORTED_TARGET_TYPES])
                 A holdout set of labels
         """
         raise NotImplementedError()
 
     def transform(
         self,
-        y: typing.Union[SUPPORTED_TARGET_TYPES],
+        y: Union[SUPPORTED_TARGET_TYPES],
     ) -> np.ndarray:
         """
-        Arguments:
+        Args:
             y (SUPPORTED_TARGET_TYPES)
                 A set of targets that are going to be encoded if the current task
                 is classification
@@ -161,8 +163,8 @@ def inverse_transform(
         """
         Revert any encoding transformation done on a target array
 
-        Arguments:
-            y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]):
+        Args:
+            y (Union[np.ndarray, pd.DataFrame, pd.Series]):
                 Target array to be transformed back to original form before encoding
         Returns:
             np.ndarray:
diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py
index 7528d56ab..4ef54c665 100644
--- a/autoPyTorch/data/base_validator.py
+++ b/autoPyTorch/data/base_validator.py
@@ -58,7 +58,7 @@ def fit(
             + Checks for dimensionality as well as missing values are performed.
             + If performing a classification task, the data is going to be encoded
 
-        Arguments:
+        Args:
             X_train (SUPPORTED_FEAT_TYPES):
                 A set of features that are going to be validated (type and dimensionality
                 checks). If this data contains categorical columns, an encoder is going to
@@ -102,7 +102,7 @@ def transform(
         """
         Transform the given target or features to a numpy array
 
-        Arguments:
+        Args:
             X (SUPPORTED_FEAT_TYPES):
                 A set of features to transform
             y (typing.Optional[SUPPORTED_TARGET_TYPES]):
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index f978046ad..10736d473 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -1,5 +1,5 @@
 import functools
-import typing
+from typing import Dict, List, Optional, Tuple, cast
 
 import numpy as np
 
@@ -9,15 +9,100 @@
 import scipy.sparse
 
 import sklearn.utils
-from sklearn import preprocessing
 from sklearn.base import BaseEstimator
 from sklearn.compose import ColumnTransformer
 from sklearn.exceptions import NotFittedError
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
 
 from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES
 
 
+def _create_column_transformer(
+    preprocessors: Dict[str, List[BaseEstimator]],
+    numerical_columns: List[str],
+    categorical_columns: List[str],
+) -> ColumnTransformer:
+    """
+    Given a dictionary of preprocessors, this function
+    creates a sklearn column transformer with appropriate
+    columns associated with their preprocessors.
+
+    Args:
+        preprocessors (Dict[str, List[BaseEstimator]]):
+            Dictionary containing list of numerical and categorical preprocessors.
+        numerical_columns (List[str]):
+            List of names of numerical columns
+        categorical_columns (List[str]):
+            List of names of categorical columns
+
+    Returns:
+        ColumnTransformer
+    """
+
+    numerical_pipeline = 'drop'
+    categorical_pipeline = 'drop'
+    if len(numerical_columns) > 0:
+        numerical_pipeline = make_pipeline(*preprocessors['numerical'])
+    if len(categorical_columns) > 0:
+        categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+
+    return ColumnTransformer([
+        ('categorical_pipeline', categorical_pipeline, categorical_columns),
+        ('numerical_pipeline', numerical_pipeline, numerical_columns)],
+        remainder='drop'
+    )
+
+
+def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
+    """
+    This function creates a Dictionary containing a list
+    of numerical and categorical preprocessors
+
+    Returns:
+        Dict[str, List[BaseEstimator]]
+    """
+    preprocessors: Dict[str, List[BaseEstimator]] = dict()
+
+    # Categorical Preprocessors
+    onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
+    categorical_imputer = SimpleImputer(strategy='constant', copy=False)
+
+    # Numerical Preprocessors
+    numerical_imputer = SimpleImputer(strategy='median', copy=False)
+    standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
+
+    preprocessors['categorical'] = [categorical_imputer, onehot_encoder]
+    preprocessors['numerical'] = [numerical_imputer, standard_scaler]
+
+    return preprocessors
+
+
 class TabularFeatureValidator(BaseFeatureValidator):
+
+    @staticmethod
+    def _comparator(cmp1: str, cmp2: str) -> int:
+        """Order so that categorical columns come left and numerical columns come right
+
+        Args:
+            cmp1 (str): First variable to compare
+            cmp2 (str): Second variable to compare
+
+        Raises:
+            ValueError: if the values of the variables to compare
+            are not in 'categorical' or 'numerical'
+
+        Returns:
+            int: either [0, -1, 1]
+        """
+        choices = ['categorical', 'numerical']
+        if cmp1 not in choices or cmp2 not in choices:
+            raise ValueError('The comparator for the column order only accepts {}, '
+                             'but got {} and {}'.format(choices, cmp1, cmp2))
+        idx1, idx2 = choices.index(cmp1), choices.index(cmp2)
+        return idx1 - idx2
+
     def _fit(
         self,
         X: SUPPORTED_FEAT_TYPES,
@@ -27,7 +112,7 @@ def _fit(
         features (from categorical for example) to a numerical value that further stages
         will be able to use
 
-        Arguments:
+        Args:
             X (SUPPORTED_FEAT_TYPES):
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
@@ -39,79 +124,45 @@ def _fit(
         # The final output of a validator is a numpy array. But pandas
         # gives us information about the column dtype
         if isinstance(X, np.ndarray):
-            X = self.numpy_array_to_pandas(X)
+
+            X = self.numpy_to_pandas(X)
+            # Replace the data type from the previously saved type.
+            self.data_type = type(X)
+            # save all the information about the column order and data types
+            self._check_data(X)
 
         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
-            X = typing.cast(pd.DataFrame, X)
-            # Treat a column with all instances a NaN as numerical
-            # This will prevent doing encoding to a categorical column made completely
-            # out of nan values -- which will trigger a fail, as encoding is not supported
-            # with nan values.
-            # Columns that are completely made of NaN values are provided to the pipeline
-            # so that later stages decide how to handle them
-
-            # Clear whatever null column markers we had previously
-            self.null_columns.clear()
-            if np.any(pd.isnull(X)):
-                for column in X.columns:
-                    if X[column].isna().all():
-                        self.null_columns.add(column)
-                        X[column] = pd.to_numeric(X[column])
-                        # Also note this change in self.dtypes
-                        if len(self.dtypes) != 0:
-                            self.dtypes[list(X.columns).index(column)] = X[column].dtype
-
-            if not X.select_dtypes(include='object').empty:
-                X = self.infer_objects(X)
 
-            self._check_data(X)
-            self.enc_columns, self.feat_type = self._get_columns_to_encode(X)
-
-            if len(self.enc_columns) > 0:
-                X = self.impute_nan_in_categories(X)
-
-                self.encoder = ColumnTransformer(
-                    [
-                        ("encoder",
-                         preprocessing.OrdinalEncoder(
-                             handle_unknown='use_encoded_value',
-                             unknown_value=-1,
-                         ), self.enc_columns)],
-                    remainder="passthrough"
-                )
+            X = cast(pd.DataFrame, X)
 
-                # Mypy redefinition
-                assert self.encoder is not None
-                self.encoder.fit(X)
-
-                # The column transformer reoders the feature types - we therefore need to change
-                # it as well
-                # This means columns are shifted to the right
-                def comparator(cmp1: str, cmp2: str) -> int:
-                    if (
-                        cmp1 == 'categorical' and cmp2 == 'categorical'
-                        or cmp1 == 'numerical' and cmp2 == 'numerical'
-                    ):
-                        return 0
-                    elif cmp1 == 'categorical' and cmp2 == 'numerical':
-                        return -1
-                    elif cmp1 == 'numerical' and cmp2 == 'categorical':
-                        return 1
-                    else:
-                        raise ValueError((cmp1, cmp2))
-
-                self.feat_type = sorted(
-                    self.feat_type,
-                    key=functools.cmp_to_key(comparator)
-                )
+            self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()])
 
-                self.categories = [
-                    # We fit an ordinal encoder, where all categorical
-                    # columns are shifted to the left
-                    list(range(len(cat)))
-                    for cat in self.encoder.transformers_[0][1].categories_
-                ]
+            categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
+
+            self.enc_columns = categorical_columns
+
+            preprocessors = get_tabular_preprocessors()
+            self.column_transformer = _create_column_transformer(
+                preprocessors=preprocessors,
+                numerical_columns=numerical_columns,
+                categorical_columns=categorical_columns,
+            )
+
+            # Mypy redefinition
+            assert self.column_transformer is not None
+            self.column_transformer.fit(X)
 
+            # The column transformer reorders the feature types
+            # therefore, we need to change the order of columns as well
+            # This means categorical columns are shifted to the left
+
+            self.feat_type = sorted(
+                feat_type,
+                key=functools.cmp_to_key(self._comparator)
+            )
+
+            # differently to categorical_columns and numerical_columns,
+            # this saves the index of the column.
             for i, type_ in enumerate(self.feat_type):
                 if 'numerical' in type_:
                     self.numerical_columns.append(i)
@@ -120,6 +171,7 @@ def comparator(cmp1: str, cmp2: str) -> int:
 
         # Lastly, store the number of features
         self.num_features = np.shape(X)[1]
+
         return self
 
     def transform(
@@ -130,7 +182,7 @@ def transform(
         Validates and fit a categorical encoder (if needed) to the features.
         The supported data types are List, numpy arrays and pandas DataFrames.
 
-        Arguments:
+        Args:
             X_train (SUPPORTED_FEAT_TYPES):
                 A set of features, whose categorical features are going to be
                 transformed
@@ -138,49 +190,70 @@ def transform(
         Return:
             np.ndarray:
                 The transformed array
+
+        Note:
+            The default transform performs the folloing:
+                * simple imputation for both
+                * scaling for numerical
+                * one-hot encoding for categorical
+            For example, here is a simple case
+            of which all the columns are categorical.
+                data = [
+                    {'A': 1, 'B': np.nan, 'C': np.nan},
+                    {'A': np.nan, 'B': 3, 'C': np.nan},
+                    {'A': 2, 'B': np.nan, 'C': np.nan}
+                ]
+            and suppose all the columns are categorical,
+            then
+                * `A` in {np.nan, 1, 2}
+                * `B` in {np.nan, 3}
+                * `C` in {np.nan} <=== it will be dropped.
+
+            So in the column A,
+                * np.nan ==> [1, 0, 0] (always the index 0)
+                * 1      ==> [0, 1, 0]
+                * 2      ==> [0, 0, 1]
+            in the column B,
+                * np.nan ==> [1, 0]
+                * 3      ==> [0, 1]
+            Therefore, by concatenating,
+                * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0]
+                * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1]
+                * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0]
+                ==> [
+                    [0, 1, 0, 1, 0],
+                    [1, 0, 0, 0, 1],
+                    [0, 0, 1, 1, 0]
+                ]
         """
         if not self._is_fitted:
             raise NotFittedError("Cannot call transform on a validator that is not fitted")
 
         # If a list was provided, it will be converted to pandas
         if isinstance(X, list):
-            X, _ = self.list_to_dataframe(X)
+            X, _ = self.list_to_pandas(X)
 
         if isinstance(X, np.ndarray):
-            X = self.numpy_array_to_pandas(X)
+            X = self.numpy_to_pandas(X)
 
         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
-            X = typing.cast(pd.DataFrame, X)
-            # If we had null columns in our fit call and we made them numeric, then:
-            # - If the columns are null even in transform, apply the same procedure.
-            # - Otherwise, substitute the values with np.NaN and then make the columns numeric.
-            # If the column is null here, but it was not in fit, it does not matter.
-            for column in self.null_columns:
-                # The column is not null, make it null since it was null in fit.
-                if not X[column].isna().all():
-                    X[column] = np.NaN
-                X[column] = pd.to_numeric(X[column])
-
-            # for the test set, if we have columns with only null values
-            # they will probably have a numeric type. If these columns were not
-            # with only null values in the train set, they should be converted
-            # to the type that they had during fitting.
-            for column in X.columns:
-                if X[column].isna().all():
-                    X[column] = X[column].astype(self.dtypes[list(X.columns).index(column)])
-
-            # Also remove the object dtype for new data
-            if not X.select_dtypes(include='object').empty:
-                X = self.infer_objects(X)
+            X = cast(pd.DataFrame, X)
 
         # Check the data here so we catch problems on new test data
         self._check_data(X)
-        # We also need to fillna on the transformation
-        # in case test data is provided
-        X = self.impute_nan_in_categories(X)
 
-        if self.encoder is not None:
-            X = self.encoder.transform(X)
+        # in case of test data being all none and train data
+        # having a value for a categorical column.
+        # We need to convert the column in test data to
+        # object otherwise the test column is interpreted as float
+        if len(self.categorical_columns) > 0:
+            categorical_columns = self.column_transformer.transformers_[0][-1]
+            for column in categorical_columns:
+                if X[column].isna().all():
+                    X[column] = X[column].astype('object')
+
+        if self.column_transformer is not None:
+            X = self.column_transformer.transform(X)
 
         # Sparse related transformations
         # Not all sparse format support index sorting
@@ -209,7 +282,7 @@ def _check_data(
         """
         Feature dimensionality and data type checks
 
-        Arguments:
+        Args:
             X (SUPPORTED_FEAT_TYPES):
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
@@ -246,125 +319,120 @@ def _check_data(
         # Then for Pandas, we do not support Nan in categorical columns
         if hasattr(X, "iloc"):
             # If entered here, we have a pandas dataframe
-            X = typing.cast(pd.DataFrame, X)
+            X = cast(pd.DataFrame, X)
 
             # Handle objects if possible
-            if not X.select_dtypes(include='object').empty:
+            exist_object_columns = has_object_columns(X.dtypes.values)
+            if exist_object_columns:
                 X = self.infer_objects(X)
 
-            # Define the column to be encoded here as the feature validator is fitted once
-            # per estimator
-            enc_columns, _ = self._get_columns_to_encode(X)
-
             column_order = [column for column in X.columns]
             if len(self.column_order) > 0:
                 if self.column_order != column_order:
-                    raise ValueError("Changing the column order of the features after fit() is "
-                                     "not supported. Fit() method was called with "
-                                     "{} whereas the new features have {} as type".format(self.column_order,
-                                                                                          column_order,)
-                                     )
+                    raise ValueError("The column order of the features must not be changed after fit(), but"
+                                     " the column order are different between training ({}) and"
+                                     " test ({}) datasets.".format(self.column_order, column_order))
             else:
                 self.column_order = column_order
 
             dtypes = [dtype.name for dtype in X.dtypes]
-            if len(self.dtypes) > 0:
-                if self.dtypes != dtypes:
-                    raise ValueError("Changing the dtype of the features after fit() is "
-                                     "not supported. Fit() method was called with "
-                                     "{} whereas the new features have {} as type".format(self.dtypes,
-                                                                                          dtypes,
-                                                                                          )
-                                     )
-            else:
-                self.dtypes = dtypes
 
-    def _get_columns_to_encode(
+            dtypes_diff = [s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]
+            if len(self.dtypes) == 0:
+                self.dtypes = dtypes
+            elif (
+                any(dtypes_diff)  # the dtypes of some columns are different in train and test dataset
+                and self.all_nan_columns is not None  # Ignore all_nan_columns is None
+                and len(set(X.columns[dtypes_diff]).difference(self.all_nan_columns)) != 0
+            ):
+                # The dtypes can be different if and only if the column belongs
+                # to all_nan_columns as these columns would be imputed.
+                raise ValueError("The dtype of the features must not be changed after fit(), but"
+                                 " the dtypes of some columns are different between training ({}) and"
+                                 " test ({}) datasets.".format(self.dtypes, dtypes))
+
+    def _get_columns_info(
         self,
         X: pd.DataFrame,
-    ) -> typing.Tuple[typing.List[str], typing.List[str]]:
+    ) -> Tuple[List[str], List[str], List[str]]:
         """
         Return the columns to be encoded from a pandas dataframe
 
-        Arguments:
+        Args:
             X (pd.DataFrame)
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
         Returns:
-            enc_columns (List[str]):
-                Columns to encode, if any
-            feat_type:
+            categorical_columns (List[str])
+                List of the names of categorical columns.
+            numerical_columns (List[str])
+                List of the names of numerical columns.
+            feat_type (List[str])
                 Type of each column numerical/categorical
         """
         # Register if a column needs encoding
-        enc_columns = []
-
+        numerical_columns = []
+        categorical_columns = []
         # Also, register the feature types for the estimator
         feat_type = []
 
         # Make sure each column is a valid type
-        for column in X.columns:
-            if X[column].dtype.name in ['category', 'bool']:
-
-                enc_columns.append(column)
+        for i, column in enumerate(X.columns):
+            if self.all_nan_columns is not None and column in self.all_nan_columns:
+                continue
+            column_dtype = self.dtypes[i]
+            err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
+                      "but input column {} has an invalid type `{}`.".format(column, column_dtype)
+            if column_dtype in ['category', 'bool']:
+                categorical_columns.append(column)
                 feat_type.append('categorical')
             # Move away from np.issubdtype as it causes
             # TypeError: data type not understood in certain pandas types
-            elif not is_numeric_dtype(X[column]):
-                if X[column].dtype.name == 'object':
-                    raise ValueError(
-                        "Input Column {} has invalid type object. "
-                        "Cast it to a valid dtype before using it in AutoPyTorch. "
-                        "Valid types are numerical, categorical or boolean. "
-                        "You can cast it to a valid dtype using "
-                        "pandas.Series.astype ."
-                        "If working with string objects, the following "
-                        "tutorial illustrates how to work with text data: "
-                        "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(
-                            # noqa: E501
-                            column,
-                        )
-                    )
-                elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
-                    X[column].dtype
-                ):
-                    raise ValueError(
-                        "AutoPyTorch does not support time and/or date datatype as given "
-                        "in column {}. Please convert the time information to a numerical value "
-                        "first. One example on how to do this can be found on "
-                        "https://stats.stackexchange.com/questions/311494/".format(
-                            column,
-                        )
-                    )
-                else:
-                    raise ValueError(
-                        "Input Column {} has unsupported dtype {}. "
-                        "Supported column types are categorical/bool/numerical dtypes. "
-                        "Make sure your data is formatted in a correct way, "
-                        "before feeding it to AutoPyTorch.".format(
-                            column,
-                            X[column].dtype.name,
-                        )
+            elif is_numeric_dtype(column_dtype):
+                feat_type.append('numerical')
+                numerical_columns.append(column)
+            elif column_dtype == 'object':
+                # TODO verify how would this happen when we always convert the object dtypes to category
+                raise TypeError(
+                    "{} Cast it to a valid dtype before feeding it to AutoPyTorch. "
+                    "You can cast it to a valid dtype using pandas.Series.astype."
+                    "If you are working with string objects, the following "
+                    "tutorial illustrates how to work with text data: "
+                    "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(
+                        # noqa: E501
+                        err_msg,
                     )
+                )
+            elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(column_dtype):
+                raise TypeError(
+                    "{} Convert the time information to a numerical value"
+                    " before feeding it to AutoPyTorch. "
+                    "One example of the conversion can be found on "
+                    "https://stats.stackexchange.com/questions/311494/".format(err_msg)
+                )
             else:
-                feat_type.append('numerical')
-        return enc_columns, feat_type
+                raise TypeError(
+                    "{} Make sure your data is formatted in a correct way"
+                    "before feeding it to AutoPyTorch.".format(err_msg)
+                )
+
+        return categorical_columns, numerical_columns, feat_type
 
-    def list_to_dataframe(
+    def list_to_pandas(
         self,
         X_train: SUPPORTED_FEAT_TYPES,
-        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
-    ) -> typing.Tuple[pd.DataFrame, typing.Optional[pd.DataFrame]]:
+        X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
         """
         Converts a list to a pandas DataFrame. In this process, column types are inferred.
 
         If test data is provided, we proactively match it to train data
 
-        Arguments:
+        Args:
             X_train (SUPPORTED_FEAT_TYPES):
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
-            X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
+            X_test (Optional[SUPPORTED_FEAT_TYPES]):
                 A hold out set of data used for checking
         Returns:
             pd.DataFrame:
@@ -374,7 +442,7 @@ def list_to_dataframe(
         """
 
         # If a list was provided, it will be converted to pandas
-        X_train = pd.DataFrame(data=X_train).infer_objects()
+        X_train = pd.DataFrame(data=X_train).convert_dtypes()
         self.logger.warning("The provided feature types to AutoPyTorch are of type list."
                             "Features have been interpreted as: {}".format([(col, t) for col, t in
                                                                             zip(X_train.columns, X_train.dtypes)]))
@@ -383,17 +451,18 @@ def list_to_dataframe(
                 self.logger.warning("Train features are a list while the provided test data"
                                     "is {}. X_test will be casted as DataFrame.".format(type(X_test))
                                     )
-            X_test = pd.DataFrame(data=X_test).infer_objects()
+            X_test = pd.DataFrame(data=X_test).convert_dtypes()
+
         return X_train, X_test
 
-    def numpy_array_to_pandas(
-        self,
+    @staticmethod
+    def numpy_to_pandas(
         X: np.ndarray,
     ) -> pd.DataFrame:
         """
         Converts a numpy array to pandas for type inference
 
-        Arguments:
+        Args:
             X (np.ndarray):
                 data to be interpreted.
 
@@ -408,7 +477,7 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
 
         This has to be done once, so the test and train data are treated equally
 
-        Arguments:
+        Args:
             X (pd.DataFrame):
                 data to be interpreted.
 
@@ -426,66 +495,33 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
                     self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}')
                     pass
         else:
+            # Calling for the first time to infer the categories
             X = X.infer_objects()
-            for column in X.columns:
-                if not is_numeric_dtype(X[column]):
+            for column, data_type in zip(X.columns, X.dtypes):
+                if not is_numeric_dtype(data_type):
                     X[column] = X[column].astype('category')
-            self.object_dtype_mapping = {column: X[column].dtype for column in X.columns}
+
+            # only numerical attributes and categories
+            self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
+
         self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
+
         return X
 
-    def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
-        """
-        impute missing values before encoding,
-        remove once sklearn natively supports
-        it in ordinal encoding. Sklearn issue:
-        "https://github.com/scikit-learn/scikit-learn/issues/17123)"
 
-        Arguments:
-            X (pd.DataFrame):
-                data to be interpreted.
+def has_object_columns(
+    feature_types: pd.Series,
+) -> bool:
+    """
+    Indicate whether on a Series of dtypes for a Pandas DataFrame
+    there exists one or more object columns.
 
-        Returns:
-            pd.DataFrame
-        """
+    Args:
+        feature_types (pd.Series): The feature types for a DataFrame.
 
-        # To be on the safe side, map always to the same missing
-        # value per column
-        if not hasattr(self, 'dict_nancol_to_missing'):
-            self.dict_missing_value_per_col: typing.Dict[str, typing.Any] = {}
-
-        # First make sure that we do not alter the type of the column which cause:
-        # TypeError: '<' not supported between instances of 'int' and 'str'
-        # in the encoding
-        for column in self.enc_columns:
-            if X[column].isna().any():
-                if column not in self.dict_missing_value_per_col:
-                    try:
-                        float(X[column].dropna().values[0])
-                        can_cast_as_number = True
-                    except Exception:
-                        can_cast_as_number = False
-                    if can_cast_as_number:
-                        # In this case, we expect to have a number as category
-                        # it might be string, but its value represent a number
-                        missing_value: typing.Union[str, int] = '-1' if isinstance(X[column].dropna().values[0],
-                                                                                   str) else -1
-                    else:
-                        missing_value = 'Missing!'
-
-                    # Make sure this missing value is not seen before
-                    # Do this check for categorical columns
-                    # else modify the value
-                    if hasattr(X[column], 'cat'):
-                        while missing_value in X[column].cat.categories:
-                            if isinstance(missing_value, str):
-                                missing_value += '0'
-                            else:
-                                missing_value += missing_value
-                    self.dict_missing_value_per_col[column] = missing_value
-
-                # Convert the frame in place
-                X[column].cat.add_categories([self.dict_missing_value_per_col[column]],
-                                             inplace=True)
-                X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True)
-        return X
+    Returns:
+        bool:
+            True if the DataFrame dtypes contain an object column, False
+            otherwise.
+    """
+    return np.dtype('O') in feature_types
diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py
index 239791768..7cbd88c38 100644
--- a/autoPyTorch/data/tabular_target_validator.py
+++ b/autoPyTorch/data/tabular_target_validator.py
@@ -28,7 +28,7 @@ def _fit(
         It does so by also using the classes from the test data, to prevent encoding
         errors
 
-        Arguments:
+        Args:
             y_train (SUPPORTED_TARGET_TYPES)
                 The labels of the current task. They are going to be encoded in case
                 of classification
@@ -100,7 +100,7 @@ def transform(
         Validates and fit a categorical encoder (if needed) to the features.
         The supported data types are List, numpy arrays and pandas DataFrames.
 
-        Arguments:
+        Args:
             y (SUPPORTED_TARGET_TYPES)
                 A set of targets that are going to be encoded if the current task
                 is classification
@@ -152,7 +152,7 @@ def inverse_transform(
         """
         Revert any encoding transformation done on a target array
 
-        Arguments:
+        Args:
             y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]):
                 Target array to be transformed back to original form before encoding
         Returns:
@@ -189,7 +189,7 @@ def _check_data(
         """
         Perform dimensionality and data type checks on the targets
 
-        Arguments:
+        Args:
             y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]):
                 A set of features whose dimensionality and data type is going to be checked
         """
diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
index 9f9b95369..f041be5ec 100644
--- a/autoPyTorch/datasets/base_dataset.py
+++ b/autoPyTorch/datasets/base_dataset.py
@@ -330,13 +330,22 @@ def get_dataset_for_training(self, split_id: int, train: bool) -> Dataset:
         to provide training data to fit a pipeline
 
         Args:
-            split (int): The desired subset of the dataset to split and use
+            split_id (int): which split id to get from the splits
+            train (bool): whether the dataset is required for training or evaluating.
 
         Returns:
+
             Dataset: the reduced dataset to be used for testing
         """
         # Subset creates a dataset. Splits is a (train_indices, test_indices) tuple
-        return TransformSubset(self, self.splits[split_id][0], train=train)
+        if split_id >= len(self.splits):  # old version: split_id > len(self.splits)
+            raise IndexError("split_id out of range, got split_id={}"
+                             " (>= num_splits={})".format(split_id, len(self.splits)))
+        subset = int(not train)
+        indices = self.splits[split_id][subset]
+        if indices is None:
+            raise ValueError("Specified fold (or subset) does not exist")
+        return TransformSubset(self, indices, train=train)
 
     def replace_data(self, X_train: BaseDatasetInputType,
                      X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset':
diff --git a/autoPyTorch/ensemble/singlebest_ensemble.py b/autoPyTorch/ensemble/singlebest_ensemble.py
index c6fbaf576..6f82cbdf4 100644
--- a/autoPyTorch/ensemble/singlebest_ensemble.py
+++ b/autoPyTorch/ensemble/singlebest_ensemble.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from smac.runhistory.runhistory import RunHistory
+from smac.runhistory.runhistory import RunHistory, StatusType
 
 from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble
 from autoPyTorch.pipeline.base_pipeline import BasePipeline
@@ -49,6 +49,9 @@ def get_identifiers_from_run_history(self) -> List[Tuple[int, int, float]]:
 
         for run_key in self.run_history.data.keys():
             run_value = self.run_history.data[run_key]
+            if run_value.status == StatusType.CRASHED:
+                continue
+
             score = self.metric._optimum - (self.metric._sign * run_value.cost)
 
             if (score > best_model_score and self.metric._sign > 0) \
diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py
index 842f63271..d98be9bd4 100644
--- a/autoPyTorch/pipeline/base_pipeline.py
+++ b/autoPyTorch/pipeline/base_pipeline.py
@@ -21,7 +21,9 @@
     get_match_array
 )
 from autoPyTorch.utils.common import FitRequirement
-from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
+from autoPyTorch.utils.hyperparameter_search_space_update import (
+    HyperparameterSearchSpaceUpdates
+)
 
 
 class BasePipeline(Pipeline):
@@ -398,6 +400,7 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
                 raise ValueError("Unknown node name. Expected update node name to be in {} "
                                  "got {}".format(self.named_steps.keys(), update.node_name))
             node = self.named_steps[update.node_name]
+            node_name = node.__class__.__name__
             # if node is a choice module
             if hasattr(node, 'get_components'):
                 split_hyperparameter = update.hyperparameter.split(':')
@@ -425,18 +428,18 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
                             if choice in exclude[update.node_name]:
                                 raise ValueError("Found {} in exclude".format(choice))
                         if choice not in components.keys():
-                            raise ValueError("Unknown hyperparameter for choice {}. "
+                            raise ValueError("Unknown component choice for node {}. "
                                              "Expected update hyperparameter "
-                                             "to be in {} got {}".format(node.__class__.__name__,
-                                                                         components.keys(), choice))
+                                             "to be in {}, but got {}".format(node_name,
+                                                                              components.keys(), choice))
                 # check if the component whose hyperparameter
                 # needs to be updated is in components of the
                 # choice module
                 elif split_hyperparameter[0] not in components.keys():
-                    raise ValueError("Unknown hyperparameter for choice {}. "
-                                     "Expected update hyperparameter "
-                                     "to be in {} got {}".format(node.__class__.__name__,
-                                                                 components.keys(), split_hyperparameter[0]))
+                    raise ValueError("Unknown component choice for node {}. "
+                                     "Expected update component "
+                                     "to be in {}, but got {}".format(node_name,
+                                                                      components.keys(), split_hyperparameter[0]))
                 else:
                     # check if hyperparameter is in the search space of the component
                     component = components[split_hyperparameter[0]]
@@ -449,14 +452,16 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
                                 component.get_hyperparameter_search_space(
                                     dataset_properties=self.dataset_properties).get_hyperparameter_names()]):
                             continue
-                        raise ValueError("Unknown hyperparameter for component {}. "
-                                         "Expected update hyperparameter "
-                                         "to be in {} got {}".format(node.__class__.__name__,
-                                                                     component.
-                                                                     get_hyperparameter_search_space(
-                                                                         dataset_properties=self.dataset_properties).
-                                                                     get_hyperparameter_names(),
-                                                                     split_hyperparameter[1]))
+                        component_hyperparameters = component.get_hyperparameter_search_space(
+                            dataset_properties=self.dataset_properties).get_hyperparameter_names()
+                        raise ValueError("Unknown hyperparameter for  component {} of node {}."
+                                         " Expected update hyperparameter "
+                                         "to be in {}, but got {}.".format(component.__name__,
+                                                                           node_name,
+                                                                           component_hyperparameters,
+                                                                           split_hyperparameter[1]
+                                                                           )
+                                         )
             else:
                 if update.hyperparameter not in node.get_hyperparameter_search_space(
                         dataset_properties=self.dataset_properties):
@@ -464,13 +469,13 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
                             node.get_hyperparameter_search_space(
                                 dataset_properties=self.dataset_properties).get_hyperparameter_names()]):
                         continue
-                    raise ValueError("Unknown hyperparameter for component {}. "
+                    node_hyperparameters = node.get_hyperparameter_search_space(
+                        dataset_properties=self.dataset_properties).get_hyperparameter_names()
+                    raise ValueError("Unknown hyperparameter for node {}. "
                                      "Expected update hyperparameter "
-                                     "to be in {} got {}".format(node.__class__.__name__,
-                                                                 node.
-                                                                 get_hyperparameter_search_space(
-                                                                     dataset_properties=self.dataset_properties).
-                                                                 get_hyperparameter_names(), update.hyperparameter))
+                                     "to be in {}, but got {}".format(node_name,
+                                                                      node_hyperparameters,
+                                                                      update.hyperparameter))
 
     def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]
                             ) -> List[Tuple[str, autoPyTorchChoice]]:
diff --git a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py
index 4327d6346..a3be8fa79 100644
--- a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-import torch.tensor
+import torch
 
 from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise.base_normalizer import BaseNormalizer
 
@@ -30,16 +30,16 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> "ImageNormalizer":
         self.std = X['dataset_properties']['std']
         return self
 
-    def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]:
+    def __call__(self, X: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
         """
         Makes the autoPyTorchPreprocessingComponent Callable. Calling the component
         calls the transform function of the underlying early_preprocessor and
         returns the transformed array.
         Args:
-            X (Union[np.ndarray, torch.tensor]): input data tensor
+            X (Union[np.ndarray, torch.Tensor]): input data tensor
 
         Returns:
-            Union[np.ndarray, torch.tensor]: Transformed data tensor
+            Union[np.ndarray, torch.Tensor]: Transformed data tensor
         """
         X = (X - self.mean) / self.std
         return X
diff --git a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py
index 7aeb83a9c..b36a50f4e 100644
--- a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-import torch.tensor
+import torch
 
 from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise.base_normalizer import (
     BaseNormalizer
@@ -34,16 +34,16 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         X.update({'normalise': self})
         return X
 
-    def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]:
+    def __call__(self, X: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
         """
         Makes the autoPyTorchPreprocessingComponent Callable. Calling the component
         calls the transform function of the underlying early_preprocessor and
         returns the transformed array.
         Args:
-            X (Union[np.ndarray, torch.tensor]): input data tensor
+            X (Union[np.ndarray, torch.Tensor]): input data tensor
 
         Returns:
-            Union[np.ndarray, torch.tensor]: Transformed data tensor
+            Union[np.ndarray, torch.Tensor]: Transformed data tensor
         """
         return X
 
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
index e1e08e94e..e513b8729 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -3,14 +3,14 @@
 import numpy as np
 
 from sklearn.compose import ColumnTransformer
-from sklearn.pipeline import make_pipeline
+# from sklearn.pipeline import make_pipeline
 
 import torch
 
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import (
     autoPyTorchTabularPreprocessingComponent
 )
-from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers
+# from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers
 from autoPyTorch.utils.common import FitRequirement, subsampler
 
 
@@ -47,15 +47,16 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
         Returns:
             "TabularColumnTransformer": an instance of self
         """
+
         self.check_requirements(X, y)
-        numerical_pipeline = 'drop'
-        categorical_pipeline = 'drop'
+        numerical_pipeline = 'passthrough'
+        categorical_pipeline = 'passthrough'
 
-        preprocessors = get_tabular_preprocessers(X)
-        if len(X['dataset_properties']['numerical_columns']):
-            numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-        if len(X['dataset_properties']['categorical_columns']):
-            categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+        # preprocessors = get_tabular_preprocessers(X)
+        # if len(X['dataset_properties']['numerical_columns']):
+        #     numerical_pipeline = make_pipeline(*preprocessors['numerical'])
+        # if len(X['dataset_properties']['categorical_columns']):
+        #     categorical_pipeline = make_pipeline(*preprocessors['categorical'])
 
         self.preprocessor = ColumnTransformer([
             ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),
@@ -71,6 +72,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
             X_train = X['backend'].load_datamanager().train_tensors[0]
 
         self.preprocessor.fit(X_train)
+
         return self
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py
index eadc0a188..9829cadcd 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py
@@ -28,5 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
             raise ValueError("cant call transform on {} without fitting first."
                              .format(self.__class__.__name__))
-        X.update({'encoder': self.preprocessor})
+        # X.update({'encoder': self.preprocessor})
         return X
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py
index b65f3c229..ac0648481 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py
@@ -29,5 +29,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
             raise ValueError("cant call transform on {} without fitting first."
                              .format(self.__class__.__name__))
-        X.update({'imputer': self.preprocessor})
+        # X.update({'imputer': self.preprocessor})
         return X
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py
index 39834dd2b..270fac246 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py
@@ -28,5 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
             raise ValueError("cant call transform on {} without fitting first."
                              .format(self.__class__.__name__))
-        X.update({'scaler': self.preprocessor})
+        # X.update({'scaler': self.preprocessor})
         return X
diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py
index c58f25283..512c8360e 100644
--- a/autoPyTorch/pipeline/components/setup/network/base_network.py
+++ b/autoPyTorch/pipeline/components/setup/network/base_network.py
@@ -131,13 +131,15 @@ def _predict(self, network: torch.nn.Module, loader: torch.utils.data.DataLoader
         # Batch prediction
         Y_batch_preds = list()
 
-        for i, (X_batch, Y_batch) in enumerate(loader):
-            # Predict on batch
-            X_batch = X_batch.float().to(self.device)
-            Y_batch_pred = network(X_batch)
-            if self.final_activation is not None:
-                Y_batch_pred = self.final_activation(Y_batch_pred)
-            Y_batch_preds.append(Y_batch_pred.detach().cpu())
+        # `torch.no_grad` reduces memory usage even after `model.eval()`
+        with torch.no_grad():
+            for i, (X_batch, Y_batch) in enumerate(loader):
+                # Predict on batch
+                X_batch = X_batch.float().to(self.device)
+                Y_batch_pred = network(X_batch)
+                if self.final_activation is not None:
+                    Y_batch_pred = self.final_activation(Y_batch_pred)
+                Y_batch_preds.append(Y_batch_pred.detach().cpu())
 
         return torch.cat(Y_batch_preds, 0)
 
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py
index 7a3350ca8..51269a254 100644
--- a/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py
@@ -139,6 +139,14 @@ def get_hyperparameter_search_space(
                                                                                value_range=(True, False),
                                                                                default_value=True,
                                                                                ),
+        shake_shake_update_func: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter="shake_shake_update_func",
+            value_range=('shake-shake',
+                         'shake-even',
+                         'even-even',
+                         'M3'),
+            default_value='shake-shake',
+        ),
         use_shake_drop: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_shake_drop",
                                                                               value_range=(True, False),
                                                                               default_value=True,
@@ -180,16 +188,25 @@ def get_hyperparameter_search_space(
 
         if skip_connection_flag:
 
+            shake_shake_flag = 'shake-shake' in multi_branch_choice.value_range
             shake_drop_prob_flag = 'shake-drop' in multi_branch_choice.value_range
 
             mb_choice = get_hyperparameter(multi_branch_choice, CategoricalHyperparameter)
             cs.add_hyperparameter(mb_choice)
             cs.add_condition(CS.EqualsCondition(mb_choice, use_sc, True))
 
+            shake_shake_update_func_conditional: List[str] = list()
             if shake_drop_prob_flag:
                 shake_drop_prob = get_hyperparameter(max_shake_drop_probability, UniformFloatHyperparameter)
                 cs.add_hyperparameter(shake_drop_prob)
                 cs.add_condition(CS.EqualsCondition(shake_drop_prob, mb_choice, "shake-drop"))
+                shake_shake_update_func_conditional.append('shake-drop')
+            if shake_shake_flag:
+                shake_shake_update_func_conditional.append('shake-shake')
+            if len(shake_shake_update_func_conditional) > 0:
+                method = get_hyperparameter(shake_shake_update_func, CategoricalHyperparameter)
+                cs.add_hyperparameter(method)
+                cs.add_condition(CS.InCondition(method, mb_choice, shake_shake_update_func_conditional))
 
         # It is the upper bound of the nr of groups,
         # since the configuration will actually be sampled.
@@ -258,7 +275,7 @@ def __init__(
         # if in != out the shortcut needs a linear layer to match the result dimensions
         # if the shortcut needs a layer we apply batchnorm and activation to the shortcut
         # as well (start_norm)
-        if in_features != out_features:
+        if in_features != out_features and self.config["use_skip_connection"]:
             self.shortcut = nn.Linear(in_features, out_features)
             initial_normalization = list()
             if self.config['use_batch_norm']:
@@ -288,13 +305,6 @@ def _build_block(self, in_features: int, out_features: int) -> nn.Module:
             if self.config['use_batch_norm']:
                 layers.append(nn.BatchNorm1d(in_features))
             layers.append(self.activation())
-        elif not self.config['use_skip_connection']:
-            # if start norm is not None and skip connection is False
-            # we will never apply the start_norm for the first layer in the block,
-            # which is why we should account for this case.
-            if self.config['use_batch_norm']:
-                layers.append(nn.BatchNorm1d(in_features))
-            layers.append(self.activation())
 
         layers.append(nn.Linear(in_features, out_features))
 
@@ -310,9 +320,6 @@ def _build_block(self, in_features: int, out_features: int) -> nn.Module:
 
     def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
 
-        if self.config["use_skip_connection"]:
-            residual = x
-
         # if shortcut is not none we need a layer such that x matches the output dimension
         if self.shortcut is not None and self.start_norm is not None:
             # in this case self.start_norm is also != none
@@ -320,38 +327,42 @@ def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
             # in front of shortcut and layers. Note that in this case layers
             # does not start with batchnorm+activation but with the first linear layer
             # (see _build_block). As a result if in_features == out_features
-            # -> result = x + W(~D(A(BN(W(A(BN(x))))))
+            # -> result = x + W_2(~D(A(BN(W_1(A(BN(x))))))
             # if in_features != out_features
             # -> result = W_shortcut(A(BN(x))) + W_2(~D(A(BN(W_1(A(BN(x))))))
             x = self.start_norm(x)
-            if self.config["use_skip_connection"]:
-                residual = self.shortcut(x)
-
-        # TODO make the below code better
-        if self.config["use_skip_connection"]:
-            if self.config["multi_branch_choice"] == 'shake-shake':
-                x1 = self.layers(x)
-                x2 = self.shake_shake_layers(x)
-                alpha, beta = shake_get_alpha_beta(self.training, x.is_cuda)
-                x = shake_shake(x1, x2, alpha, beta)
-            else:
-                x = self.layers(x)
+            residual = self.shortcut(x)
+        elif self.config["use_skip_connection"]:
+            # We use a skip connection but we do not need to match dimensions
+            residual = x
+        else:  # Early-return because no need of skip connection
+            return self.layers(x)
+
+        if self.config["multi_branch_choice"] == 'shake-shake':
+            x1 = self.layers(x)
+            x2 = self.shake_shake_layers(x)
+            alpha, beta = shake_get_alpha_beta(
+                is_training=self.training,
+                is_cuda=x.is_cuda,
+                method=self.config['shake_shake_update_func'],
+            )
+            x = shake_shake(x1, x2, alpha, beta)
+        elif self.config["multi_branch_choice"] == 'shake-drop':
+            x = self.layers(x)
+            alpha, beta = shake_get_alpha_beta(
+                is_training=self.training,
+                is_cuda=x.is_cuda,
+                method=self.config['shake_shake_update_func'],
+            )
+            bl = shake_drop_get_bl(
+                self.block_index,
+                1 - self.config["max_shake_drop_probability"],
+                self.num_blocks,
+                self.training,
+                x.is_cuda,
+            )
+            x = shake_drop(x, alpha, beta, bl)
         else:
             x = self.layers(x)
 
-        if self.config["use_skip_connection"]:
-            if self.config["multi_branch_choice"] == 'shake-drop':
-                alpha, beta = shake_get_alpha_beta(self.training, x.is_cuda)
-                bl = shake_drop_get_bl(
-                    self.block_index,
-                    1 - self.config["max_shake_drop_probability"],
-                    self.num_blocks,
-                    self.training,
-                    x.is_cuda,
-                )
-                x = shake_drop(x, alpha, beta, bl)
-
-        if self.config["use_skip_connection"]:
-            x = x + residual
-
-        return x
+        return x + residual
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py
index 1a37482d7..72046a9b7 100644
--- a/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py
@@ -72,6 +72,7 @@ def build_backbone(self, input_shape: Tuple[int, ...]) -> None:
             )
         if self.config['use_batch_norm']:
             layers.append(torch.nn.BatchNorm1d(self.config["num_units_%i" % self.config['num_groups']]))
+        layers.append(_activations[self.config["activation"]]())
         backbone = torch.nn.Sequential(*layers)
         self.backbone = backbone
         return backbone
@@ -144,6 +145,14 @@ def get_hyperparameter_search_space(  # type: ignore[override]
                                                                                           'stairs'),
                                                                              default_value='funnel',
                                                                              ),
+        shake_shake_update_func: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter="shake_shake_update_func",
+            value_range=('shake-shake',
+                         'shake-even',
+                         'even-even',
+                         'M3'),
+            default_value='shake-shake',
+        ),
         max_shake_drop_probability: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="max_shake_drop_probability",
             value_range=(0, 1),
@@ -187,17 +196,24 @@ def get_hyperparameter_search_space(  # type: ignore[override]
 
         if skip_connection_flag:
 
-            shake_drop_prob_flag = False
-            if 'shake-drop' in multi_branch_choice.value_range:
-                shake_drop_prob_flag = True
+            shake_shake_flag = 'shake-shake' in multi_branch_choice.value_range
+            shake_drop_prob_flag = 'shake-drop' in multi_branch_choice.value_range
 
             mb_choice = get_hyperparameter(multi_branch_choice, CategoricalHyperparameter)
             cs.add_hyperparameter(mb_choice)
             cs.add_condition(CS.EqualsCondition(mb_choice, use_sc, True))
 
+            shake_shake_update_func_conditional: List[str] = list()
             if shake_drop_prob_flag:
                 shake_drop_prob = get_hyperparameter(max_shake_drop_probability, UniformFloatHyperparameter)
                 cs.add_hyperparameter(shake_drop_prob)
                 cs.add_condition(CS.EqualsCondition(shake_drop_prob, mb_choice, "shake-drop"))
+                shake_shake_update_func_conditional.append('shake-drop')
+            if shake_shake_flag:
+                shake_shake_update_func_conditional.append('shake-shake')
+            if len(shake_shake_update_func_conditional) > 0:
+                method = get_hyperparameter(shake_shake_update_func, CategoricalHyperparameter)
+                cs.add_hyperparameter(method)
+                cs.add_condition(CS.InCondition(method, mb_choice, shake_shake_update_func_conditional))
 
         return cs
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py
index f7c682ef9..70cab66fc 100644
--- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py
@@ -28,6 +28,7 @@ def get_output_shape(network: torch.nn.Module, input_shape: typing.Tuple[int, ..
     placeholder = torch.randn((2, *input_shape), dtype=torch.float)
     with torch.no_grad():
         output = network(placeholder)
+
     return tuple(output.shape[1:])
 
 
@@ -44,8 +45,8 @@ def forward(
         ctx: typing.Any,  # No typing for AutogradContext
         x1: torch.Tensor,
         x2: torch.Tensor,
-        alpha: torch.tensor,
-        beta: torch.tensor,
+        alpha: torch.Tensor,
+        beta: torch.Tensor,
     ) -> torch.Tensor:
         ctx.save_for_backward(x1, x2, alpha, beta)
 
@@ -85,10 +86,10 @@ class ShakeDropFunction(Function):
     """
     @staticmethod
     def forward(ctx: typing.Any,
-                x: torch.tensor,
-                alpha: torch.tensor,
-                beta: torch.tensor,
-                bl: torch.tensor,
+                x: torch.Tensor,
+                alpha: torch.Tensor,
+                beta: torch.Tensor,
+                bl: torch.Tensor,
                 ) -> torch.Tensor:
         ctx.save_for_backward(x, alpha, beta, bl)
 
@@ -111,15 +112,53 @@ def backward(ctx: typing.Any,
 shake_drop = ShakeDropFunction.apply
 
 
-def shake_get_alpha_beta(is_training: bool, is_cuda: bool
-                         ) -> typing.Tuple[torch.tensor, torch.tensor]:
-    if is_training:
+def shake_get_alpha_beta(
+    is_training: bool,
+    is_cuda: bool,
+    method: str
+) -> typing.Tuple[torch.Tensor, torch.Tensor]:
+    """
+    The methods used in this function have been introduced in 'ShakeShake Regularisation'
+    Each method name is available in the referred paper.
+    Currently, this function supports `even-even`, `shake-even`, `shake-shake` and `M3`.
+
+    Args:
+        is_training (bool): Whether the computation for the training
+        is_cuda (bool): Whether the tensor is on CUDA
+        method (str): The shake method either `even-even`, `shake-even`, `shake-shake` or `M3`
+
+    Returns:
+        alpha, beta (Tuple[float, float]):
+            alpha (in [0, 1]) is the weight coefficient  for the forward pass
+            beta (in [0, 1]) is the weight coefficient for the backward pass
+
+    Reference:
+        Title: Shake-shake regularization
+        Author: Xavier Gastaldi
+        URL: https://arxiv.org/abs/1705.07485
+
+    The names have been taken from the paper as well.
+    Currently, this function supports `even-even`, `shake-even`, `shake-shake` and `M3`.
+    """
+    if not is_training:
         result = (torch.FloatTensor([0.5]), torch.FloatTensor([0.5]))
         return result if not is_cuda else (result[0].cuda(), result[1].cuda())
 
     # TODO implement other update methods
-    alpha = torch.rand(1)
-    beta = torch.rand(1)
+    # alpha is the weight ratio for the forward pass and beta is that for the backward pass
+    alpha = torch.FloatTensor([0.5]) if method.startswith('even') else torch.rand(1)
+    if method.endswith('even'):
+        beta = torch.FloatTensor([0.5])
+    elif method.endswith('shake'):
+        beta = torch.rand(1)
+    elif method == 'M3':
+        # Table 4 in the paper `Shake-Shake regularization`
+        rnd = torch.rand(1)
+        beta = torch.FloatTensor(
+            [rnd * (0.5 - alpha) + alpha if alpha < 0.5 else rnd * (alpha - 0.5) + 0.5]
+        )
+    else:
+        raise ValueError(f"Unknown method `{method}` for ShakeShakeRegularisation in NetworkBackbone")
 
     if is_cuda:
         alpha = alpha.cuda()
@@ -129,19 +168,37 @@ def shake_get_alpha_beta(is_training: bool, is_cuda: bool
 
 
 def shake_drop_get_bl(
-        block_index: int,
-        min_prob_no_shake: float,
-        num_blocks: int,
-        is_training: bool,
-        is_cuda: bool
-) -> torch.tensor:
+    block_index: int,
+    min_prob_no_shake: float,
+    num_blocks: int,
+    is_training: bool,
+    is_cuda: bool
+) -> torch.Tensor:
+    """
+    The sampling of Bernoulli random variable
+    based on Eq. (4) in the paper
+    Args:
+        block_index (int): The index of the block from the input layer
+        min_prob_no_shake (float): The initial shake probability
+        num_blocks (int): The total number of building blocks
+        is_training (bool): Whether it is training
+        is_cuda (bool): Whether the tensor is on CUDA
+
+    Returns:
+        bl (torch.Tensor): a Bernoulli random variable in {0, 1}
+    Reference:
+        ShakeDrop Regularization for Deep Residual Learning
+        Yoshihiro Yamada et. al. (2020)
+        paper: https://arxiv.org/pdf/1802.02375.pdf
+        implementation: https://github.com/imenurok/ShakeDrop
+    """
     pl = 1 - ((block_index + 1) / num_blocks) * (1 - min_prob_no_shake)
 
-    if not is_training:
-        # Move to torch.randn(1) for reproducibility
-        bl = torch.tensor(1.0) if torch.randn(1) <= pl else torch.tensor(0.0)
     if is_training:
-        bl = torch.tensor(pl)
+        # Move to torch.randn(1) for reproducibility
+        bl = torch.as_tensor(1.0) if torch.rand(1) <= pl else torch.as_tensor(0.0)
+    else:
+        bl = torch.as_tensor(pl)
 
     if is_cuda:
         bl = bl.cuda()
diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
index 5ae2880ed..6feac0fba 100644
--- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
+++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
@@ -1,4 +1,4 @@
-import copy
+# import copy
 from typing import Any, Dict, Optional, Tuple
 
 import numpy as np
@@ -31,21 +31,22 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
     def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module:
         raise NotImplementedError
 
-    def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:
+    def _get_args(self, X: Dict[str, Any]) -> Tuple[None, None]:  # Tuple[int, np.ndarray]:
         # Feature preprocessors can alter numerical columns
-        if len(X['dataset_properties']['numerical_columns']) == 0:
-            num_numerical_columns = 0
-        else:
-            X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2])
-
-            numerical_column_transformer = X['tabular_transformer'].preprocessor. \
-                named_transformers_['numerical_pipeline']
-            num_numerical_columns = numerical_column_transformer.transform(
-                X_train[:, X['dataset_properties']['numerical_columns']]).shape[1]
-        num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])),
-                                      dtype=int)
-        categories = X['dataset_properties']['categories']
-
-        for i, category in enumerate(categories):
-            num_input_features[num_numerical_columns + i, ] = len(category)
-        return num_numerical_columns, num_input_features
+        # if len(X['dataset_properties']['numerical_columns']) == 0:
+        #     num_numerical_columns = 0
+        # else:
+        #     X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2])
+        #
+        #     numerical_column_transformer = X['tabular_transformer'].preprocessor. \
+        #         named_transformers_['numerical_pipeline']
+        #     num_numerical_columns = numerical_column_transformer.transform(
+        #         X_train[:, X['dataset_properties']['numerical_columns']]).shape[1]
+        # num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])),
+        #                               dtype=int)
+        # categories = X['dataset_properties']['categories']
+        #
+        # for i, category in enumerate(categories):
+        #     num_input_features[num_numerical_columns + i, ] = len(category)
+        # return num_numerical_columns, num_input_features
+        return None, None
diff --git a/autoPyTorch/pipeline/components/setup/network_head/no_head.py b/autoPyTorch/pipeline/components/setup/network_head/no_head.py
index 870f680fb..0e711f06c 100644
--- a/autoPyTorch/pipeline/components/setup/network_head/no_head.py
+++ b/autoPyTorch/pipeline/components/setup/network_head/no_head.py
@@ -23,7 +23,6 @@ def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...]
         layers = []
         in_features = np.prod(input_shape).item()
         out_features = np.prod(output_shape).item()
-        layers.append(_activations[self.config["activation"]]())
         layers.append(nn.Linear(in_features=in_features,
                                 out_features=out_features))
         return nn.Sequential(*layers)
diff --git a/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py b/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py
index 4d11c3026..a415ff1c6 100644
--- a/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py
+++ b/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py
@@ -95,9 +95,9 @@ def get_hyperparameter_search_space(
                                                                                 default_value=True,
                                                                                 ),
         weight_decay: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="weight_decay",
-                                                                            value_range=(1E-7, 0.1),
+                                                                            value_range=(1E-5, 0.1),
                                                                             default_value=1E-4,
-                                                                            log=True),
+                                                                            log=False),
     ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
 
diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py
index 21a3b9022..f568c3dc4 100644
--- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py
+++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py
@@ -115,7 +115,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader:
             shuffle=True,
             num_workers=X.get('num_workers', 0),
             pin_memory=X.get('pin_memory', True),
-            drop_last=X.get('drop_last', True),
+            drop_last=X.get('drop_last', False),
             collate_fn=custom_collate_fn,
         )
 
@@ -148,6 +148,8 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size:
         dataset = BaseDataset(
             train_tensors=(X, y),
             # This dataset is used for loading test data in a batched format
+            seed=self.random_state.get_state()[1][0],
+            shuffle=False,
             train_transforms=self.test_transform,
             val_transforms=self.test_transform,
         )
diff --git a/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py b/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py
index a24aac328..a95832ab1 100644
--- a/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py
@@ -31,7 +31,7 @@ class AdversarialTrainer(BaseTrainerComponent):
     def __init__(
             self,
             epsilon: float,
-            weighted_loss: bool = False,
+            weighted_loss: int = 0,
             random_state: Optional[np.random.RandomState] = None,
             use_stochastic_weight_averaging: bool = False,
             use_snapshot_ensemble: bool = False,
@@ -157,7 +157,7 @@ def get_properties(dataset_properties: Optional[Dict[str, Any]] = None
             'shortname': 'AdversarialTrainer',
             'name': 'AdversarialTrainer',
             'handles_tabular': True,
-            'handles_image': False,
+            'handles_image': True,
             'handles_time_series': False,
         }
 
@@ -166,8 +166,8 @@ def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict] = None,
         weighted_loss: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="weighted_loss",
-            value_range=(True, False),
-            default_value=True),
+            value_range=(1, ),
+            default_value=1),
         la_steps: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="la_steps",
             value_range=(5, 10),
@@ -192,16 +192,21 @@ def get_hyperparameter_search_space(
             default_value=True),
         se_lastk: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="se_lastk",
-            value_range=(3,),
+            value_range=(3, ),
             default_value=3),
         epsilon: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="epsilon",
-            value_range=(0.05, 0.2),
-            default_value=0.2),
+            value_range=(0.001, 0.15),
+            default_value=0.007,
+            log=True),
     ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
 
+        epsilon = HyperparameterSearchSpace(hyperparameter="epsilon",
+                                            value_range=(0.007, 0.007),
+                                            default_value=0.007)
         add_hyperparameter(cs, epsilon, UniformFloatHyperparameter)
+
         add_hyperparameter(cs, use_stochastic_weight_averaging, CategoricalHyperparameter)
         snapshot_ensemble_flag = any(use_snapshot_ensemble.value_range)
 
@@ -229,9 +234,17 @@ def get_hyperparameter_search_space(
                 parent_hyperparameter=parent_hyperparameter
             )
 
+        """
         # TODO, decouple the weighted loss from the trainer
         if dataset_properties is not None:
             if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS:
                 add_hyperparameter(cs, weighted_loss, CategoricalHyperparameter)
+        """
+        # TODO, decouple the weighted loss from the trainer. Uncomment the code above and
+        # remove the code below. Also update the method signature, so the weighted loss
+        # is not a constant.
+        if dataset_properties is not None:
+            if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS:
+                add_hyperparameter(cs, weighted_loss, Constant)
 
         return cs
diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py
index e36faf121..3f7866f3c 100644
--- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py
@@ -1,4 +1,4 @@
-import typing
+from typing import Any, Dict, Optional, Tuple, Union
 
 import numpy as np
 
@@ -11,7 +11,7 @@
 class RowCutMixTrainer(MixUp, BaseTrainerComponent):
 
     def data_preparation(self, X: np.ndarray, y: np.ndarray,
-                         ) -> typing.Tuple[np.ndarray, typing.Dict[str, np.ndarray]]:
+                         ) -> Tuple[np.ndarray, Dict[str, np.ndarray]]:
         """
         Depending on the trainer choice, data fed to the network might be pre-processed
         on a different way. That is, in standard training we provide the data to the
@@ -26,39 +26,38 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
             np.ndarray: that processes data
             typing.Dict[str, np.ndarray]: arguments to the criterion function
         """
-        alpha, beta = 1.0, 1.0
-        lam = self.random_state.beta(alpha, beta)
-        batch_size = X.shape[0]
-        device = torch.device('cuda' if X.is_cuda else 'cpu')
-        permed_indices = torch.randperm(batch_size).to(device)
+        beta = 1.0
+        lam = self.random_state.beta(beta, beta)
+        batch_size, n_columns = np.shape(X)
+        # shuffled_indices: Shuffled version of torch.arange(batch_size)
+        shuffled_indices = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size)
 
         r = self.random_state.rand(1)
         if beta <= 0 or r > self.alpha:
-            return X, {'y_a': y, 'y_b': y[permed_indices], 'lam': 1}
+            return X, {'y_a': y, 'y_b': y[shuffled_indices], 'lam': 1}
 
-        # batch_size (permutation of rows), col_size = X.shape
-        col_size = X.shape[1]
-        col_indices = torch.tensor(
+        cut_column_indices = torch.as_tensor(
             self.random_state.choice(
-                range(col_size),
-                max(1, int(col_size * lam)),
-                replace=False
-            )
+                range(n_columns),
+                max(1, np.int32(n_columns * lam)),
+                replace=False,
+            ),
         )
 
-        # Replace selected columns with columns from another data point
-        X[:, col_indices] = X[permed_indices, :][:, col_indices]
+        # Replace the values in `cut_indices` columns with
+        # the values from `permed_indices`
+        X[:, cut_column_indices] = X[shuffled_indices, :][:, cut_column_indices]
 
-        # Adjust lam
-        lam = 1 - len(col_indices) / X.shape[1]
+        # Since we cannot cut exactly `lam x 100 %` of rows, we need to adjust the `lam`
+        lam = 1 - (len(cut_column_indices) / n_columns)
 
-        y_a, y_b = y, y[permed_indices]
+        y_a, y_b = y, y[shuffled_indices]
 
         return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam}
 
     @staticmethod
-    def get_properties(dataset_properties: typing.Optional[typing.Dict[str, typing.Any]] = None
-                       ) -> typing.Dict[str, typing.Union[str, bool]]:
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None
+                       ) -> Dict[str, Union[str, bool]]:
         return {
             'shortname': 'RowCutMixTrainer',
             'name': 'MixUp Regularized with Cutoff Tabular Trainer',
diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py
index 5e1b9a1a3..4578082cb 100644
--- a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py
@@ -1,9 +1,7 @@
-import typing
+from typing import Any, Dict, Optional, Tuple, Union
 
 import numpy as np
 
-import torch
-
 from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent
 from autoPyTorch.pipeline.components.training.trainer.cutout_utils import CutOut
 
@@ -17,13 +15,8 @@ class RowCutOutTrainer(CutOut, BaseTrainerComponent):
         Github URL: https://github.com/hysts/pytorch_cutout/blob/master/dataloader.py#L36-L68
     """
 
-    # 0 is non-informative in image data
-    NUMERICAL_VALUE = 0
-    # -1 is the conceptually equivalent to 0 in a image, i.e. 0-pad
-    CATEGORICAL_VALUE = -1
-
     def data_preparation(self, X: np.ndarray, y: np.ndarray,
-                         ) -> typing.Tuple[np.ndarray, typing.Dict[str, np.ndarray]]:
+                         ) -> Tuple[np.ndarray, Dict[str, np.ndarray]]:
         """
         Depending on the trainer choice, data fed to the network might be pre-processed
         on a different way. That is, in standard training we provide the data to the
@@ -36,9 +29,8 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
 
         Returns:
             np.ndarray: that processes data
-            typing.Dict[str, np.ndarray]: arguments to the criterion function
+            Dict[str, np.ndarray]: arguments to the criterion function
         """
-
         r = self.random_state.rand(1)
         if r > self.cutout_prob:
             y_a = y
@@ -46,30 +38,23 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
             lam = 1
             return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam}
 
-        # (batch_size (permutation of rows), col_size) = X.shape
-        col_size = X.shape[1]
-        col_indices = self.random_state.choice(range(col_size), max(1, int(col_size * self.patch_ratio)),
-                                               replace=False)
-
-        if not isinstance(self.numerical_columns, typing.Iterable):
-            raise ValueError("numerical_columns in {} must be iterable, "
-                             "but got {}.".format(self.__class__.__name__,
-                                                  self.numerical_columns))
-
-        numerical_indices = torch.tensor(self.numerical_columns)
-        categorical_indices = torch.tensor([idx for idx in col_indices if idx not in self.numerical_columns])
-
-        X[:, categorical_indices.long()] = self.CATEGORICAL_VALUE
-        X[:, numerical_indices.long()] = self.NUMERICAL_VALUE
+        size: int = np.shape(X)[1]
+        cut_column_indices = self.random_state.choice(
+            range(size),
+            max(1, np.int32(size * self.patch_ratio)),
+            replace=False,
+        )
 
+        # Mask the selected features as 0
+        X[:, cut_column_indices] = 0
         lam = 1
         y_a = y
         y_b = y
         return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam}
 
     @staticmethod
-    def get_properties(dataset_properties: typing.Optional[typing.Dict[str, typing.Any]] = None
-                       ) -> typing.Dict[str, typing.Union[str, bool]]:
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None
+                       ) -> Dict[str, Union[str, bool]]:
         return {
             'shortname': 'RowCutOutTrainer',
             'name': 'RowCutOutTrainer',
diff --git a/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py b/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py
index 02db03f1d..1f6bbd224 100644
--- a/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py
@@ -7,7 +7,7 @@
 
 class StandardTrainer(BaseTrainerComponent):
     def __init__(self,
-                 weighted_loss: bool = False,
+                 weighted_loss: int = 0,
                  use_stochastic_weight_averaging: bool = False,
                  use_snapshot_ensemble: bool = False,
                  se_lastk: int = 3,
@@ -18,7 +18,7 @@ def __init__(self,
         This class handles the training of a network for a single given epoch.
 
         Args:
-            weighted_loss (bool): whether to use weighted loss
+            weighted_loss (int): whether to use weighted loss
 
         """
         super().__init__(random_state=random_state,
diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
index a8023bee5..8c5df84b2 100644
--- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
@@ -175,7 +175,7 @@ class BaseTrainerComponent(autoPyTorchTrainingComponent):
     """
     Base class for training
     Args:
-        weighted_loss (bool, default=False): In case for classification, whether to weight
+        weighted_loss (int, default=0): In case for classification, whether to weight
             the loss function according to the distribution of classes in the target
         use_stochastic_weight_averaging (bool, default=True): whether to use stochastic
             weight averaging. Stochastic weight averaging is a simple average of
@@ -190,7 +190,7 @@ class BaseTrainerComponent(autoPyTorchTrainingComponent):
         random_state:
         **lookahead_config:
     """
-    def __init__(self, weighted_loss: bool = False,
+    def __init__(self, weighted_loss: int = 0,
                  use_stochastic_weight_averaging: bool = True,
                  use_snapshot_ensemble: bool = True,
                  se_lastk: int = 3,
@@ -319,8 +319,12 @@ def on_epoch_end(self, X: Dict[str, Any], epoch: int) -> bool:
                 if self.use_snapshot_ensemble:
                     assert self.model_snapshots is not None, "model snapshots container can't be " \
                                                              "none when snapshot ensembling is enabled"
-                    model_copy = deepcopy(self.swa_model) if self.use_stochastic_weight_averaging \
-                        else deepcopy(self.model)
+                    is_last_epoch = (epoch == self.budget_tracker.max_epochs)
+                    if is_last_epoch and self.use_stochastic_weight_averaging:
+                        model_copy = deepcopy(self.swa_model)
+                    else:
+                        model_copy = deepcopy(self.model)
+
                     assert model_copy is not None
                     model_copy.cpu()
                     self.model_snapshots.append(model_copy)
@@ -538,8 +542,8 @@ def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict] = None,
         weighted_loss: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="weighted_loss",
-            value_range=(True, False),
-            default_value=True),
+            value_range=(1, ),
+            default_value=1),
         la_steps: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="la_steps",
             value_range=(5, 10),
@@ -564,7 +568,7 @@ def get_hyperparameter_search_space(
             default_value=True),
         se_lastk: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="se_lastk",
-            value_range=(3,),
+            value_range=(3, ),
             default_value=3),
     ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
@@ -596,9 +600,17 @@ def get_hyperparameter_search_space(
                 parent_hyperparameter=parent_hyperparameter
             )
 
+        """
         # TODO, decouple the weighted loss from the trainer
         if dataset_properties is not None:
             if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS:
                 add_hyperparameter(cs, weighted_loss, CategoricalHyperparameter)
+        """
+        # TODO, decouple the weighted loss from the trainer. Uncomment the code above and
+        # remove the code below. Also update the method signature, so the weighted loss
+        # is not a constant.
+        if dataset_properties is not None:
+            if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS:
+                add_hyperparameter(cs, weighted_loss, Constant)
 
         return cs
diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py
index 1d63b6141..ec2ef6106 100755
--- a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py
+++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py
@@ -285,7 +285,6 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom
         self.choice: autoPyTorchComponent = cast(autoPyTorchComponent, self.choice)
         if self.choice.use_snapshot_ensemble:
             X['network_snapshots'].extend(self.choice.model_snapshots)
-
         return self.choice
 
     def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice':
@@ -402,14 +401,15 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
                 torch.cuda.empty_cache()
 
         if self.choice.use_stochastic_weight_averaging and self.choice.swa_updated:
+
             # update batch norm statistics
             swa_utils.update_bn(loader=X['train_data_loader'], model=self.choice.swa_model.double())
 
             # change model
             update_model_state_dict_from_swa(X['network'], self.choice.swa_model.state_dict())
             if self.choice.use_snapshot_ensemble:
-                for model in self.choice.model_snapshots:
-                    swa_utils.update_bn(loader=X['train_data_loader'], model=model.double())
+                # we update only the last network which pertains to the stochastic weight averaging model
+                swa_utils.update_bn(X['train_data_loader'], self.choice.model_snapshots[-1].double())
 
         # wrap up -- add score if not evaluating every epoch
         if not self.eval_valid_each_epoch(X):
diff --git a/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py b/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py
index 1b987d599..4feedf5cb 100644
--- a/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py
+++ b/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py
@@ -20,7 +20,7 @@
 class CutOut:
     def __init__(self, patch_ratio: float,
                  cutout_prob: float,
-                 weighted_loss: bool = False,
+                 weighted_loss: int = 0,
                  random_state: Optional[np.random.RandomState] = None,
                  use_stochastic_weight_averaging: bool = False,
                  use_snapshot_ensemble: bool = False,
@@ -63,8 +63,8 @@ def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict] = None,
         weighted_loss: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="weighted_loss",
-            value_range=(True, False),
-            default_value=True),
+            value_range=(1, ),
+            default_value=1),
         la_steps: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="la_steps",
             value_range=(5, 10),
@@ -136,9 +136,17 @@ def get_hyperparameter_search_space(
                 parent_hyperparameter=parent_hyperparameter
             )
 
+        """
         # TODO, decouple the weighted loss from the trainer
         if dataset_properties is not None:
             if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS:
                 add_hyperparameter(cs, weighted_loss, CategoricalHyperparameter)
+        """
+        # TODO, decouple the weighted loss from the trainer. Uncomment the code above and
+        # remove the code below. Also update the method signature, so the weighted loss
+        # is not a constant.
+        if dataset_properties is not None:
+            if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS:
+                add_hyperparameter(cs, weighted_loss, Constant)
 
         return cs
diff --git a/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py b/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py
index e33011bf5..e2ea25148 100644
--- a/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py
+++ b/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py
@@ -26,7 +26,7 @@ class MixUp:
         Github URL: https://github.com/facebookresearch/mixup-cifar10/blob/master/train.py#L119-L138
     """
     def __init__(self, alpha: float,
-                 weighted_loss: bool = False,
+                 weighted_loss: int = 0,
                  random_state: Optional[np.random.RandomState] = None,
                  use_stochastic_weight_averaging: bool = False,
                  use_snapshot_ensemble: bool = False,
@@ -68,8 +68,8 @@ def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict] = None,
         weighted_loss: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="weighted_loss",
-            value_range=(True, False),
-            default_value=True),
+            value_range=(1, ),
+            default_value=1),
         la_steps: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="la_steps",
             value_range=(5, 10),
@@ -94,7 +94,7 @@ def get_hyperparameter_search_space(
             default_value=True),
         se_lastk: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="se_lastk",
-            value_range=(3,),
+            value_range=(3, ),
             default_value=3),
         alpha: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="alpha",
@@ -134,9 +134,18 @@ def get_hyperparameter_search_space(
                 la_config_space,
                 parent_hyperparameter=parent_hyperparameter
             )
+
+        """
         # TODO, decouple the weighted loss from the trainer
         if dataset_properties is not None:
             if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS:
                 add_hyperparameter(cs, weighted_loss, CategoricalHyperparameter)
+        """
+        # TODO, decouple the weighted loss from the trainer. Uncomment the code above and
+        # remove the code below. Also update the method signature, so the weighted loss
+        # is not a constant.
+        if dataset_properties is not None:
+            if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS:
+                add_hyperparameter(cs, weighted_loss, Constant)
 
         return cs
diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py
index bb4cb10ac..b611cd8b3 100644
--- a/autoPyTorch/pipeline/tabular_classification.py
+++ b/autoPyTorch/pipeline/tabular_classification.py
@@ -7,7 +7,6 @@
 
 import numpy as np
 
-import sklearn.preprocessing
 from sklearn.base import ClassifierMixin
 
 import torch
@@ -91,13 +90,8 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray:
         loader = self.named_steps['data_loader'].get_loader(X=X)
         pred = self.named_steps['network'].predict(loader)
         if isinstance(self.dataset_properties['output_shape'], int):
-            proba = pred[:, :self.dataset_properties['output_shape']]
-            normalizer = proba.sum(axis=1)[:, np.newaxis]
-            normalizer[normalizer == 0.0] = 1.0
-            proba /= normalizer
-
-            return proba
-
+            # The final layer is always softmax now (`pred` already gives pseudo proba)
+            return pred
         else:
             all_proba = []
 
@@ -108,7 +102,7 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray:
                 proba_k /= normalizer
                 all_proba.append(proba_k)
 
-            return all_proba
+            return np.array(all_proba)
 
     def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray:
         """predict_proba.
@@ -145,11 +139,6 @@ def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.n
                     pred_prob = self.predict_proba(X[batch_from:batch_to], batch_size=None)
                     y[batch_from:batch_to] = pred_prob.astype(np.float32)
 
-        # Neural networks might not be fit to produce a [0-1] output
-        # For instance, after small number of epochs.
-        y = np.clip(y, 0, 1)
-        y = sklearn.preprocessing.normalize(y, axis=1, norm='l1')
-
         return y
 
     def _get_hyperparameter_search_space(self,
diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py
index c9681adb3..5348bd11c 100644
--- a/autoPyTorch/utils/backend.py
+++ b/autoPyTorch/utils/backend.py
@@ -204,13 +204,14 @@ def temporary_directory(self) -> str:
         return self.context.temporary_directory
 
     def _make_internals_directory(self) -> None:
+        # TODO: make exist_ok a function argument
         try:
-            os.makedirs(self.internals_directory)
+            os.makedirs(self.internals_directory, exist_ok=True)
         except Exception as e:
             if self._logger is not None:
                 self._logger.debug("_make_internals_directory: %s" % e)
         try:
-            os.makedirs(self.get_runs_directory())
+            os.makedirs(self.get_runs_directory(), exist_ok=True)
         except Exception as e:
             if self._logger is not None:
                 self._logger.debug("_make_internals_directory: %s" % e)
@@ -328,6 +329,20 @@ def load_datamanager(self) -> BaseDataset:
             with open(filepath, 'rb') as fh:
                 return pickle.load(fh)
 
+    def replace_datamanager(self, datamanager: BaseDataset) -> None:
+        """
+        This function is called to replace the old datamanager with a datamanager
+        in case it is required.
+
+        Args:
+            datamanager (BaseDataset): the new datamanager to replace the old.
+        """
+        warnings.warn("Original dataset will be overwritten with the provided dataset")
+        datamanager_pickle_file = self._get_datamanager_pickle_filename()
+        if os.path.exists(datamanager_pickle_file):
+            os.remove(datamanager_pickle_file)
+        self.save_datamanager(datamanager=datamanager)
+
     def get_runs_directory(self) -> str:
         return os.path.join(self.internals_directory, 'runs')
 
diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py
index a2ec4d580..892b5a1ad 100644
--- a/autoPyTorch/utils/common.py
+++ b/autoPyTorch/utils/common.py
@@ -96,7 +96,7 @@ def replace_prefix_in_config_dict(config: Dict[str, Any], prefix: str, replace:
             k.startswith(prefix)}
 
 
-def custom_collate_fn(batch: List) -> List[Optional[torch.tensor]]:
+def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]:
     """
     In the case of not providing a y tensor, in a
     dataset of form {X, y}, y would be None.
diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py
index 2130cfd6b..40ceb85d1 100644
--- a/autoPyTorch/utils/implementations.py
+++ b/autoPyTorch/utils/implementations.py
@@ -35,7 +35,6 @@ def __call__(self, y: Union[np.ndarray, torch.Tensor]) -> np.ndarray:
             weights = (np.ones(y.shape[1]) * weight_per_class) / np.maximum(counts, 1)
         else:
             classes, counts = np.unique(y, axis=0, return_counts=True)
-            classes, counts = classes[::-1], counts[::-1]
             weight_per_class = total_weight / classes.shape[0]
             weights = (np.ones(classes.shape[0]) * weight_per_class) / counts
 
diff --git a/examples/tabular/40_advanced/example_custom_configuration_space.py b/examples/tabular/40_advanced/example_custom_configuration_space.py
index 6a3764b94..b95ceeaa5 100644
--- a/examples/tabular/40_advanced/example_custom_configuration_space.py
+++ b/examples/tabular/40_advanced/example_custom_configuration_space.py
@@ -54,6 +54,15 @@ def get_search_space_updates():
                    hyperparameter='ResNetBackbone:dropout',
                    value_range=[0, 0.5],
                    default_value=0.2)
+    updates.append(node_name='network_backbone',
+                   hyperparameter='ResNetBackbone:multi_branch_choice',
+                   value_range=['shake-shake'],
+                   default_value='shake-shake')
+    updates.append(node_name='network_backbone',
+                   hyperparameter='ResNetBackbone:shake_shake_method',
+                   value_range=['M3'],
+                   default_value='M3'
+                   )
     return updates
 
 
@@ -74,7 +83,7 @@ def get_search_space_updates():
     # ==================================================
     api = TabularClassificationTask(
         search_space_updates=get_search_space_updates(),
-        include_components={'network_backbone': ['MLPBackbone', 'ResNetBackbone'],
+        include_components={'network_backbone': ['ResNetBackbone'],
                             'encoder': ['OneHotEncoder']}
     )
 
diff --git a/examples/tabular/40_advanced/example_posthoc_ensemble_fit.py b/examples/tabular/40_advanced/example_posthoc_ensemble_fit.py
new file mode 100644
index 000000000..b9383b2a6
--- /dev/null
+++ b/examples/tabular/40_advanced/example_posthoc_ensemble_fit.py
@@ -0,0 +1,81 @@
+"""
+=====================================================
+Tabular Classification with Post-Hoc Ensemble Fitting
+=====================================================
+
+The following example shows how to fit a sample classification model
+and create an ensemble post-hoc with AutoPyTorch
+"""
+import os
+import tempfile as tmp
+import warnings
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import sklearn.datasets
+import sklearn.model_selection
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+
+
+if __name__ == '__main__':
+
+    ############################################################################
+    # Data Loading
+    # ============
+    X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+        X,
+        y,
+        random_state=42,
+    )
+
+    ############################################################################
+    # Build and fit a classifier
+    # ==========================
+    api = TabularClassificationTask(
+        ensemble_size=0,
+        seed=42,
+    )
+
+    ############################################################################
+    # Search for the best neural network
+    # ==================================
+    api.search(
+        X_train=X_train,
+        y_train=y_train,
+        X_test=X_test.copy(),
+        y_test=y_test.copy(),
+        optimize_metric='accuracy',
+        total_walltime_limit=250,
+        func_eval_time_limit_secs=50
+    )
+
+    ############################################################################
+    # Print the final performance of the incumbent neural network
+    # ===========================================================
+    print(api.run_history, api.trajectory)
+    y_pred = api.predict(X_test)
+    score = api.score(y_pred, y_test)
+    print(score)
+
+    ############################################################################
+    # Fit an ensemble with the neural networks fitted during the search
+    # =================================================================
+
+    api.fit_ensemble(ensemble_size=5,
+                     # Set the enable_traditional_pipeline=True
+                     # to also include traditional models
+                     # in the ensemble
+                     enable_traditional_pipeline=False)
+    # Print the final ensemble built by AutoPyTorch
+    y_pred = api.predict(X_test)
+    score = api.score(y_pred, y_test)
+    print(score)
+    print(api.show_models())
diff --git a/requirements.txt b/requirements.txt
index c79104461..f4a913789 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ imgaug>=0.4.0
 ConfigSpace>=0.4.14,<0.5
 pynisher>=0.6.3
 pyrfr>=0.7,<0.9
-smac>=0.13.1,<0.14
+smac
 dask
 distributed>=2.2.0
 catboost
diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py
index f9ba2855e..3a70549b0 100644
--- a/test/test_data/test_feature_validator.py
+++ b/test/test_data/test_feature_validator.py
@@ -1,4 +1,5 @@
 import copy
+import functools
 
 import numpy as np
 
@@ -283,9 +284,9 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
     if isinstance(input_data_featuretest, pd.DataFrame):
         pytest.skip("Column order change in pandas is not supported")
     elif isinstance(input_data_featuretest, np.ndarray):
-        complementary_type = validator.numpy_array_to_pandas(input_data_featuretest)
+        complementary_type = validator.numpy_to_pandas(input_data_featuretest)
     elif isinstance(input_data_featuretest, list):
-        complementary_type, _ = validator.list_to_dataframe(input_data_featuretest)
+        complementary_type, _ = validator.list_to_pandas(input_data_featuretest)
     elif sparse.issparse(input_data_featuretest):
         complementary_type = sparse.csr_matrix(input_data_featuretest.todense())
     else:
@@ -311,10 +312,121 @@ def test_featurevalidator_get_columns_to_encode():
     for col in df.columns:
         df[col] = df[col].astype(col)
 
-    enc_columns, feature_types = validator._get_columns_to_encode(df)
+    validator.fit(df)
 
-    assert enc_columns == ['category', 'bool']
-    assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical']
+    categorical_columns, numerical_columns, feat_type = validator._get_columns_info(df)
+
+    assert numerical_columns == ['int', 'float']
+    assert categorical_columns == ['category', 'bool']
+    assert feat_type == ['numerical', 'numerical', 'categorical', 'categorical']
+
+
+def feature_validator_remove_nan_catcolumns(df_train: pd.DataFrame, df_test: pd.DataFrame,
+                                            ans_train: np.ndarray, ans_test: np.ndarray) -> None:
+    validator = TabularFeatureValidator()
+    validator.fit(df_train)
+    transformed_df_train = validator.transform(df_train)
+    transformed_df_test = validator.transform(df_test)
+
+    assert np.array_equal(transformed_df_train, ans_train)
+    assert np.array_equal(transformed_df_test, ans_test)
+
+
+def test_feature_validator_remove_nan_catcolumns():
+    """
+    Make sure categorical columns that have only nan values are removed.
+    Transform performs the folloing:
+        * simple imputation for both
+        * scaling for numerical
+        * one-hot encoding for categorical
+    For example,
+        data = [
+            {'A': 1, 'B': np.nan, 'C': np.nan},
+            {'A': np.nan, 'B': 3, 'C': np.nan},
+            {'A': 2, 'B': np.nan, 'C': np.nan}
+        ]
+    and suppose all the columns are categorical,
+    then
+        * `A` in {np.nan, 1, 2}
+        * `B` in {np.nan, 3}
+        * `C` in {np.nan} <=== it will be dropped.
+
+    So in the column A,
+        * np.nan ==> [1, 0, 0]
+        * 1      ==> [0, 1, 0]
+        * 2      ==> [0, 0, 1]
+    in the column B,
+        * np.nan ==> [1, 0]
+        * 3      ==> [0, 1]
+    Therefore, by concatenating,
+        * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0]
+        * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1]
+        * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0]
+    """
+    # First case, there exist null columns (B and C) in the train set
+    # and a same column (C) are not all null for the test set.
+
+    df_train = pd.DataFrame(
+        [
+            {'A': 1, 'B': np.nan, 'C': np.nan},
+            {'A': np.nan, 'C': np.nan},
+            {'A': 1}
+        ],
+        dtype='category',
+    )
+    ans_train = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64)
+    df_test = pd.DataFrame(
+        [
+            {'A': np.nan, 'B': np.nan, 'C': 5},
+            {'A': np.nan, 'C': np.nan},
+            {'A': 1}
+        ],
+        dtype='category',
+    )
+    ans_test = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float64)
+    feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test)
+
+    # Second case, there exist null columns (B and C) in the training set and
+    # the same columns (B and C) are null in the test set.
+    df_train = pd.DataFrame(
+        [
+            {'A': 1, 'B': np.nan, 'C': np.nan},
+            {'A': np.nan, 'C': np.nan},
+            {'A': 1}
+        ],
+        dtype='category',
+    )
+    ans_train = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64)
+    df_test = pd.DataFrame(
+        [
+            {'A': np.nan, 'B': np.nan, 'C': np.nan},
+            {'A': np.nan, 'C': np.nan},
+            {'A': 1}
+        ],
+        dtype='category',
+    )
+    ans_test = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float64)
+    feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test)
+
+    # Third case, there exist no null columns in the training set and
+    # null columns exist in the test set.
+    df_train = pd.DataFrame(
+        [
+            {'A': 1, 'B': 1},
+            {'A': 2, 'B': 2}
+        ],
+        dtype='category',
+    )
+    ans_train = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=np.float64)
+    df_test = pd.DataFrame(
+        [
+            {'A': np.nan, 'B': np.nan},
+            {'A': np.nan, 'B': np.nan}
+        ],
+        dtype='category',
+    )
+    ans_test = np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=np.float64)
+    feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test)
 
 
 def test_features_unsupported_calls_are_raised():
@@ -550,15 +662,16 @@ def test_feature_validator_imbalanced_data():
     validator.fit(X_train)
 
     train_feature_types = copy.deepcopy(validator.feat_type)
-    assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical']
+    assert train_feature_types == ['numerical']
     # validator will throw an error if the column types are not the same
     transformed_X_test = validator.transform(X_test)
     transformed_X_test = pd.DataFrame(transformed_X_test)
-    null_columns = []
-    for column in transformed_X_test.columns:
-        if transformed_X_test[column].isna().all():
-            null_columns.append(column)
-    assert null_columns == [0, 2, 3]
+    assert sorted(validator.all_nan_columns) == sorted(['A', 'C', 'D'])
+    # as there are no categorical columns, we can make such an
+    # assertion. We only expect to drop the all nan columns
+    total_all_nan_columns = len(validator.all_nan_columns)
+    total_columns = len(validator.column_order)
+    assert total_columns - total_all_nan_columns == len(transformed_X_test.columns)
 
     # Columns with not all null values in the train split and
     # completely null on the test split.
@@ -577,14 +690,33 @@ def test_feature_validator_imbalanced_data():
     X_test = pd.DataFrame.from_dict(test_features)
     validator = TabularFeatureValidator()
     validator.fit(X_train)
+
     train_feature_types = copy.deepcopy(validator.feat_type)
     assert train_feature_types == ['categorical', 'numerical', 'numerical']
 
     transformed_X_test = validator.transform(X_test)
     transformed_X_test = pd.DataFrame(transformed_X_test)
-    null_columns = []
-    for column in transformed_X_test.columns:
-        if transformed_X_test[column].isna().all():
-            null_columns.append(column)
-
-    assert null_columns == [1]
+    assert not len(validator.all_nan_columns)
+
+
+def test_comparator():
+    numerical = 'numerical'
+    categorical = 'categorical'
+
+    validator = TabularFeatureValidator
+
+    feat_type = [numerical, categorical] * 10
+    ans = [categorical] * 10 + [numerical] * 10
+    feat_type = sorted(
+        feat_type,
+        key=functools.cmp_to_key(validator._comparator)
+    )
+    assert ans == feat_type
+
+    feat_type = [numerical] * 10 + [categorical] * 10
+    ans = [categorical] * 10 + [numerical] * 10
+    feat_type = sorted(
+        feat_type,
+        key=functools.cmp_to_key(validator._comparator)
+    )
+    assert ans == feat_type