automl · ravinkohli · Feb 9, 2022 · Jan 24, 2022 · Jan 28, 2022 · Feb 1, 2022
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
@@ -13,8 +13,8 @@
 from autoPyTorch.data.tabular_validator import TabularInputValidator
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
-    HoldoutValTypes,
     CrossValTypes,
+    HoldoutValTypes,
     ResamplingStrategies,
 )
 from autoPyTorch.datasets.tabular_dataset import TabularDataset

diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py
@@ -13,8 +13,8 @@
 from autoPyTorch.data.tabular_validator import TabularInputValidator
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
-    HoldoutValTypes,
     CrossValTypes,
+    HoldoutValTypes,
     ResamplingStrategies,
 )
 from autoPyTorch.datasets.tabular_dataset import TabularDataset

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
@@ -14,14 +14,13 @@
 from sklearn.exceptions import NotFittedError
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.preprocessing import OrdinalEncoder
 
 from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES
 
 
 def _create_column_transformer(
     preprocessors: Dict[str, List[BaseEstimator]],
-    numerical_columns: List[str],
     categorical_columns: List[str],
 ) -> ColumnTransformer:
     """
@@ -32,49 +31,36 @@ def _create_column_transformer(
     Args:
         preprocessors (Dict[str, List[BaseEstimator]]):
             Dictionary containing list of numerical and categorical preprocessors.
-        numerical_columns (List[str]):
-            List of names of numerical columns
         categorical_columns (List[str]):
             List of names of categorical columns
 
     Returns:
         ColumnTransformer
     """
 
-    numerical_pipeline = 'drop'
-    categorical_pipeline = 'drop'
-    if len(numerical_columns) > 0:
-        numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-    if len(categorical_columns) > 0:
-        categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+    categorical_pipeline = make_pipeline(*preprocessors['categorical'])
 
     return ColumnTransformer([
-        ('categorical_pipeline', categorical_pipeline, categorical_columns),
-        ('numerical_pipeline', numerical_pipeline, numerical_columns)],
-        remainder='drop'
+        ('categorical_pipeline', categorical_pipeline, categorical_columns)],
+        remainder='passthrough'
     )
 
 
 def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
     """
     This function creates a Dictionary containing a list
     of numerical and categorical preprocessors
-
     Returns:
         Dict[str, List[BaseEstimator]]
     """
     preprocessors: Dict[str, List[BaseEstimator]] = dict()
 
     # Categorical Preprocessors
-    onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
+    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
+                                     unknown_value=-1)
     categorical_imputer = SimpleImputer(strategy='constant', copy=False)
 
-    # Numerical Preprocessors
-    numerical_imputer = SimpleImputer(strategy='median', copy=False)
-    standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
-
-    preprocessors['categorical'] = [categorical_imputer, onehot_encoder]
-    preprocessors['numerical'] = [numerical_imputer, standard_scaler]
+    preprocessors['categorical'] = [categorical_imputer, ordinal_encoder]
 
     return preprocessors
 
@@ -161,31 +147,41 @@ def _fit(
 
             X = cast(pd.DataFrame, X)
 
-            self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()])
+            all_nan_columns = X.columns[X.isna().all()]
+            for col in all_nan_columns:
+                X[col] = pd.to_numeric(X[col])
+            self.dtypes = [dt.name for dt in X.dtypes]  # Also note this change in self.dtypes
+            self.all_nan_columns = set(all_nan_columns)
 
-            categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
+            self.enc_columns, self.feat_type = self._get_columns_info(X)
 
-            self.enc_columns = categorical_columns
+            if len(self.enc_columns) > 0:
 
-            preprocessors = get_tabular_preprocessors()
-            self.column_transformer = _create_column_transformer(
-                preprocessors=preprocessors,
-                numerical_columns=numerical_columns,
-                categorical_columns=categorical_columns,
-            )
+                preprocessors = get_tabular_preprocessors()
+                self.column_transformer = _create_column_transformer(
+                    preprocessors=preprocessors,
+                    categorical_columns=self.enc_columns,
+                )
 
-            # Mypy redefinition
-            assert self.column_transformer is not None
-            self.column_transformer.fit(X)
+                # Mypy redefinition
+                assert self.column_transformer is not None
+                self.column_transformer.fit(X)
 
-            # The column transformer reorders the feature types
-            # therefore, we need to change the order of columns as well
-            # This means categorical columns are shifted to the left
+                # The column transformer moves categorical columns before all numerical columns
+                # therefore, we need to sort categorical columns so that it complies this change
 
-            self.feat_type = sorted(
-                feat_type,
-                key=functools.cmp_to_key(self._comparator)
-            )
+                self.feat_type = sorted(
+                    self.feat_type,
+                    key=functools.cmp_to_key(self._comparator)
+                )
+
+                encoded_categories = self.column_transformer.\
+                    named_transformers_['categorical_pipeline'].\
+                    named_steps['ordinalencoder'].categories_
+                self.categories = [
+                    list(range(len(cat)))
+                    for cat in encoded_categories
+                ]
 
             # differently to categorical_columns and numerical_columns,
             # this saves the index of the column.
@@ -264,6 +260,20 @@ def transform(
 
         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
             X = cast(Type[pd.DataFrame], X)
+            if self.all_nan_columns is not None:
+                for column in X.columns:
+                    if column in self.all_nan_columns:
+                        if not X[column].isna().all():
+                            X[column] = np.nan
+                        X[column] = pd.to_numeric(X[column])
+            if len(self.categorical_columns) > 0:
+                if self.column_transformer is None:
+                    raise AttributeError("Expect column transformer to be built"
+                                         "if there are categorical columns")
+                categorical_columns = self.column_transformer.transformers_[0][-1]
+                for column in categorical_columns:
+                    if X[column].isna().all():
+                        X[column] = X[column].astype('object')
 
         # Check the data here so we catch problems on new test data
         self._check_data(X)
@@ -273,11 +283,6 @@ def transform(
         # We need to convert the column in test data to
         # object otherwise the test column is interpreted as float
         if self.column_transformer is not None:
-            if len(self.categorical_columns) > 0:
-                categorical_columns = self.column_transformer.transformers_[0][-1]
-                for column in categorical_columns:
-                    if X[column].isna().all():
-                        X[column] = X[column].astype('object')
             X = self.column_transformer.transform(X)
 
         # Sparse related transformations
@@ -362,18 +367,17 @@ def _check_data(
 
             dtypes = [dtype.name for dtype in X.dtypes]
 
-            diff_cols = X.columns[[s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]]
             if len(self.dtypes) == 0:
                 self.dtypes = dtypes
-            elif not self._is_datasets_consistent(diff_cols, X):
+            elif self.dtypes != dtypes:
                 raise ValueError("The dtype of the features must not be changed after fit(), but"
                                  " the dtypes of some columns are different between training ({}) and"
                                  " test ({}) datasets.".format(self.dtypes, dtypes))
 
     def _get_columns_info(
         self,
         X: pd.DataFrame,
-    ) -> Tuple[List[str], List[str], List[str]]:
+    ) -> Tuple[List[str], List[str]]:
         """
         Return the columns to be encoded from a pandas dataframe
 
@@ -392,15 +396,12 @@ def _get_columns_info(
         """
 
         # Register if a column needs encoding
-        numerical_columns = []
         categorical_columns = []
         # Also, register the feature types for the estimator
         feat_type = []
 
         # Make sure each column is a valid type
         for i, column in enumerate(X.columns):
-            if self.all_nan_columns is not None and column in self.all_nan_columns:
-                continue
             column_dtype = self.dtypes[i]
             err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
                       "but input column {} has an invalid type `{}`.".format(column, column_dtype)
@@ -411,7 +412,6 @@ def _get_columns_info(
             # TypeError: data type not understood in certain pandas types
             elif is_numeric_dtype(column_dtype):
                 feat_type.append('numerical')
-                numerical_columns.append(column)
             elif column_dtype == 'object':
                 # TODO verify how would this happen when we always convert the object dtypes to category
                 raise TypeError(
@@ -437,7 +437,7 @@ def _get_columns_info(
                     "before feeding it to AutoPyTorch.".format(err_msg)
                 )
 
-        return categorical_columns, numerical_columns, feat_type
+        return categorical_columns, feat_type
 
     def list_to_pandas(
         self,

diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
@@ -125,7 +125,6 @@ def __init__(
         self.holdout_validators: Dict[str, HoldOutFunc] = {}
         self.no_resampling_validators: Dict[str, NoResamplingFunc] = {}
         self.random_state = np.random.RandomState(seed=seed)
-        self.no_resampling_validators: Dict[str, NoResamplingFunc] = {}
         self.shuffle = shuffle
         self.resampling_strategy = resampling_strategy
         self.resampling_strategy_args = resampling_strategy_args
@@ -145,7 +144,7 @@ def __init__(
 
         # TODO: Look for a criteria to define small enough to preprocess
         # False for the regularization cocktails initially
-        self.is_small_preprocess = False
+        # self.is_small_preprocess = False
 
         # Make sure cross validation splits are created once
         self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes)

diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py
@@ -39,13 +39,6 @@ def __call__(self, random_state: np.random.RandomState, val_share: float,
         ...
 
 
-class NoResamplingFunc(Protocol):
-    def __call__(self,
-                 random_state: np.random.RandomState,
-                 indices: np.ndarray) -> np.ndarray:
-        ...
-
-
 class CrossValTypes(IntEnum):
     """The type of cross validation
 

diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py
@@ -152,13 +152,6 @@ def __init__(self, backend: Backend, queue: Queue,
             search_space_updates=search_space_updates
         )
 
-        if not isinstance(self.datamanager.resampling_strategy, (CrossValTypes, HoldoutValTypes)):
-            raise ValueError(
-                'TrainEvaluator expect to have (CrossValTypes, HoldoutValTypes) as '
-                'resampling_strategy, but got {}'.format(self.datamanager.resampling_strategy)
-            )
-
-
         if not isinstance(self.datamanager.resampling_strategy, (CrossValTypes, HoldoutValTypes)):
             resampling_strategy = self.datamanager.resampling_strategy
             raise ValueError(
@@ -428,10 +421,10 @@ def eval_train_function(
     budget: float,
     config: Optional[Configuration],
     seed: int,
+    output_y_hat_optimization: bool,
     num_run: int,
     include: Optional[Dict[str, Any]],
     exclude: Optional[Dict[str, Any]],
-    output_y_hat_optimization: bool,
     disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
     pipeline_config: Optional[Dict[str, Any]] = None,
     budget_type: str = None,

diff --git a/...Torch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/...Torch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -3,14 +3,14 @@
 import numpy as np
 
 from sklearn.compose import ColumnTransformer
-# from sklearn.pipeline import make_pipeline
+from sklearn.pipeline import make_pipeline
 
 import torch
 
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import (
     autoPyTorchTabularPreprocessingComponent
 )
-# from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers
 from autoPyTorch.utils.common import FitRequirement, subsampler
 
 
@@ -52,11 +52,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
         numerical_pipeline = 'passthrough'
         categorical_pipeline = 'passthrough'
 
-        # preprocessors = get_tabular_preprocessers(X)
-        # if len(X['dataset_properties']['numerical_columns']):
-        #     numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-        # if len(X['dataset_properties']['categorical_columns']):
-        #     categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+        preprocessors = get_tabular_preprocessers(X)
+        if len(X['dataset_properties']['numerical_columns']):
+            numerical_pipeline = make_pipeline(*preprocessors['numerical'])
+        if len(X['dataset_properties']['categorical_columns']):
+            categorical_pipeline = make_pipeline(*preprocessors['categorical'])
 
         self.preprocessor = ColumnTransformer([
             ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py
@@ -40,7 +40,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         Returns:
             (Dict[str, Any]): the updated 'X' dictionary
         """
-        # X.update({'encoder': self.preprocessor})
+        X.update({'encoder': self.preprocessor})
         return X
 
     @staticmethod

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py
@@ -28,5 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
             raise ValueError("cant call transform on {} without fitting first."
                              .format(self.__class__.__name__))
-        # X.update({'encoder': self.preprocessor})
+        X.update({'encoder': self.preprocessor})
         return X
diff --git a/...yTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/...yTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py
@@ -29,5 +29,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
             raise ValueError("cant call transform on {} without fitting first."
                              .format(self.__class__.__name__))
-        # X.update({'imputer': self.preprocessor})
+        X.update({'imputer': self.preprocessor})
         return X
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py
@@ -43,7 +43,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         Returns:
             np.ndarray: Transformed features
         """
-        # X.update({'scaler': self.preprocessor})
+        X.update({'scaler': self.preprocessor})
         return X
 
     @staticmethod

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py
@@ -28,5 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
             raise ValueError("cant call transform on {} without fitting first."
                              .format(self.__class__.__name__))
-        # X.update({'scaler': self.preprocessor})
+        X.update({'scaler': self.preprocessor})
         return X