Embedding layer #91

Merged

Changes from all commits (75 commits)
a684ac4
work in progress
ravinkohli Feb 8, 2021
a6a8471
in progress
ravinkohli Feb 8, 2021
2b0c0e0
Working network embedding
ravinkohli Feb 15, 2021
9be86a5
ADD tests for network embedding
ravinkohli Feb 15, 2021
1adc9a4
Removed ordinal encoder
ravinkohli Feb 15, 2021
ae6bb44
Removed ordinal encoder
ravinkohli Feb 15, 2021
8783240
Add seed for test_losses for reproducibility
ravinkohli Feb 15, 2021
761fb75
Addressed comments
ravinkohli Feb 17, 2021
1e1a7e9
Fix merge conflicts
ravinkohli Feb 22, 2021
4bcbd88
fix flake
ravinkohli Feb 22, 2021
adcfc75
Merge refactor_development
ravinkohli Feb 23, 2021
d6d3dcd
Merge branch 'ravinkohli-embedding_layer' into embedding_layer
ravinkohli Feb 23, 2021
aa83d6d
fix test import training
ravinkohli Feb 23, 2021
cbc7e09
ADD_109
franchuterivera Feb 23, 2021
9dd447f
No print allow
franchuterivera Feb 23, 2021
20a874f
Fix tests and move to boston
ravinkohli Feb 23, 2021
7eed312
Debug issue with python 3.6
ravinkohli Feb 25, 2021
cb3b398
Debug for python3.6
ravinkohli Feb 26, 2021
007be7d
Run only debug file
ravinkohli Feb 26, 2021
d406b2a
Merge branch 'embedding_layer' of https://github.com/ravinkohli/Auto-…
ravinkohli Feb 26, 2021
b1e25d2
work in progress
ravinkohli Feb 8, 2021
f41eae1
in progress
ravinkohli Feb 8, 2021
6222399
Working network embedding
ravinkohli Feb 15, 2021
b96e32a
ADD tests for network embedding
ravinkohli Feb 15, 2021
ece6353
Removed ordinal encoder
ravinkohli Feb 15, 2021
70b0a79
Removed ordinal encoder
ravinkohli Feb 15, 2021
48d7a85
Addressed comments
ravinkohli Feb 17, 2021
a92fcaf
fix flake
ravinkohli Feb 22, 2021
a11ee8e
fix test import training
ravinkohli Feb 23, 2021
789bd8d
Fix tests and move to boston
ravinkohli Feb 23, 2021
85e178f
Debug issue with python 3.6
ravinkohli Feb 25, 2021
ddf198b
Run only debug file
ravinkohli Feb 26, 2021
a073f06
Debug for python3.6
ravinkohli Feb 26, 2021
e4599e9
Merge from origin
ravinkohli Feb 26, 2021
e625ee7
print paths of parent dir
ravinkohli Feb 26, 2021
9164bc2
Trying to run examples
ravinkohli Feb 26, 2021
f1beb14
Trying to run examples
ravinkohli Feb 27, 2021
af17afc
Add success model
ravinkohli Mar 1, 2021
d64e4fd
Added parent directory for printing paths
ravinkohli Mar 1, 2021
1602933
Try no autouse
franchuterivera Mar 1, 2021
c8d98ba
print log file to see if backend is saving num run
ravinkohli Mar 1, 2021
76fcd76
Setup logger in backend
ravinkohli Mar 1, 2021
ffc1620
handle nans in categorical columns (#118)
ravinkohli Mar 1, 2021
55ec853
Merge pull request #112 from franchuterivera/refactor_development_ADD…
ravinkohli Mar 1, 2021
3f39f58
try without embeddings
ravinkohli Mar 1, 2021
715d277
work in progress
ravinkohli Feb 8, 2021
d68a391
in progress
ravinkohli Feb 8, 2021
02dc064
Working network embedding
ravinkohli Feb 15, 2021
37cd8c5
ADD tests for network embedding
ravinkohli Feb 15, 2021
a3c1625
Removed ordinal encoder
ravinkohli Feb 15, 2021
b8896ad
Removed ordinal encoder
ravinkohli Feb 15, 2021
e0bfb0b
Addressed comments
ravinkohli Feb 17, 2021
23f6777
fix flake
ravinkohli Feb 22, 2021
4c1f33f
fix test import training
ravinkohli Feb 23, 2021
8c4233c
Fix tests and move to boston
ravinkohli Feb 23, 2021
18b5771
Debug issue with python 3.6
ravinkohli Feb 25, 2021
d839b5d
Run only debug file
ravinkohli Feb 26, 2021
de1d4c3
Debug for python3.6
ravinkohli Feb 26, 2021
e0a488a
work in progress
ravinkohli Feb 8, 2021
7368908
in progress
ravinkohli Feb 8, 2021
00789ac
Working network embedding
ravinkohli Feb 15, 2021
6a02fe4
ADD tests for network embedding
ravinkohli Feb 15, 2021
3f7c2cc
print paths of parent dir
ravinkohli Feb 26, 2021
f765347
Trying to run examples
ravinkohli Feb 26, 2021
6ad8550
Trying to run examples
ravinkohli Feb 27, 2021
011c0ef
Add success model
ravinkohli Mar 1, 2021
1efc39a
Added parent directory for printing paths
ravinkohli Mar 1, 2021
3d54db8
print log file to see if backend is saving num run
ravinkohli Mar 1, 2021
6c5e8be
Setup logger in backend
ravinkohli Mar 1, 2021
8734384
try without embeddings
ravinkohli Mar 1, 2021
8941c95
no embedding for python 3.6
ravinkohli Mar 2, 2021
6de97e1
merge
ravinkohli Mar 2, 2021
5aec1e1
Deleted debug example
ravinkohli Mar 2, 2021
36ae93c
Fix test for evaluation
ravinkohli Mar 2, 2021
c9ef56e
Deleted utils file
ravinkohli Mar 4, 2021
3 changes: 2 additions & 1 deletion .github/workflows/examples.yml
Original file line number Diff line number Diff line change
@@ -31,4 +31,5 @@ jobs:
- name: Run tests
run: |
python examples/example_tabular_classification.py
python examples/example_image_classification.py
python examples/example_tabular_regression.py
python examples/example_image_classification.py
79 changes: 41 additions & 38 deletions autoPyTorch/data/tabular_feature_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import sklearn.utils
from sklearn import preprocessing
from sklearn.base import BaseEstimator
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import NotFittedError

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES
@@ -53,16 +53,34 @@ def _fit(
for column in X.columns:
if X[column].isna().all():
X[column] = pd.to_numeric(X[column])
# Also note this change in self.dtypes
if len(self.dtypes) != 0:
self.dtypes[list(X.columns).index(column)] = X[column].dtype

self.enc_columns, self.feat_type = self._get_columns_to_encode(X)

if len(self.enc_columns) > 0:

self.encoder = make_column_transformer(
(preprocessing.OrdinalEncoder(
handle_unknown='use_encoded_value',
unknown_value=-1,
), self.enc_columns),
# impute missing values before encoding,
# remove once sklearn natively supports
# it in ordinal encoding. Sklearn issue:
# "https://github.com/scikit-learn/scikit-learn/issues/17123)"
for column in self.enc_columns:
if X[column].isna().any():
missing_value: typing.Union[int, str] = -1
# make sure for a string column we give
# string missing value else we give numeric
if type(X[column][0]) == str:
missing_value = str(missing_value)
X[column] = X[column].cat.add_categories([missing_value])
X[column] = X[column].fillna(missing_value)

self.encoder = ColumnTransformer(
[
("encoder",
preprocessing.OrdinalEncoder(
handle_unknown='use_encoded_value',
unknown_value=-1,
), self.enc_columns)],
remainder="passthrough"
)

@@ -85,6 +103,7 @@ def comparator(cmp1: str, cmp2: str) -> int:
return 1
else:
raise ValueError((cmp1, cmp2))

self.feat_type = sorted(
self.feat_type,
key=functools.cmp_to_key(comparator)
@@ -182,9 +201,8 @@ def _check_data(
if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X):
raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
" scipy sparse and Python Lists, yet, the provided input is"
" of type {}".format(
type(X)
))
" of type {}".format(type(X))
)

if self.data_type is None:
self.data_type = type(X)
@@ -217,39 +235,25 @@ def _check_data(
# per estimator
enc_columns, _ = self._get_columns_to_encode(X)

if len(enc_columns) > 0:
if np.any(pd.isnull(
X[enc_columns].dropna( # type: ignore[call-overload]
axis='columns', how='all')
)):
# Ignore all NaN columns, and if still a NaN
# Error out
raise ValueError("Categorical features in a dataframe cannot contain "
"missing/NaN values. The OrdinalEncoder used by "
"AutoPyTorch cannot handle this yet (due to a "
"limitation on scikit-learn being addressed via: "
"https://github.com/scikit-learn/scikit-learn/issues/17123)"
)
column_order = [column for column in X.columns]
if len(self.column_order) > 0:
if self.column_order != column_order:
raise ValueError("Changing the column order of the features after fit() is "
"not supported. Fit() method was called with "
"{} whereas the new features have {} as type".format(
self.column_order,
column_order,
))
"{} whereas the new features have {} as type".format(self.column_order,
column_order,)
)
else:
self.column_order = column_order
dtypes = [dtype.name for dtype in X.dtypes]
if len(self.dtypes) > 0:
if self.dtypes != dtypes:
raise ValueError("Changing the dtype of the features after fit() is "
"not supported. Fit() method was called with "
"{} whereas the new features have {} as type".format(
self.dtypes,
dtypes,
))
"{} whereas the new features have {} as type".format(self.dtypes,
dtypes,
)
)
else:
self.dtypes = dtypes

@@ -294,7 +298,8 @@ def _get_columns_to_encode(
"pandas.Series.astype ."
"If working with string objects, the following "
"tutorial illustrates how to work with text data: "
"https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( # noqa: E501
"https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(
# noqa: E501
column,
)
)
@@ -349,15 +354,13 @@ def list_to_dataframe(
# If a list was provided, it will be converted to pandas
X_train = pd.DataFrame(data=X_train).infer_objects()
self.logger.warning("The provided feature types to AutoPyTorch are of type list."
"Features have been interpreted as: {}".format(
[(col, t) for col, t in zip(X_train.columns, X_train.dtypes)]
))
"Features have been interpreted as: {}".format([(col, t) for col, t in
zip(X_train.columns, X_train.dtypes)]))
if X_test is not None:
if not isinstance(X_test, list):
self.logger.warning("Train features are a list while the provided test data"
"is {}. X_test will be casted as DataFrame.".format(
type(X_test)
))
"is {}. X_test will be casted as DataFrame.".format(type(X_test))
)
X_test = pd.DataFrame(data=X_test).infer_objects()
return X_train, X_test

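The hunk above swaps `make_column_transformer` for an explicit `ColumnTransformer` and fills missing values in categorical columns with a sentinel before ordinal encoding, as a workaround for scikit-learn issue #17123. Below is a minimal, self-contained sketch of that pattern; the toy DataFrame and column names are illustrative, not taken from the validator itself.

```python
# Minimal sketch: fill NaNs in categorical columns with a sentinel, then
# ordinal-encode them via an explicit ColumnTransformer with passthrough.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

X = pd.DataFrame({
    "color": pd.Series(["red", None, "blue"], dtype="category"),
    "size": [1.0, 2.0, 3.0],
})

enc_columns = ["color"]
for column in enc_columns:
    if X[column].isna().any():
        # use a string sentinel for string categories, a numeric one otherwise
        missing_value = "-1" if isinstance(X[column].dropna().iloc[0], str) else -1
        X[column] = X[column].cat.add_categories([missing_value]).fillna(missing_value)

encoder = ColumnTransformer(
    [("encoder",
      OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
      enc_columns)],
    remainder="passthrough",
)
print(encoder.fit_transform(X))  # categorical column encoded, "size" passed through
```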
3 changes: 3 additions & 0 deletions autoPyTorch/evaluation/abstract_evaluator.py
Original file line number Diff line number Diff line change
@@ -331,6 +331,8 @@ def __init__(self, backend: Backend,
name=logger_name,
port=logger_port,
)
self.backend.setup_logger(name=logger_name, port=logger_port)

self.Y_optimization: Optional[np.ndarray] = None
self.Y_actual_train: Optional[np.ndarray] = None
self.pipelines: Optional[List[BaseEstimator]] = None
@@ -538,6 +540,7 @@ def file_output(
else:
pipeline = None

self.logger.debug("Saving directory {}, {}, {}".format(self.seed, self.num_run, self.budget))
self.backend.save_numrun_to_dir(
seed=int(self.seed),
idx=int(self.num_run),
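The evaluator now also wires a logger into the backend (`backend.setup_logger`) so that file operations such as `save_numrun_to_dir` appear in the run log. The sketch below only illustrates the general idea of a named, port-based client logger using the standard library; it is an assumption about the shape of that call, not AutoPyTorch's actual implementation.

```python
# Assumed pattern: a named logger that forwards records to a log receiver
# listening on a port, so backend file operations are traceable.
import logging
import logging.handlers


def setup_logger(name: str, port: int, host: str = "localhost") -> logging.Logger:
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    # forward pickled LogRecords to whatever log server listens on `port`
    logger.addHandler(logging.handlers.SocketHandler(host, port))
    return logger


backend_logger = setup_logger("AutoPyTorch-backend", port=9020)
backend_logger.debug("Saving directory %s, %s, %s", 1, 2, 50.0)
```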
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@

import numpy as np

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

import torch
@@ -57,9 +57,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
if len(X['dataset_properties']['categorical_columns']):
categorical_pipeline = make_pipeline(*preprocessors['categorical'])

self.preprocessor = make_column_transformer(
(numerical_pipeline, X['dataset_properties']['numerical_columns']),
(categorical_pipeline, X['dataset_properties']['categorical_columns']),
self.preprocessor = ColumnTransformer([
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])],
remainder='passthrough'
)

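As in the feature validator, the preprocessor here moves from `make_column_transformer` to an explicit `ColumnTransformer` with named numerical and categorical pipelines. A rough standalone sketch follows; the imputer, scaler, encoder and column indices are placeholders for whatever preprocessors the pipeline actually builds.

```python
# Sketch of the named-pipeline ColumnTransformer; steps and columns are made up.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numerical_columns = [0, 1]     # column indices, as dataset_properties would provide
categorical_columns = [2]

numerical_pipeline = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))

preprocessor = ColumnTransformer(
    [
        ("numerical_pipeline", numerical_pipeline, numerical_columns),
        ("categorical_pipeline", categorical_pipeline, categorical_columns),
    ],
    remainder="passthrough",
)

X = np.array([[1.0, 2.0, 0], [3.0, np.nan, 1], [5.0, 6.0, 0]])
Xt = preprocessor.fit_transform(X)
# Explicit names make the fitted steps easy to look up later, e.g.
# preprocessor.named_transformers_["categorical_pipeline"], instead of the
# auto-generated names make_column_transformer would assign.
```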

This file was deleted.

Original file line number Diff line number Diff line change
@@ -65,7 +65,7 @@ def get_hyperparameter_search_space(self,
raise ValueError("no encoders found, please add a encoder")

if default is None:
defaults = ['OneHotEncoder', 'OrdinalEncoder', 'NoEncoder']
defaults = ['OneHotEncoder', 'NoEncoder']
for default_ in defaults:
if default_ in available_preprocessors:
if include is not None and default_ not in include:
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@ def __init__(
self.add_fit_requirements([
FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False),
FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False),
FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False),
])
self.final_activation = None

@@ -47,7 +48,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent:
# information to fit this stage
self.check_requirements(X, y)

self.network = torch.nn.Sequential(X['network_backbone'], X['network_head'])
self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head'])

# Properly set the network training device
if self.device is None:
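With the new `network_embedding` fit requirement, the final network is assembled as embedding -> backbone -> head instead of backbone -> head. A toy sketch of that composition, with made-up layer sizes and an identity module standing in for the "no embedding" case:

```python
# Toy sketch of the new network assembly; in AutoPyTorch each part is produced
# by its own pipeline component, the sizes here are illustrative only.
import torch
from torch import nn

network_embedding = nn.Identity()                       # pass-through when no embedding is used
network_backbone = nn.Sequential(nn.Linear(10, 64), nn.ReLU())
network_head = nn.Linear(64, 2)

network = nn.Sequential(network_embedding, network_backbone, network_head)

x = torch.randn(8, 10)                                  # batch of 8 rows with 10 features
print(network(x).shape)                                 # torch.Size([8, 2])
```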
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@
from autoPyTorch.pipeline.components.base_component import (
autoPyTorchComponent,
)
from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape
from autoPyTorch.utils.common import FitRequirement


@@ -31,7 +32,9 @@ def __init__(self,
FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True,
dataset_property=False),
FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True),
FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False)])
FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False),
FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False)
])
self.backbone: nn.Module = None
self.config = kwargs
self.input_shape: Optional[Iterable] = None
@@ -56,6 +59,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
column_transformer = X['tabular_transformer'].preprocessor
input_shape = column_transformer.transform(X_train[:1]).shape[1:]

input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape)
self.input_shape = input_shape

self.backbone = self.build_backbone(
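The backbone now derives its input shape from the embedding's output via `get_output_shape`, rather than straight from the column transformer. The helper below mimics that idea with a dummy forward pass; it is a guess at the mechanism for illustration, not the actual utility from `network_backbone.utils`.

```python
# Illustrative helper: infer a module's output shape by pushing one fake sample
# through it, the way the backbone's input shape is derived from the embedding.
from typing import Tuple

import torch
from torch import nn


def output_shape_of(module: nn.Module, input_shape: Tuple[int, ...]) -> Tuple[int, ...]:
    placeholder = torch.randn((1, *input_shape), dtype=torch.float)
    with torch.no_grad():
        output = module(placeholder)
    return tuple(output.shape[1:])


embedding = nn.Linear(20, 32)          # stand-in for a learned embedding module
input_shape = (20,)                    # shape produced by the column transformer
input_shape = output_shape_of(embedding, input_shape=input_shape)
print(input_shape)                     # (32,) -> what build_backbone would receive
```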