[ADD] Allow users to pass feat types to tabular validator #441

Merged — 8 commits, Jul 11, 2022
9 changes: 8 additions & 1 deletion autoPyTorch/api/base_task.py
@@ -307,6 +307,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> Tuple[BaseDataset, BaseInputValidator]:
"""
Returns an object of a child class of `BaseDataset` and
@@ -353,6 +354,7 @@ def get_dataset(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> BaseDataset:
"""
Returns an object of a child class of `BaseDataset` according to the current task.
@@ -407,6 +409,10 @@ def get_dataset(
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.
kwargs (Any):
Can be used to pass task-specific dataset arguments. Currently supports
passing `feat_types` for tabular tasks, which specifies whether each
feature is 'numerical' or 'categorical'.

Returns:
BaseDataset:
@@ -420,7 +426,8 @@ def get_dataset(
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=dataset_compression)
dataset_compression=dataset_compression,
**kwargs)

return dataset

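For context, and not part of the diff: a minimal sketch of how the forwarded `**kwargs` is intended to be used from the public API. The data and column layout below are invented for illustration, and the keyword arguments are assumed from the signatures above.

```python
import numpy as np

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Two columns: the first numerical, the second an integer-coded category.
X_train = np.array([[1.0, 0], [2.5, 1], [3.2, 0], [0.4, 1]])
y_train = np.array([0, 1, 1, 0])

api = TabularClassificationTask()

# get_dataset now forwards unknown keyword arguments to the task-specific
# _get_dataset_input_validator; for tabular tasks this includes feat_types.
dataset = api.get_dataset(
    X_train=X_train,
    y_train=y_train,
    feat_types=['numerical', 'categorical'],  # one entry per column
)
```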
16 changes: 14 additions & 2 deletions autoPyTorch/api/tabular_classification.py
@@ -168,6 +168,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -194,6 +195,9 @@ def _get_dataset_input_validator(
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
kwargs (Any):
Currently, for tabular tasks, expects `feat_types (Optional[List[str]])`,
which specifies whether each feature is 'numerical' or 'categorical'.

Returns:
TabularDataset:
@@ -206,12 +210,14 @@ def _get_dataset_input_validator(
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

feat_types = kwargs.pop('feat_types', None)
# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
input_validator = TabularInputValidator(
is_classification=True,
logger_port=self._logger_port,
dataset_compression=dataset_compression
dataset_compression=dataset_compression,
feat_types=feat_types
)

# Fit an input validator to check the provided data
@@ -238,6 +244,7 @@ def search(
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
dataset_name: Optional[str] = None,
feat_types: Optional[List[str]] = None,
budget_type: str = 'epochs',
min_budget: int = 5,
max_budget: int = 50,
@@ -266,6 +273,10 @@ def search(
A pair of features (X_train) and targets (y_train) used to fit a
pipeline. Additionally, a holdout of these pairs (X_test, y_test) can
be provided to track the generalization performance of each stage.
feat_types (Optional[List[str]]):
Description of the feature types of the columns.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string, and bool data. Defaults to None.
optimize_metric (str):
name of the metric that is used to evaluate a pipeline.
budget_type (str):
@@ -433,7 +444,8 @@ def search(
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)
dataset_compression=self._dataset_compression,
feat_types=feat_types)
Collaborator:
Can we check near here if `feat_types` includes only the possible options, i.e. either numerical or categorical?

Contributor (Author):
I have added this check to the tabular feature validator.


return self._search(
dataset=self.dataset,
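As a usage sketch of the new `search` argument (not from the diff; the data, metric, and time limit are arbitrary illustrative choices):

```python
import pandas as pd

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X = pd.DataFrame({'age': [23, 45, 31, 52], 'city': ['a', 'b', 'a', 'c']})
y = pd.Series([0, 1, 0, 1])

api = TabularClassificationTask()
api.search(
    X_train=X,
    y_train=y,
    feat_types=['numerical', 'categorical'],  # overrides dtype-based inference
    optimize_metric='accuracy',
    total_walltime_limit=300,
)
```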
16 changes: 14 additions & 2 deletions autoPyTorch/api/tabular_regression.py
@@ -169,6 +169,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -195,6 +196,9 @@ def _get_dataset_input_validator(
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
kwargs (Any):
Currently, for tabular tasks, expects `feat_types (Optional[List[str]])`,
which specifies whether each feature is 'numerical' or 'categorical'.
Returns:
TabularDataset:
the dataset object.
@@ -206,12 +210,14 @@ def _get_dataset_input_validator(
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

feat_types = kwargs.pop('feat_types', None)
# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
input_validator = TabularInputValidator(
is_classification=False,
logger_port=self._logger_port,
dataset_compression=dataset_compression
dataset_compression=dataset_compression,
feat_types=feat_types
)

# Fit an input validator to check the provided data
@@ -238,6 +244,7 @@ def search(
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
dataset_name: Optional[str] = None,
feat_types: Optional[List[str]] = None,
budget_type: str = 'epochs',
min_budget: int = 5,
max_budget: int = 50,
@@ -266,6 +273,10 @@ def search(
A pair of features (X_train) and targets (y_train) used to fit a
pipeline. Additionally, a holdout of these pairs (X_test, y_test) can
be provided to track the generalization performance of each stage.
feat_types (Optional[List[str]]):
Description of the feature types of the columns.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string, and bool data. Defaults to None.
optimize_metric (str):
Name of the metric that is used to evaluate a pipeline.
budget_type (str):
@@ -434,7 +445,8 @@ def search(
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)
dataset_compression=self._dataset_compression,
feat_types=feat_types)

return self._search(
dataset=self.dataset,
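The regression API mirrors the classification one. An equivalent sketch, again with invented data and an arbitrary metric:

```python
import pandas as pd

from autoPyTorch.api.tabular_regression import TabularRegressionTask

X = pd.DataFrame({'size': [40.0, 75.5, 120.0, 95.0],
                  'district': ['a', 'b', 'a', 'c']})
y = pd.Series([150.0, 320.0, 480.0, 410.0])

api = TabularRegressionTask()
api.search(
    X_train=X,
    y_train=y,
    feat_types=['numerical', 'categorical'],
    optimize_metric='r2',
    total_walltime_limit=300,
)
```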
2 changes: 1 addition & 1 deletion autoPyTorch/data/base_feature_validator.py
@@ -35,7 +35,7 @@ def __init__(
logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None,
):
# Register types to detect unsupported data format changes
self.feat_type: Optional[List[str]] = None
self.feat_types: Optional[List[str]] = None
self.data_type: Optional[type] = None
self.dtypes: List[str] = []
self.column_order: List[str] = []
95 changes: 82 additions & 13 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -94,12 +94,18 @@ class TabularFeatureValidator(BaseFeatureValidator):
List of indices of numerical columns
categorical_columns (List[int]):
List of indices of categorical columns
feat_types (Optional[List[str]]):
Description of the feature types of the columns.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string, and bool data.
"""
def __init__(
self,
logger: Optional[Union[PicklableClientLogger, Logger]] = None,
feat_types: Optional[List[str]] = None,
):
super().__init__(logger)
self.feat_types = feat_types

@staticmethod
def _comparator(cmp1: str, cmp2: str) -> int:
@@ -167,9 +173,9 @@ def _fit(
if not X.select_dtypes(include='object').empty:
X = self.infer_objects(X)

self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)

assert self.feat_type is not None
assert self.feat_types is not None

if len(self.transformed_columns) > 0:

@@ -186,8 +192,8 @@ def _fit(
# The column transformer reorders the feature types
# therefore, we need to change the order of columns as well
# This means categorical columns are shifted to the left
self.feat_type = sorted(
self.feat_type,
self.feat_types = sorted(
self.feat_types,
key=functools.cmp_to_key(self._comparator)
)

@@ -201,7 +207,7 @@ def _fit(
for cat in encoded_categories
]

for i, type_ in enumerate(self.feat_type):
for i, type_ in enumerate(self.feat_types):
if 'numerical' in type_:
self.numerical_columns.append(i)
else:
@@ -336,7 +342,7 @@ def _check_data(

# Define the column to be encoded here as the feature validator is fitted once
# per estimator
self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)

column_order = [column for column in X.columns]
if len(self.column_order) > 0:
@@ -361,12 +367,72 @@ def _check_data(
else:
self.dtypes = dtypes

def get_columns_to_encode(
self,
X: pd.DataFrame
) -> Tuple[List[str], List[str]]:
"""
Return the columns to be transformed as well as
the type of feature for each column.

The returned values depend on the `feat_types` passed to `__init__`.

Args:
X (pd.DataFrame):
A set of features that are going to be validated (type and dimensionality
checks); an encoder is fitted in case the data needs encoding.

Returns:
transformed_columns (List[str]):
Columns to encode, if any
feat_types (List[str]):
Type of each column, 'numerical' or 'categorical'
"""
transformed_columns, feat_types = self._get_columns_to_encode(X)
if self.feat_types is not None:
self._validate_feat_types(X)
transformed_columns = [X.columns[i] for i, col in enumerate(self.feat_types)
if col.lower() == 'categorical']
return transformed_columns, self.feat_types
else:
return transformed_columns, feat_types

def _validate_feat_types(self, X: pd.DataFrame) -> None:
"""
Checks that the passed `feat_types` is compatible with what
AutoPyTorch expects, i.e., it should only contain `numerical`
or `categorical` entries (case-insensitive), and the number of
feature types should equal the number of features.

Args:
X (pd.DataFrame):
input features set

Raises:
ValueError:
if the number of `feat_types` is not equal to the number of features,
or if a feature type is not one of "numerical", "categorical"
"""
assert self.feat_types is not None # mypy check

if len(self.feat_types) != len(X.columns):
raise ValueError(f"Expected number of `feat_types`: {len(self.feat_types)}"
f" to be the same as the number of features {len(X.columns)}")
for feat_type in set(self.feat_types):
if feat_type.lower() not in ['numerical', 'categorical']:
raise ValueError(f"Expected type of features to be in `['numerical', "
f"'categorical']`, but got {feat_type}")

def _get_columns_to_encode(
self,
X: pd.DataFrame,
) -> Tuple[List[str], List[str]]:
"""
Return the columns to be encoded from a pandas dataframe
Return the columns to be transformed as well as
the type of feature for each column from a pandas dataframe.

If `self.feat_types` is not None, it also validates that the
dataframe dtypes don't disagree with the ones passed in `__init__`.

Args:
X (pd.DataFrame)
@@ -380,21 +446,24 @@ def _get_columns_to_encode(
Type of each column numerical/categorical
"""

if len(self.transformed_columns) > 0 and self.feat_type is not None:
return self.transformed_columns, self.feat_type
if len(self.transformed_columns) > 0 and self.feat_types is not None:
return self.transformed_columns, self.feat_types

# Register if a column needs encoding
transformed_columns = []

# Also, register the feature types for the estimator
feat_type = []
feat_types = []

# Make sure each column is a valid type
for i, column in enumerate(X.columns):
if X[column].dtype.name in ['category', 'bool']:

transformed_columns.append(column)
feat_type.append('categorical')
if self.feat_types is not None and self.feat_types[i].lower() == 'numerical':
raise ValueError(f"Passed numerical as the feature type for column: {column} "
f"but the column is categorical")
feat_types.append('categorical')
# Move away from np.issubdtype as it causes
# TypeError: data type not understood in certain pandas types
elif not is_numeric_dtype(X[column]):
@@ -434,8 +503,8 @@ def _get_columns_to_encode(
)
)
else:
feat_type.append('numerical')
return transformed_columns, feat_type
feat_types.append('numerical')
return transformed_columns, feat_types

def list_to_dataframe(
self,
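To make the new checks concrete, a small sketch of the failure and success modes implied by `get_columns_to_encode`, `_validate_feat_types`, and `_get_columns_to_encode` above (data invented; error messages elided):

```python
import pandas as pd

from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator

X = pd.DataFrame({'a': [1.0, 2.0], 'b': pd.Categorical(['x', 'y'])})

# Declaring a category-dtype column as numerical raises a ValueError
# from _get_columns_to_encode.
try:
    TabularFeatureValidator(feat_types=['numerical', 'numerical']).fit(X)
except ValueError as e:
    print(e)

# An unknown type name raises a ValueError from _validate_feat_types.
try:
    TabularFeatureValidator(feat_types=['numerical', 'text']).fit(X)
except ValueError as e:
    print(e)

# A length mismatch also raises a ValueError from _validate_feat_types.
try:
    TabularFeatureValidator(feat_types=['numerical']).fit(
        pd.DataFrame({'a': [1.0, 2.0], 'c': [3.0, 4.0]}))
except ValueError as e:
    print(e)

# A consistent specification passes; column 'b' is routed to the encoder.
validator = TabularFeatureValidator(feat_types=['numerical', 'categorical'])
validator.fit(X)
```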
11 changes: 9 additions & 2 deletions autoPyTorch/data/tabular_validator.py
@@ -1,6 +1,6 @@
# -*- encoding: utf-8 -*-
import logging
from typing import Optional, Tuple, Union
from typing import List, Optional, Tuple, Union

import numpy as np

@@ -41,18 +41,24 @@ class TabularInputValidator(BaseInputValidator):
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
feat_types (Optional[List[str]]):
Description of the feature types of the columns.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string, and bool data.
"""
def __init__(
self,
is_classification: bool = False,
logger_port: Optional[int] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
feat_types: Optional[List[str]] = None,
seed: int = 42,
):
self.dataset_compression = dataset_compression
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
self.is_classification = is_classification
self.logger_port = logger_port
self.feat_types = feat_types
self.seed = seed
if self.logger_port is not None:
self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
@@ -63,7 +69,8 @@ def __init__(
self.logger = logging.getLogger('Validation')

self.feature_validator = TabularFeatureValidator(
logger=self.logger)
logger=self.logger,
feat_types=self.feat_types)
self.target_validator = TabularTargetValidator(
is_classification=self.is_classification,
logger=self.logger
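Finally, a sketch of the plumbing this file adds: `TabularInputValidator` stores `feat_types` and forwards it to its `TabularFeatureValidator`. The data and the printed values are illustrative assumptions, not output from the diff.

```python
import pandas as pd

from autoPyTorch.data.tabular_validator import TabularInputValidator

X = pd.DataFrame({'age': [23, 45, 31], 'city': pd.Categorical(['a', 'b', 'a'])})
y = pd.Series([0, 1, 0])

validator = TabularInputValidator(
    is_classification=True,
    feat_types=['numerical', 'categorical'],  # forwarded to TabularFeatureValidator
)
validator.fit(X_train=X, y_train=y)

# After fitting, the feature validator exposes the reordered feature types
# (categorical columns are shifted to the left) and the derived indices.
print(validator.feature_validator.feat_types)           # e.g. ['categorical', 'numerical']
print(validator.feature_validator.categorical_columns)  # e.g. [0]
```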