[ADD] Allow users to pass feat types to tabular validator #441

Merged — 8 commits, Jul 11, 2022
9 changes: 8 additions & 1 deletion autoPyTorch/api/base_task.py
@@ -307,6 +307,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> Tuple[BaseDataset, BaseInputValidator]:
"""
Returns an object of a child class of `BaseDataset` and
@@ -353,6 +354,7 @@ def get_dataset(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> BaseDataset:
"""
Returns an object of a child class of `BaseDataset` according to the current task.
@@ -407,6 +409,10 @@ def get_dataset(
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.
kwargs (Any):
Can be used to pass task-specific dataset arguments. Currently supports
passing `feat_types` for tabular tasks, which specifies whether each
feature is 'numerical' or 'categorical'.

Returns:
BaseDataset:
@@ -420,7 +426,8 @@ def get_dataset(
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=dataset_compression)
dataset_compression=dataset_compression,
**kwargs)

return dataset

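For context, and not part of the diff: a minimal sketch of how the forwarded `**kwargs` is intended to be used from the public API. The data and column layout below are invented for illustration, and the keyword arguments are assumed from the signatures above.

```python
import numpy as np

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Two columns: the first numerical, the second an integer-coded category.
X_train = np.array([[1.0, 0], [2.5, 1], [3.2, 0], [0.4, 1]])
y_train = np.array([0, 1, 1, 0])

api = TabularClassificationTask()

# get_dataset now forwards unknown keyword arguments to the task-specific
# _get_dataset_input_validator; for tabular tasks this includes feat_types.
dataset = api.get_dataset(
    X_train=X_train,
    y_train=y_train,
    feat_types=['numerical', 'categorical'],  # one entry per column
)
```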
16 changes: 14 additions & 2 deletions autoPyTorch/api/tabular_classification.py
@@ -168,6 +168,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -194,6 +195,9 @@ def _get_dataset_input_validator(
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
kwargs (Any):
Currently, for tabular tasks, expects `feat_types (Optional[List[str]])`,
which specifies whether each feature is 'numerical' or 'categorical'.

Returns:
TabularDataset:
@@ -206,12 +210,14 @@ def _get_dataset_input_validator(
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

feat_types = kwargs.pop('feat_types', None)
# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
input_validator = TabularInputValidator(
is_classification=True,
logger_port=self._logger_port,
dataset_compression=dataset_compression
dataset_compression=dataset_compression,
feat_types=feat_types
)

# Fit an input validator to check the provided data
@@ -238,6 +244,7 @@ def search(
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
dataset_name: Optional[str] = None,
feat_types: Optional[List[str]] = None,
budget_type: str = 'epochs',
min_budget: int = 5,
max_budget: int = 50,
@@ -266,6 +273,10 @@ def search(
A pair of features (X_train) and targets (y_train) used to fit a
pipeline. Additionally, a holdout of these pairs (X_test, y_test) can
be provided to track the generalization performance of each stage.
feat_types (Optional[List[str]]):
Description of the feature types of the columns.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string, and bool data. Defaults to None.
optimize_metric (str):
name of the metric that is used to evaluate a pipeline.
budget_type (str):
@@ -433,7 +444,8 @@ def search(
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)
dataset_compression=self._dataset_compression,
feat_types=feat_types)
Collaborator:
Can we check near here if `feat_types` includes only the possible options, i.e. either numerical or categorical?

Contributor (Author):
I have added this check to the tabular feature validator.


return self._search(
dataset=self.dataset,
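As a usage sketch of the new `search` argument (not from the diff; the data, metric, and time limit are arbitrary illustrative choices):

```python
import pandas as pd

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X = pd.DataFrame({'age': [23, 45, 31, 52], 'city': ['a', 'b', 'a', 'c']})
y = pd.Series([0, 1, 0, 1])

api = TabularClassificationTask()
api.search(
    X_train=X,
    y_train=y,
    feat_types=['numerical', 'categorical'],  # overrides dtype-based inference
    optimize_metric='accuracy',
    total_walltime_limit=300,
)
```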
16 changes: 14 additions & 2 deletions autoPyTorch/api/tabular_regression.py
@@ -169,6 +169,7 @@ def _get_dataset_input_validator(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
**kwargs: Any
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
Returns an object of `TabularDataset` and an object of
@@ -195,6 +196,9 @@ def _get_dataset_input_validator(
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
kwargs (Any):
Currently, for tabular tasks, expects `feat_types (Optional[List[str]])`,
which specifies whether each feature is 'numerical' or 'categorical'.
Returns:
TabularDataset:
the dataset object.
@@ -206,12 +210,14 @@ def _get_dataset_input_validator(
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

feat_types = kwargs.pop('feat_types', None)
# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
input_validator = TabularInputValidator(
is_classification=False,
logger_port=self._logger_port,
dataset_compression=dataset_compression
dataset_compression=dataset_compression,
feat_types=feat_types
)

# Fit an input validator to check the provided data
@@ -238,6 +244,7 @@ def search(
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
dataset_name: Optional[str] = None,
feat_types: Optional[List[str]] = None,
budget_type: str = 'epochs',
min_budget: int = 5,
max_budget: int = 50,
@@ -266,6 +273,10 @@ def search(
A pair of features (X_train) and targets (y_train) used to fit a
pipeline. Additionally, a holdout of these pairs (X_test, y_test) can
be provided to track the generalization performance of each stage.
feat_types (Optional[List[str]]):
Description of the feature types of the columns.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string, and bool data. Defaults to None.
optimize_metric (str):
Name of the metric that is used to evaluate a pipeline.
budget_type (str):
@@ -434,7 +445,8 @@ def search(
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)
dataset_compression=self._dataset_compression,
feat_types=feat_types)

return self._search(
dataset=self.dataset,
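The regression API mirrors the classification one. An equivalent sketch, again with invented data and an arbitrary metric:

```python
import pandas as pd

from autoPyTorch.api.tabular_regression import TabularRegressionTask

X = pd.DataFrame({'size': [40.0, 75.5, 120.0, 95.0],
                  'district': ['a', 'b', 'a', 'c']})
y = pd.Series([150.0, 320.0, 480.0, 410.0])

api = TabularRegressionTask()
api.search(
    X_train=X,
    y_train=y,
    feat_types=['numerical', 'categorical'],
    optimize_metric='r2',
    total_walltime_limit=300,
)
```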
2 changes: 1 addition & 1 deletion autoPyTorch/data/base_feature_validator.py
@@ -35,7 +35,7 @@ def __init__(
logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None,
):
# Register types to detect unsupported data format changes
self.feat_type: Optional[List[str]] = None
self.feat_types: Optional[List[str]] = None
self.data_type: Optional[type] = None
self.dtypes: List[str] = []
self.column_order: List[str] = []
95 changes: 82 additions & 13 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -94,12 +94,18 @@ class TabularFeatureValidator(BaseFeatureValidator):
List of indices of numerical columns
categorical_columns (List[int]):
List of indices of categorical columns
feat_types (Optional[List[str]]):
Description of the feature types of the columns.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string, and bool data.
"""
def __init__(
self,
logger: Optional[Union[PicklableClientLogger, Logger]] = None,
feat_types: Optional[List[str]] = None,
):
super().__init__(logger)
self.feat_types = feat_types

@staticmethod
def _comparator(cmp1: str, cmp2: str) -> int:
@@ -167,9 +173,9 @@ def _fit(
if not X.select_dtypes(include='object').empty:
X = self.infer_objects(X)

self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)

assert self.feat_type is not None
assert self.feat_types is not None

if len(self.transformed_columns) > 0:

@@ -186,8 +192,8 @@ def _fit(
# The column transformer reorders the feature types
# therefore, we need to change the order of columns as well
# This means categorical columns are shifted to the left
self.feat_type = sorted(
self.feat_type,
self.feat_types = sorted(
self.feat_types,
key=functools.cmp_to_key(self._comparator)
)

@@ -201,7 +207,7 @@ def _fit(
for cat in encoded_categories
]

for i, type_ in enumerate(self.feat_type):
for i, type_ in enumerate(self.feat_types):
if 'numerical' in type_:
self.numerical_columns.append(i)
else:
@@ -336,7 +342,7 @@ def _check_data(

# Define the column to be encoded here as the feature validator is fitted once
# per estimator
self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)

column_order = [column for column in X.columns]
if len(self.column_order) > 0:
@@ -361,12 +367,72 @@ def _check_data(
else:
self.dtypes = dtypes

def get_columns_to_encode(
self,
X: pd.DataFrame
) -> Tuple[List[str], List[str]]:
"""
Return the columns to be transformed as well as
the type of feature for each column.

The returned values depend on the `feat_types` passed to `__init__`.

Args:
X (pd.DataFrame):
A set of features that are going to be validated (type and dimensionality
checks); an encoder is fitted in case the data needs encoding.

Returns:
transformed_columns (List[str]):
Columns to encode, if any
feat_types (List[str]):
Type of each column, 'numerical' or 'categorical'
"""
transformed_columns, feat_types = self._get_columns_to_encode(X)
if self.feat_types is not None:
self._validate_feat_types(X)
transformed_columns = [X.columns[i] for i, col in enumerate(self.feat_types)
if col.lower() == 'categorical']
return transformed_columns, self.feat_types
else:
return transformed_columns, feat_types

def _validate_feat_types(self, X: pd.DataFrame) -> None:
"""
Checks that the passed `feat_types` is compatible with what
AutoPyTorch expects, i.e., it should only contain `numerical`
or `categorical` entries (case-insensitive), and the number of
feature types should equal the number of features.

Args:
X (pd.DataFrame):
input features set

Raises:
ValueError:
if the number of `feat_types` is not equal to the number of features,
or if a feature type is not one of "numerical", "categorical"
"""
assert self.feat_types is not None # mypy check

if len(self.feat_types) != len(X.columns):
raise ValueError(f"Expected number of `feat_types`: {len(self.feat_types)}"
f" to be the same as the number of features {len(X.columns)}")
for feat_type in set(self.feat_types):
if feat_type.lower() not in ['numerical', 'categorical']:
raise ValueError(f"Expected type of features to be in `['numerical', "
f"'categorical']`, but got {feat_type}")

def _get_columns_to_encode(
self,
X: pd.DataFrame,
) -> Tuple[List[str], List[str]]:
"""
Return the columns to be encoded from a pandas dataframe
Return the columns to be transformed as well as
the type of feature for each column from a pandas dataframe.

If `self.feat_types` is not None, it also validates that the
dataframe dtypes don't disagree with the ones passed in `__init__`.

Args:
X (pd.DataFrame)
@@ -380,21 +446,24 @@ def _get_columns_to_encode(
Type of each column numerical/categorical
"""

if len(self.transformed_columns) > 0 and self.feat_type is not None:
return self.transformed_columns, self.feat_type
if len(self.transformed_columns) > 0 and self.feat_types is not None:
return self.transformed_columns, self.feat_types

# Register if a column needs encoding
transformed_columns = []

# Also, register the feature types for the estimator
feat_type = []
feat_types = []

# Make sure each column is a valid type
for i, column in enumerate(X.columns):
if X[column].dtype.name in ['category', 'bool']:

transformed_columns.append(column)
feat_type.append('categorical')
if self.feat_types is not None and self.feat_types[i].lower() == 'numerical':
raise ValueError(f"Passed numerical as the feature type for column: {column} "
f"but the column is categorical")
feat_types.append('categorical')
# Move away from np.issubdtype as it causes
# TypeError: data type not understood in certain pandas types
elif not is_numeric_dtype(X[column]):
@@ -434,8 +503,8 @@ def _get_columns_to_encode(
)
)
else:
feat_type.append('numerical')
return transformed_columns, feat_type
feat_types.append('numerical')
return transformed_columns, feat_types

def list_to_dataframe(
self,
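To make the new checks concrete, a small sketch of the failure and success modes implied by `get_columns_to_encode`, `_validate_feat_types`, and `_get_columns_to_encode` above (data invented; error messages elided):

```python
import pandas as pd

from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator

X = pd.DataFrame({'a': [1.0, 2.0], 'b': pd.Categorical(['x', 'y'])})

# Declaring a category-dtype column as numerical raises a ValueError
# from _get_columns_to_encode.
try:
    TabularFeatureValidator(feat_types=['numerical', 'numerical']).fit(X)
except ValueError as e:
    print(e)

# An unknown type name raises a ValueError from _validate_feat_types.
try:
    TabularFeatureValidator(feat_types=['numerical', 'text']).fit(X)
except ValueError as e:
    print(e)

# A length mismatch also raises a ValueError from _validate_feat_types.
try:
    TabularFeatureValidator(feat_types=['numerical']).fit(
        pd.DataFrame({'a': [1.0, 2.0], 'c': [3.0, 4.0]}))
except ValueError as e:
    print(e)

# A consistent specification passes; column 'b' is routed to the encoder.
validator = TabularFeatureValidator(feat_types=['numerical', 'categorical'])
validator.fit(X)
```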
11 changes: 9 additions & 2 deletions autoPyTorch/data/tabular_validator.py
@@ -1,6 +1,6 @@
# -*- encoding: utf-8 -*-
import logging
from typing import Optional, Tuple, Union
from typing import List, Optional, Tuple, Union

import numpy as np

@@ -41,18 +41,24 @@ class TabularInputValidator(BaseInputValidator):
dataset_compression (Optional[DatasetCompressionSpec]):
specifications for dataset compression. For more info check
documentation for `BaseTask.get_dataset`.
feat_types (Optional[List[str]]):
Description of the feature types of the columns.
Accepts `numerical` for integer and float data, and `categorical`
for categorical, string, and bool data.
"""
def __init__(
self,
is_classification: bool = False,
logger_port: Optional[int] = None,
dataset_compression: Optional[DatasetCompressionSpec] = None,
feat_types: Optional[List[str]] = None,
seed: int = 42,
):
self.dataset_compression = dataset_compression
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
self.is_classification = is_classification
self.logger_port = logger_port
self.feat_types = feat_types
self.seed = seed
if self.logger_port is not None:
self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
@@ -63,7 +69,8 @@ def __init__(
self.logger = logging.getLogger('Validation')

self.feature_validator = TabularFeatureValidator(
logger=self.logger)
logger=self.logger,
feat_types=self.feat_types)
self.target_validator = TabularTargetValidator(
is_classification=self.is_classification,
logger=self.logger
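Finally, a sketch of the plumbing this file adds: `TabularInputValidator` stores `feat_types` and forwards it to its `TabularFeatureValidator`. The data and the printed values are illustrative assumptions, not output from the diff.

```python
import pandas as pd

from autoPyTorch.data.tabular_validator import TabularInputValidator

X = pd.DataFrame({'age': [23, 45, 31], 'city': pd.Categorical(['a', 'b', 'a'])})
y = pd.Series([0, 1, 0])

validator = TabularInputValidator(
    is_classification=True,
    feat_types=['numerical', 'categorical'],  # forwarded to TabularFeatureValidator
)
validator.fit(X_train=X, y_train=y)

# After fitting, the feature validator exposes the reordered feature types
# (categorical columns are shifted to the left) and the derived indices.
print(validator.feature_validator.feat_types)           # e.g. ['categorical', 'numerical']
print(validator.feature_validator.categorical_columns)  # e.g. [0]
```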