Skip to content

Commit 6e6b39a

Browse files
committed
[FIX] Enable preprocessing in reg_cocktails (automl#369)
* enable preprocessing and remove is_small_preprocess
* address comments from shuhei and fix precommit checks
* fix tests
* fix precommit checks
* add suggestions from shuhei for astype use
* address speed issue when using object_dtype_mapping
* make code more readable
* improve documentation for base network embedding
1 parent 89e40d2 commit 6e6b39a

34 files changed

+188
-824
lines changed

autoPyTorch/api/tabular_classification.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
1515
from autoPyTorch.datasets.resampling_strategy import (
1616
HoldoutValTypes,
17-
CrossValTypes,
1817
ResamplingStrategies,
1918
)
2019
from autoPyTorch.datasets.tabular_dataset import TabularDataset
@@ -384,13 +383,6 @@ def search(
384383
dataset_name=dataset_name
385384
)
386385

387-
if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)):
388-
raise ValueError(
389-
'Hyperparameter optimization requires a validation split. '
390-
'Expected `self.resampling_strategy` to be either '
391-
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
392-
)
393-
394386
return self._search(
395387
dataset=self.dataset,
396388
optimize_metric=optimize_metric,

autoPyTorch/api/tabular_regression.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
1515
from autoPyTorch.datasets.resampling_strategy import (
1616
HoldoutValTypes,
17-
CrossValTypes,
1817
ResamplingStrategies,
1918
)
2019
from autoPyTorch.datasets.tabular_dataset import TabularDataset
@@ -384,13 +383,6 @@ def search(
384383
dataset_name=dataset_name
385384
)
386385

387-
if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)):
388-
raise ValueError(
389-
'Hyperparameter optimization requires a validation split. '
390-
'Expected `self.resampling_strategy` to be either '
391-
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
392-
)
393-
394386
return self._search(
395387
dataset=self.dataset,
396388
optimize_metric=optimize_metric,

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 80 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,13 @@
1414
from sklearn.exceptions import NotFittedError
1515
from sklearn.impute import SimpleImputer
1616
from sklearn.pipeline import make_pipeline
17-
from sklearn.preprocessing import OneHotEncoder, StandardScaler
17+
from sklearn.preprocessing import OrdinalEncoder
1818

1919
from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES
2020

2121

2222
def _create_column_transformer(
2323
preprocessors: Dict[str, List[BaseEstimator]],
24-
numerical_columns: List[str],
2524
categorical_columns: List[str],
2625
) -> ColumnTransformer:
2726
"""
@@ -32,49 +31,36 @@ def _create_column_transformer(
3231
Args:
3332
preprocessors (Dict[str, List[BaseEstimator]]):
3433
Dictionary containing list of numerical and categorical preprocessors.
35-
numerical_columns (List[str]):
36-
List of names of numerical columns
3734
categorical_columns (List[str]):
3835
List of names of categorical columns
3936
4037
Returns:
4138
ColumnTransformer
4239
"""
4340

44-
numerical_pipeline = 'drop'
45-
categorical_pipeline = 'drop'
46-
if len(numerical_columns) > 0:
47-
numerical_pipeline = make_pipeline(*preprocessors['numerical'])
48-
if len(categorical_columns) > 0:
49-
categorical_pipeline = make_pipeline(*preprocessors['categorical'])
41+
categorical_pipeline = make_pipeline(*preprocessors['categorical'])
5042

5143
return ColumnTransformer([
52-
('categorical_pipeline', categorical_pipeline, categorical_columns),
53-
('numerical_pipeline', numerical_pipeline, numerical_columns)],
54-
remainder='drop'
44+
('categorical_pipeline', categorical_pipeline, categorical_columns)],
45+
remainder='passthrough'
5546
)
5647

5748

5849
def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
5950
"""
6051
This function creates a Dictionary containing a list
6152
of numerical and categorical preprocessors
62-
6353
Returns:
6454
Dict[str, List[BaseEstimator]]
6555
"""
6656
preprocessors: Dict[str, List[BaseEstimator]] = dict()
6757

6858
# Categorical Preprocessors
69-
onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
59+
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
60+
unknown_value=-1)
7061
categorical_imputer = SimpleImputer(strategy='constant', copy=False)
7162

72-
# Numerical Preprocessors
73-
numerical_imputer = SimpleImputer(strategy='median', copy=False)
74-
standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
75-
76-
preprocessors['categorical'] = [categorical_imputer, onehot_encoder]
77-
preprocessors['numerical'] = [numerical_imputer, standard_scaler]
63+
preprocessors['categorical'] = [categorical_imputer, ordinal_encoder]
7864

7965
return preprocessors
8066

@@ -161,31 +147,47 @@ def _fit(
161147

162148
X = cast(pd.DataFrame, X)
163149

164-
self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()])
150+
all_nan_columns = X.columns[X.isna().all()]
151+
for col in all_nan_columns:
152+
X[col] = pd.to_numeric(X[col])
153+
154+
# Handle objects if possible
155+
exist_object_columns = has_object_columns(X.dtypes.values)
156+
if exist_object_columns:
157+
X = self.infer_objects(X)
165158

166-
categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
159+
self.dtypes = [dt.name for dt in X.dtypes] # Also note this change in self.dtypes
160+
self.all_nan_columns = set(all_nan_columns)
167161

168-
self.enc_columns = categorical_columns
162+
self.enc_columns, self.feat_type = self._get_columns_info(X)
169163

170-
preprocessors = get_tabular_preprocessors()
171-
self.column_transformer = _create_column_transformer(
172-
preprocessors=preprocessors,
173-
numerical_columns=numerical_columns,
174-
categorical_columns=categorical_columns,
175-
)
164+
if len(self.enc_columns) > 0:
176165

177-
# Mypy redefinition
178-
assert self.column_transformer is not None
179-
self.column_transformer.fit(X)
166+
preprocessors = get_tabular_preprocessors()
167+
self.column_transformer = _create_column_transformer(
168+
preprocessors=preprocessors,
169+
categorical_columns=self.enc_columns,
170+
)
180171

181-
# The column transformer reorders the feature types
182-
# therefore, we need to change the order of columns as well
183-
# This means categorical columns are shifted to the left
172+
# Mypy redefinition
173+
assert self.column_transformer is not None
174+
self.column_transformer.fit(X)
184175

185-
self.feat_type = sorted(
186-
feat_type,
187-
key=functools.cmp_to_key(self._comparator)
188-
)
176+
# The column transformer moves categorical columns before all numerical columns
177+
# therefore, we need to sort categorical columns so that it complies this change
178+
179+
self.feat_type = sorted(
180+
self.feat_type,
181+
key=functools.cmp_to_key(self._comparator)
182+
)
183+
184+
encoded_categories = self.column_transformer.\
185+
named_transformers_['categorical_pipeline'].\
186+
named_steps['ordinalencoder'].categories_
187+
self.categories = [
188+
list(range(len(cat)))
189+
for cat in encoded_categories
190+
]
189191

190192
# differently to categorical_columns and numerical_columns,
191193
# this saves the index of the column.
@@ -265,6 +267,23 @@ def transform(
265267
if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
266268
X = cast(Type[pd.DataFrame], X)
267269

270+
if self.all_nan_columns is None:
271+
raise ValueError('_fit must be called before calling transform')
272+
273+
for col in list(self.all_nan_columns):
274+
X[col] = np.nan
275+
X[col] = pd.to_numeric(X[col])
276+
277+
if len(self.categorical_columns) > 0:
278+
# when some categorical columns are not all nan in the training set
279+
# but they are all nan in the testing or validation set
280+
# we change those columns to `object` dtype
281+
# to ensure that these columns are changed to appropriate dtype
282+
# in self.infer_objects
283+
all_nan_cat_cols = set(X[self.enc_columns].columns[X[self.enc_columns].isna().all()])
284+
dtype_dict = {col: 'object' for col in self.enc_columns if col in all_nan_cat_cols}
285+
X = X.astype(dtype_dict)
286+
268287
# Check the data here so we catch problems on new test data
269288
self._check_data(X)
270289

@@ -273,11 +292,6 @@ def transform(
273292
# We need to convert the column in test data to
274293
# object otherwise the test column is interpreted as float
275294
if self.column_transformer is not None:
276-
if len(self.categorical_columns) > 0:
277-
categorical_columns = self.column_transformer.transformers_[0][-1]
278-
for column in categorical_columns:
279-
if X[column].isna().all():
280-
X[column] = X[column].astype('object')
281295
X = self.column_transformer.transform(X)
282296

283297
# Sparse related transformations
@@ -361,7 +375,6 @@ def _check_data(
361375
self.column_order = column_order
362376

363377
dtypes = [dtype.name for dtype in X.dtypes]
364-
365378
diff_cols = X.columns[[s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]]
366379
if len(self.dtypes) == 0:
367380
self.dtypes = dtypes
@@ -373,7 +386,7 @@ def _check_data(
373386
def _get_columns_info(
374387
self,
375388
X: pd.DataFrame,
376-
) -> Tuple[List[str], List[str], List[str]]:
389+
) -> Tuple[List[str], List[str]]:
377390
"""
378391
Return the columns to be encoded from a pandas dataframe
379392
@@ -392,15 +405,12 @@ def _get_columns_info(
392405
"""
393406

394407
# Register if a column needs encoding
395-
numerical_columns = []
396408
categorical_columns = []
397409
# Also, register the feature types for the estimator
398410
feat_type = []
399411

400412
# Make sure each column is a valid type
401413
for i, column in enumerate(X.columns):
402-
if self.all_nan_columns is not None and column in self.all_nan_columns:
403-
continue
404414
column_dtype = self.dtypes[i]
405415
err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
406416
"but input column {} has an invalid type `{}`.".format(column, column_dtype)
@@ -411,7 +421,6 @@ def _get_columns_info(
411421
# TypeError: data type not understood in certain pandas types
412422
elif is_numeric_dtype(column_dtype):
413423
feat_type.append('numerical')
414-
numerical_columns.append(column)
415424
elif column_dtype == 'object':
416425
# TODO verify how would this happen when we always convert the object dtypes to category
417426
raise TypeError(
@@ -437,7 +446,7 @@ def _get_columns_info(
437446
"before feeding it to AutoPyTorch.".format(err_msg)
438447
)
439448

440-
return categorical_columns, numerical_columns, feat_type
449+
return categorical_columns, feat_type
441450

442451
def list_to_pandas(
443452
self,
@@ -507,22 +516,26 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
507516
pd.DataFrame
508517
"""
509518
if hasattr(self, 'object_dtype_mapping'):
510-
# Mypy does not process the has attr. This dict is defined below
511-
for key, dtype in self.object_dtype_mapping.items(): # type: ignore[has-type]
512-
# honor the training data types
513-
try:
514-
X[key] = X[key].astype(dtype.name)
515-
except Exception as e:
516-
# Try inference if possible
517-
self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}')
518-
pass
519+
# honor the training data types
520+
try:
521+
# Mypy does not process the has attr.
522+
X = X.astype(self.object_dtype_mapping) # type: ignore[has-type]
523+
except Exception as e:
524+
# Try inference if possible
525+
self.logger.warning(f'Casting the columns to training dtypes ' # type: ignore[has-type]
526+
f'{self.object_dtype_mapping} caused the exception {e}')
527+
pass
519528
else:
520-
# Calling for the first time to infer the categories
521-
X = X.infer_objects()
522-
for column, data_type in zip(X.columns, X.dtypes):
523-
if not is_numeric_dtype(data_type):
524-
X[column] = X[column].astype('category')
525-
529+
if len(self.dtypes) != 0:
530+
# when train data has no object dtype, but test does
531+
# we prioritise the datatype given in training data
532+
dtype_dict = {col: dtype for col, dtype in zip(X.columns, self.dtypes)}
533+
X = X.astype(dtype_dict)
534+
else:
535+
# Calling for the first time to infer the categories
536+
X = X.infer_objects()
537+
dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes) if not is_numeric_dtype(dtype)}
538+
X = X.astype(dtype_dict)
526539
# only numerical attributes and categories
527540
self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
528541

autoPyTorch/datasets/base_dataset.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,6 @@ def __init__(
125125
self.holdout_validators: Dict[str, HoldOutFunc] = {}
126126
self.no_resampling_validators: Dict[str, NoResamplingFunc] = {}
127127
self.random_state = np.random.RandomState(seed=seed)
128-
self.no_resampling_validators: Dict[str, NoResamplingFunc] = {}
129128
self.shuffle = shuffle
130129
self.resampling_strategy = resampling_strategy
131130
self.resampling_strategy_args = resampling_strategy_args
@@ -143,10 +142,6 @@ def __init__(
143142
else:
144143
self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1
145144

146-
# TODO: Look for a criteria to define small enough to preprocess
147-
# False for the regularization cocktails initially
148-
self.is_small_preprocess = False
149-
150145
# Make sure cross validation splits are created once
151146
self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes)
152147
self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes)

autoPyTorch/datasets/resampling_strategy.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,6 @@ def __call__(self, random_state: np.random.RandomState, val_share: float,
3939
...
4040

4141

42-
class NoResamplingFunc(Protocol):
43-
def __call__(self,
44-
random_state: np.random.RandomState,
45-
indices: np.ndarray) -> np.ndarray:
46-
...
47-
48-
4942
class CrossValTypes(IntEnum):
5043
"""The type of cross validation
5144

0 commit comments

Comments
 (0)