14
14
from sklearn .exceptions import NotFittedError
15
15
from sklearn .impute import SimpleImputer
16
16
from sklearn .pipeline import make_pipeline
17
- from sklearn .preprocessing import OneHotEncoder , StandardScaler
17
+ from sklearn .preprocessing import OrdinalEncoder
18
18
19
19
from autoPyTorch .data .base_feature_validator import BaseFeatureValidator , SUPPORTED_FEAT_TYPES
20
20
21
21
22
22
def _create_column_transformer(
    preprocessors: Dict[str, List[BaseEstimator]],
    categorical_columns: List[str],
) -> ColumnTransformer:
    """
    Build the column transformer used to encode categorical columns.

    The estimators listed under the 'categorical' key are chained into a
    single pipeline that is applied to `categorical_columns`. Every other
    column is forwarded untouched (remainder='passthrough').
    The returned transformer is not fitted here.

    Args:
        preprocessors (Dict[str, List[BaseEstimator]]):
            Dictionary of preprocessor lists; only the 'categorical'
            entry is read by this function.
        categorical_columns (List[str]):
            List of names of categorical columns

    Returns:
        ColumnTransformer
    """
    categorical_steps = preprocessors['categorical']
    categorical_pipeline = make_pipeline(*categorical_steps)

    transformers = [
        ('categorical_pipeline', categorical_pipeline, categorical_columns),
    ]
    return ColumnTransformer(transformers, remainder='passthrough')
56
47
57
48
58
49
def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
    """
    Create the preprocessors applied to tabular data.

    Only categorical preprocessors are produced: a constant-strategy
    imputer followed by an ordinal encoder configured to encode
    unknown categories as -1.

    Returns:
        Dict[str, List[BaseEstimator]]:
            Dictionary with a single 'categorical' key holding the
            estimators in the order they should be applied.
    """
    # Categorical Preprocessors
    imputer = SimpleImputer(strategy='constant', copy=False)
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

    return {'categorical': [imputer, encoder]}
80
66
@@ -161,31 +147,47 @@ def _fit(
161
147
162
148
X = cast (pd .DataFrame , X )
163
149
164
- self .all_nan_columns = set ([column for column in X .columns if X [column ].isna ().all ()])
150
+ all_nan_columns = X .columns [X .isna ().all ()]
151
+ for col in all_nan_columns :
152
+ X [col ] = pd .to_numeric (X [col ])
153
+
154
+ # Handle objects if possible
155
+ exist_object_columns = has_object_columns (X .dtypes .values )
156
+ if exist_object_columns :
157
+ X = self .infer_objects (X )
165
158
166
- categorical_columns , numerical_columns , feat_type = self ._get_columns_info (X )
159
+ self .dtypes = [dt .name for dt in X .dtypes ] # Also note this change in self.dtypes
160
+ self .all_nan_columns = set (all_nan_columns )
167
161
168
- self .enc_columns = categorical_columns
162
+ self .enc_columns , self . feat_type = self . _get_columns_info ( X )
169
163
170
- preprocessors = get_tabular_preprocessors ()
171
- self .column_transformer = _create_column_transformer (
172
- preprocessors = preprocessors ,
173
- numerical_columns = numerical_columns ,
174
- categorical_columns = categorical_columns ,
175
- )
164
+ if len (self .enc_columns ) > 0 :
176
165
177
- # Mypy redefinition
178
- assert self .column_transformer is not None
179
- self .column_transformer .fit (X )
166
+ preprocessors = get_tabular_preprocessors ()
167
+ self .column_transformer = _create_column_transformer (
168
+ preprocessors = preprocessors ,
169
+ categorical_columns = self .enc_columns ,
170
+ )
180
171
181
- # The column transformer reorders the feature types
182
- # therefore, we need to change the order of columns as well
183
- # This means categorical columns are shifted to the left
172
+ # Mypy redefinition
173
+ assert self . column_transformer is not None
174
+ self . column_transformer . fit ( X )
184
175
185
- self .feat_type = sorted (
186
- feat_type ,
187
- key = functools .cmp_to_key (self ._comparator )
188
- )
176
+ # The column transformer moves categorical columns before all numerical columns
177
+ # therefore, we need to sort the feature types so that they comply with this change
178
+
179
+ self .feat_type = sorted (
180
+ self .feat_type ,
181
+ key = functools .cmp_to_key (self ._comparator )
182
+ )
183
+
184
+ encoded_categories = self .column_transformer .\
185
+ named_transformers_ ['categorical_pipeline' ].\
186
+ named_steps ['ordinalencoder' ].categories_
187
+ self .categories = [
188
+ list (range (len (cat )))
189
+ for cat in encoded_categories
190
+ ]
189
191
190
192
# differently to categorical_columns and numerical_columns,
191
193
# this saves the index of the column.
@@ -265,6 +267,23 @@ def transform(
265
267
if hasattr (X , "iloc" ) and not scipy .sparse .issparse (X ):
266
268
X = cast (Type [pd .DataFrame ], X )
267
269
270
+ if self .all_nan_columns is None :
271
+ raise ValueError ('_fit must be called before calling transform' )
272
+
273
+ for col in list (self .all_nan_columns ):
274
+ X [col ] = np .nan
275
+ X [col ] = pd .to_numeric (X [col ])
276
+
277
+ if len (self .categorical_columns ) > 0 :
278
+ # when some categorical columns are not all nan in the training set
279
+ # but they are all nan in the testing or validation set
280
+ # we change those columns to `object` dtype
281
+ # to ensure that these columns are changed to appropriate dtype
282
+ # in self.infer_objects
283
+ all_nan_cat_cols = set (X [self .enc_columns ].columns [X [self .enc_columns ].isna ().all ()])
284
+ dtype_dict = {col : 'object' for col in self .enc_columns if col in all_nan_cat_cols }
285
+ X = X .astype (dtype_dict )
286
+
268
287
# Check the data here so we catch problems on new test data
269
288
self ._check_data (X )
270
289
@@ -273,11 +292,6 @@ def transform(
273
292
# We need to convert the column in test data to
274
293
# object otherwise the test column is interpreted as float
275
294
if self .column_transformer is not None :
276
- if len (self .categorical_columns ) > 0 :
277
- categorical_columns = self .column_transformer .transformers_ [0 ][- 1 ]
278
- for column in categorical_columns :
279
- if X [column ].isna ().all ():
280
- X [column ] = X [column ].astype ('object' )
281
295
X = self .column_transformer .transform (X )
282
296
283
297
# Sparse related transformations
@@ -361,7 +375,6 @@ def _check_data(
361
375
self .column_order = column_order
362
376
363
377
dtypes = [dtype .name for dtype in X .dtypes ]
364
-
365
378
diff_cols = X .columns [[s_dtype != dtype for s_dtype , dtype in zip (self .dtypes , dtypes )]]
366
379
if len (self .dtypes ) == 0 :
367
380
self .dtypes = dtypes
@@ -373,7 +386,7 @@ def _check_data(
373
386
def _get_columns_info (
374
387
self ,
375
388
X : pd .DataFrame ,
376
- ) -> Tuple [List [str ], List [str ], List [ str ] ]:
389
+ ) -> Tuple [List [str ], List [str ]]:
377
390
"""
378
391
Return the columns to be encoded from a pandas dataframe
379
392
@@ -392,15 +405,12 @@ def _get_columns_info(
392
405
"""
393
406
394
407
# Register if a column needs encoding
395
- numerical_columns = []
396
408
categorical_columns = []
397
409
# Also, register the feature types for the estimator
398
410
feat_type = []
399
411
400
412
# Make sure each column is a valid type
401
413
for i , column in enumerate (X .columns ):
402
- if self .all_nan_columns is not None and column in self .all_nan_columns :
403
- continue
404
414
column_dtype = self .dtypes [i ]
405
415
err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
406
416
"but input column {} has an invalid type `{}`." .format (column , column_dtype )
@@ -411,7 +421,6 @@ def _get_columns_info(
411
421
# TypeError: data type not understood in certain pandas types
412
422
elif is_numeric_dtype (column_dtype ):
413
423
feat_type .append ('numerical' )
414
- numerical_columns .append (column )
415
424
elif column_dtype == 'object' :
416
425
# TODO verify how would this happen when we always convert the object dtypes to category
417
426
raise TypeError (
@@ -437,7 +446,7 @@ def _get_columns_info(
437
446
"before feeding it to AutoPyTorch." .format (err_msg )
438
447
)
439
448
440
- return categorical_columns , numerical_columns , feat_type
449
+ return categorical_columns , feat_type
441
450
442
451
def list_to_pandas (
443
452
self ,
@@ -507,22 +516,26 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
507
516
pd.DataFrame
508
517
"""
509
518
if hasattr (self , 'object_dtype_mapping' ):
510
- # Mypy does not process the has attr. This dict is defined below
511
- for key , dtype in self . object_dtype_mapping . items (): # type: ignore[has-type]
512
- # honor the training data types
513
- try :
514
- X [ key ] = X [ key ]. astype ( dtype . name )
515
- except Exception as e :
516
- # Try inference if possible
517
- self . logger . warning ( f'Casting the column { key } to { dtype } caused the exception { e } ' )
518
- pass
519
+ # honor the training data types
520
+ try :
521
+ # Mypy does not process the has attr.
522
+ X = X . astype ( self . object_dtype_mapping ) # type: ignore[has-type]
523
+ except Exception as e :
524
+ # Try inference if possible
525
+ self . logger . warning ( f'Casting the columns to training dtypes ' # type: ignore[has-type]
526
+ f' { self . object_dtype_mapping } caused the exception { e } ' )
527
+ pass
519
528
else :
520
- # Calling for the first time to infer the categories
521
- X = X .infer_objects ()
522
- for column , data_type in zip (X .columns , X .dtypes ):
523
- if not is_numeric_dtype (data_type ):
524
- X [column ] = X [column ].astype ('category' )
525
-
529
+ if len (self .dtypes ) != 0 :
530
+ # when train data has no object dtype, but test does
531
+ # we prioritise the datatype given in training data
532
+ dtype_dict = {col : dtype for col , dtype in zip (X .columns , self .dtypes )}
533
+ X = X .astype (dtype_dict )
534
+ else :
535
+ # Calling for the first time to infer the categories
536
+ X = X .infer_objects ()
537
+ dtype_dict = {col : 'category' for col , dtype in zip (X .columns , X .dtypes ) if not is_numeric_dtype (dtype )}
538
+ X = X .astype (dtype_dict )
526
539
# only numerical attributes and categories
527
540
self .object_dtype_mapping = {column : data_type for column , data_type in zip (X .columns , X .dtypes )}
528
541
0 commit comments