@@ -147,9 +147,13 @@ def _fit(
147
147
# with nan values.
148
148
# Columns that are completely made of NaN values are provided to the pipeline
149
149
# so that later stages decide how to handle them
150
+
151
+ # Clear whatever null column markers we had previously
152
+ self .null_columns .clear ()
150
153
if np .any (pd .isnull (X )):
151
154
for column in X .columns :
152
155
if X [column ].isna ().all ():
156
+ self .null_columns .add (column )
153
157
X [column ] = pd .to_numeric (X [column ])
154
158
# Also note this change in self.dtypes
155
159
if len (self .dtypes ) != 0 :
@@ -158,9 +162,8 @@ def _fit(
158
162
if not X .select_dtypes (include = 'object' ).empty :
159
163
X = self .infer_objects (X )
160
164
161
- self .transformed_columns , self .feat_type = self ._get_columns_to_encode (X )
162
-
163
- assert self .feat_type is not None
165
+ self ._check_data (X )
166
+ self .enc_columns , self .feat_type = self ._get_columns_to_encode (X )
164
167
165
168
if len (self .transformed_columns ) > 0 :
166
169
@@ -230,29 +233,37 @@ def transform(
230
233
X = self .numpy_array_to_pandas (X )
231
234
232
235
if hasattr (X , "iloc" ) and not scipy .sparse .issparse (X ):
233
- if np .any (pd .isnull (X )):
234
- for column in X .columns :
235
- if X [column ].isna ().all ():
236
- X [column ] = pd .to_numeric (X [column ])
236
+ X = typing .cast (pd .DataFrame , X )
237
+ # If we had null columns in our fit call and we made them numeric, then:
238
+ # - If the columns are null even in transform, apply the same procedure.
239
+ # - Otherwise, substitute the values with np.NaN and then make the columns numeric.
240
+ # If the column is null here, but it was not in fit, it does not matter.
241
+ for column in self .null_columns :
242
+ # The column is not null, make it null since it was null in fit.
243
+ if not X [column ].isna ().all ():
244
+ X [column ] = np .NaN
245
+ X [column ] = pd .to_numeric (X [column ])
246
+
247
+ # For the test set, columns that contain only null values
248
+ # will probably have a numeric type. If these columns were not
249
+ # entirely null in the train set, they should be converted
250
+ # to the type that they had during fitting.
251
+ for column in X .columns :
252
+ if X [column ].isna ().all ():
253
+ X [column ] = X [column ].astype (self .dtypes [list (X .columns ).index (column )])
237
254
238
255
# Also remove the object dtype for new data
239
256
if not X .select_dtypes (include = 'object' ).empty :
240
257
X = self .infer_objects (X )
241
258
242
259
# Check the data here so we catch problems on new test data
243
260
self ._check_data (X )
261
+ # We also need to impute NaNs in the categorical columns during
262
+ # transform, in case test data is provided
263
+ X = self .impute_nan_in_categories (X )
244
264
245
- # Pandas related transformations
246
- if hasattr (X , "iloc" ) and self .column_transformer is not None :
247
- if np .any (pd .isnull (X )):
248
- # After above check it means that if there is a NaN
249
- # the whole column must be NaN
250
- # Make sure it is numerical and let the pipeline handle it
251
- for column in X .columns :
252
- if X [column ].isna ().all ():
253
- X [column ] = pd .to_numeric (X [column ])
254
-
255
- X = self .column_transformer .transform (X )
265
+ if self .encoder is not None :
266
+ X = self .encoder .transform (X )
256
267
257
268
# Sparse related transformations
258
269
# Not all sparse format support index sorting
@@ -478,7 +489,7 @@ def numpy_array_to_pandas(
478
489
Returns:
479
490
pd.DataFrame
480
491
"""
481
- return pd .DataFrame (X ).infer_objects (). convert_dtypes ()
492
+ return pd .DataFrame (X ).convert_dtypes ()
482
493
483
494
def infer_objects (self , X : pd .DataFrame ) -> pd .DataFrame :
484
495
"""
@@ -496,18 +507,13 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
496
507
if hasattr (self , 'object_dtype_mapping' ):
497
508
# Mypy does not process the has attr. This dict is defined below
498
509
for key , dtype in self .object_dtype_mapping .items (): # type: ignore[has-type]
499
- if 'int' in dtype .name :
500
- # In the case train data was interpreted as int
501
- # and test data was interpreted as float, because of 0.0
502
- # for example, honor training data
503
- X [key ] = X [key ].applymap (np .int64 )
504
- else :
505
- try :
506
- X [key ] = X [key ].astype (dtype .name )
507
- except Exception as e :
508
- # Try inference if possible
509
- self .logger .warning (f"Tried to cast column { key } to { dtype } caused { e } " )
510
- pass
510
+ # honor the training data types
511
+ try :
512
+ X [key ] = X [key ].astype (dtype .name )
513
+ except Exception as e :
514
+ # Try inference if possible
515
+ self .logger .warning (f"Tried to cast column { key } to { dtype } caused { e } " )
516
+ pass
511
517
else :
512
518
X = X .infer_objects ()
513
519
for column in X .columns :
0 commit comments