Skip to content

Commit be2d212

Browse files
ArlindKadra authored and ravinkohli committed
Fixing issues with imbalanced datasets (#197)
* adding missing method from base_feature_validator * First try at a fix, removing redundant code * Fix bug * Updating unit test typo, fixing bug where the data type was not checked because X was a numpy array at the time of checking * Fixing flake 8 failing * Bug fix, implementation update for imbalanced datasets and unit tests to check the implementation * flake8 fix * Bug fix * Making the conversion to dataframe in the unit tests consistent with what happens at the validator, so the types do not change * flake8 fix * Addressing Ravin's comments
1 parent 4af3932 commit be2d212

File tree

4 files changed

+118
-36
lines changed

4 files changed

+118
-36
lines changed

autoPyTorch/data/base_feature_validator.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,20 @@ def _fit(
122122
"""
123123
raise NotImplementedError()
124124

125+
def _check_data(
126+
self,
127+
X: SUPPORTED_FEAT_TYPES,
128+
) -> None:
129+
"""
130+
Feature dimensionality and data type checks
131+
132+
Arguments:
133+
X (SUPPORTED_FEAT_TYPES):
134+
A set of features that are going to be validated (type and dimensionality
135+
checks) and a encoder fitted in the case the data needs encoding
136+
"""
137+
raise NotImplementedError()
138+
125139
def transform(
126140
self,
127141
X: SUPPORTED_FEAT_TYPES,

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -147,9 +147,13 @@ def _fit(
147147
# with nan values.
148148
# Columns that are completely made of NaN values are provided to the pipeline
149149
# so that later stages decide how to handle them
150+
151+
# Clear whatever null column markers we had previously
152+
self.null_columns.clear()
150153
if np.any(pd.isnull(X)):
151154
for column in X.columns:
152155
if X[column].isna().all():
156+
self.null_columns.add(column)
153157
X[column] = pd.to_numeric(X[column])
154158
# Also note this change in self.dtypes
155159
if len(self.dtypes) != 0:
@@ -158,9 +162,8 @@ def _fit(
158162
if not X.select_dtypes(include='object').empty:
159163
X = self.infer_objects(X)
160164

161-
self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
162-
163-
assert self.feat_type is not None
165+
self._check_data(X)
166+
self.enc_columns, self.feat_type = self._get_columns_to_encode(X)
164167

165168
if len(self.transformed_columns) > 0:
166169

@@ -230,29 +233,37 @@ def transform(
230233
X = self.numpy_array_to_pandas(X)
231234

232235
if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
233-
if np.any(pd.isnull(X)):
234-
for column in X.columns:
235-
if X[column].isna().all():
236-
X[column] = pd.to_numeric(X[column])
236+
X = typing.cast(pd.DataFrame, X)
237+
# If we had null columns in our fit call and we made them numeric, then:
238+
# - If the columns are null even in transform, apply the same procedure.
239+
# - Otherwise, substitute the values with np.NaN and then make the columns numeric.
240+
# If the column is null here, but it was not in fit, it does not matter.
241+
for column in self.null_columns:
242+
# The column is not null, make it null since it was null in fit.
243+
if not X[column].isna().all():
244+
X[column] = np.NaN
245+
X[column] = pd.to_numeric(X[column])
246+
247+
# for the test set, if we have columns with only null values
248+
# they will probably have a numeric type. If these columns were not
249+
# with only null values in the train set, they should be converted
250+
# to the type that they had during fitting.
251+
for column in X.columns:
252+
if X[column].isna().all():
253+
X[column] = X[column].astype(self.dtypes[list(X.columns).index(column)])
237254

238255
# Also remove the object dtype for new data
239256
if not X.select_dtypes(include='object').empty:
240257
X = self.infer_objects(X)
241258

242259
# Check the data here so we catch problems on new test data
243260
self._check_data(X)
261+
# We also need to fillna on the transformation
262+
# in case test data is provided
263+
X = self.impute_nan_in_categories(X)
244264

245-
# Pandas related transformations
246-
if hasattr(X, "iloc") and self.column_transformer is not None:
247-
if np.any(pd.isnull(X)):
248-
# After above check it means that if there is a NaN
249-
# the whole column must be NaN
250-
# Make sure it is numerical and let the pipeline handle it
251-
for column in X.columns:
252-
if X[column].isna().all():
253-
X[column] = pd.to_numeric(X[column])
254-
255-
X = self.column_transformer.transform(X)
265+
if self.encoder is not None:
266+
X = self.encoder.transform(X)
256267

257268
# Sparse related transformations
258269
# Not all sparse format support index sorting
@@ -478,7 +489,7 @@ def numpy_array_to_pandas(
478489
Returns:
479490
pd.DataFrame
480491
"""
481-
return pd.DataFrame(X).infer_objects().convert_dtypes()
492+
return pd.DataFrame(X).convert_dtypes()
482493

483494
def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
484495
"""
@@ -496,18 +507,13 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
496507
if hasattr(self, 'object_dtype_mapping'):
497508
# Mypy does not process the has attr. This dict is defined below
498509
for key, dtype in self.object_dtype_mapping.items(): # type: ignore[has-type]
499-
if 'int' in dtype.name:
500-
# In the case train data was interpreted as int
501-
# and test data was interpreted as float, because of 0.0
502-
# for example, honor training data
503-
X[key] = X[key].applymap(np.int64)
504-
else:
505-
try:
506-
X[key] = X[key].astype(dtype.name)
507-
except Exception as e:
508-
# Try inference if possible
509-
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
510-
pass
510+
# honor the training data types
511+
try:
512+
X[key] = X[key].astype(dtype.name)
513+
except Exception as e:
514+
# Try inference if possible
515+
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
516+
pass
511517
else:
512518
X = X.infer_objects()
513519
for column in X.columns:

test/test_data/test_feature_validator.py

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -286,9 +286,9 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
286286
if isinstance(input_data_featuretest, pd.DataFrame):
287287
pytest.skip("Column order change in pandas is not supported")
288288
elif isinstance(input_data_featuretest, np.ndarray):
289-
complementary_type = pd.DataFrame(input_data_featuretest)
289+
complementary_type = validator.numpy_array_to_pandas(input_data_featuretest)
290290
elif isinstance(input_data_featuretest, list):
291-
complementary_type = pd.DataFrame(input_data_featuretest)
291+
complementary_type, _ = validator.list_to_dataframe(input_data_featuretest)
292292
elif sparse.issparse(input_data_featuretest):
293293
complementary_type = sparse.csr_matrix(input_data_featuretest.todense())
294294
else:
@@ -478,8 +478,11 @@ def test_unknown_encode_value():
478478
)
479479
@pytest.mark.parametrize('train_data_type', ('numpy', 'pandas', 'list'))
480480
@pytest.mark.parametrize('test_data_type', ('numpy', 'pandas', 'list'))
481-
def test_featurevalidator_new_data_after_fit(openml_id,
482-
train_data_type, test_data_type):
481+
def test_feature_validator_new_data_after_fit(
482+
openml_id,
483+
train_data_type,
484+
test_data_type,
485+
):
483486

484487
# List is currently not supported as infer_objects
485488
# cast list objects to type objects
@@ -557,3 +560,63 @@ def test_comparator():
557560
key=functools.cmp_to_key(validator._comparator)
558561
)
559562
assert ans == feat_type
563+
def test_feature_validator_imbalanced_data():
564+
565+
# Null columns in the train split but not necessarily in the test split
566+
train_features = {
567+
'A': [np.NaN, np.NaN, np.NaN],
568+
'B': [1, 2, 3],
569+
'C': [np.NaN, np.NaN, np.NaN],
570+
'D': [np.NaN, np.NaN, np.NaN],
571+
}
572+
test_features = {
573+
'A': [3, 4, 5],
574+
'B': [6, 5, 7],
575+
'C': [np.NaN, np.NaN, np.NaN],
576+
'D': ['Blue', np.NaN, np.NaN],
577+
}
578+
579+
X_train = pd.DataFrame.from_dict(train_features)
580+
X_test = pd.DataFrame.from_dict(test_features)
581+
validator = TabularFeatureValidator()
582+
validator.fit(X_train)
583+
584+
train_feature_types = copy.deepcopy(validator.feat_type)
585+
assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical']
586+
# validator will throw an error if the column types are not the same
587+
transformed_X_test = validator.transform(X_test)
588+
transformed_X_test = pd.DataFrame(transformed_X_test)
589+
null_columns = []
590+
for column in transformed_X_test.columns:
591+
if transformed_X_test[column].isna().all():
592+
null_columns.append(column)
593+
assert null_columns == [0, 2, 3]
594+
595+
# Columns with not all null values in the train split and
596+
# completely null on the test split.
597+
train_features = {
598+
'A': [np.NaN, np.NaN, 4],
599+
'B': [1, 2, 3],
600+
'C': ['Blue', np.NaN, np.NaN],
601+
}
602+
test_features = {
603+
'A': [np.NaN, np.NaN, np.NaN],
604+
'B': [6, 5, 7],
605+
'C': [np.NaN, np.NaN, np.NaN],
606+
}
607+
608+
X_train = pd.DataFrame.from_dict(train_features)
609+
X_test = pd.DataFrame.from_dict(test_features)
610+
validator = TabularFeatureValidator()
611+
validator.fit(X_train)
612+
train_feature_types = copy.deepcopy(validator.feat_type)
613+
assert train_feature_types == ['categorical', 'numerical', 'numerical']
614+
615+
transformed_X_test = validator.transform(X_test)
616+
transformed_X_test = pd.DataFrame(transformed_X_test)
617+
null_columns = []
618+
for column in transformed_X_test.columns:
619+
if transformed_X_test[column].isna().all():
620+
null_columns.append(column)
621+
622+
assert null_columns == [1]

test/test_data/test_validation.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ def test_data_validation_for_classification(openmlid, as_frame):
3131
x, y, test_size=0.33, random_state=0)
3232

3333
validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
34-
3534
X_train_t, y_train_t = validator.transform(X_train, y_train)
3635
assert np.shape(X_train) == np.shape(X_train_t)
3736

0 commit comments

Comments (0)