Skip to content

Commit 0cfaf4c

Browse files
ArlindKadra authored and ravinkohli committed
Fixing issues with imbalanced datasets (#197)

* adding missing method from base_feature_validator
* First try at a fix, removing redundant code
* Fix bug
* Updating unit test typo, fixing bug where the data type was not checked because X was a numpy array at the time of checking
* Fixing flake 8 failing
* Bug fix, implementation update for imbalanced datasets and unit tests to check the implementation
* flake8 fix
* Bug fix
* Making the conversion to dataframe in the unit tests consistent with what happens at the validator, so the types do not change
* flake8 fix
* Addressing Ravin's comments
1 parent aa3f9f2 commit 0cfaf4c

7 files changed

+121
-53
lines changed

autoPyTorch/api/tabular_classification.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,17 +91,9 @@ def __init__(
9191
output_directory: Optional[str] = None,
9292
delete_tmp_folder_after_terminate: bool = True,
9393
delete_output_folder_after_terminate: bool = True,
94-
<<<<<<< HEAD
9594
include_components: Optional[Dict[str, Any]] = None,
9695
exclude_components: Optional[Dict[str, Any]] = None,
9796
resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
98-
=======
99-
include_components: Optional[Dict] = None,
100-
exclude_components: Optional[Dict] = None,
101-
resampling_strategy: Union[CrossValTypes,
102-
HoldoutValTypes,
103-
NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
104-
>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
10597
resampling_strategy_args: Optional[Dict[str, Any]] = None,
10698
backend: Optional[Backend] = None,
10799
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None

autoPyTorch/api/tabular_regression.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -92,17 +92,9 @@ def __init__(
9292
output_directory: Optional[str] = None,
9393
delete_tmp_folder_after_terminate: bool = True,
9494
delete_output_folder_after_terminate: bool = True,
95-
<<<<<<< HEAD
9695
include_components: Optional[Dict[str, Any]] = None,
9796
exclude_components: Optional[Dict[str, Any]] = None,
9897
resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
99-
=======
100-
include_components: Optional[Dict] = None,
101-
exclude_components: Optional[Dict] = None,
102-
resampling_strategy:Union[CrossValTypes,
103-
HoldoutValTypes,
104-
NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
105-
>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
10698
resampling_strategy_args: Optional[Dict[str, Any]] = None,
10799
backend: Optional[Backend] = None,
108100
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None

autoPyTorch/data/base_feature_validator.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,20 @@ def _fit(
110110
"""
111111
raise NotImplementedError()
112112

113+
def _check_data(
114+
self,
115+
X: SupportedFeatTypes,
116+
) -> None:
117+
"""
118+
Feature dimensionality and data type checks
119+
120+
Arguments:
121+
X (SupportedFeatTypes):
122+
A set of features that are going to be validated (type and dimensionality
123+
checks) and a encoder fitted in the case the data needs encoding
124+
"""
125+
raise NotImplementedError()
126+
113127
def transform(
114128
self,
115129
X: SupportedFeatTypes,

autoPyTorch/data/base_target_validator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def _fit(
130130

131131
def transform(
132132
self,
133-
y: Union[SupportedTargetTypes],
133+
y: SupportedTargetTypes,
134134
) -> np.ndarray:
135135
"""
136136
Args:

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,13 @@ def _fit(
163163
# with nan values.
164164
# Columns that are completely made of NaN values are provided to the pipeline
165165
# so that later stages decide how to handle them
166+
167+
# Clear whatever null column markers we had previously
168+
self.null_columns.clear()
166169
if np.any(pd.isnull(X)):
167170
for column in X.columns:
168171
if X[column].isna().all():
172+
self.null_columns.add(column)
169173
X[column] = pd.to_numeric(X[column])
170174
# Also note this change in self.dtypes
171175
if len(self.dtypes) != 0:
@@ -174,9 +178,8 @@ def _fit(
174178
if not X.select_dtypes(include='object').empty:
175179
X = self.infer_objects(X)
176180

177-
self.transformed_columns, self.feat_type = self._get_columns_to_encode(X)
178-
179-
assert self.feat_type is not None
181+
self._check_data(X)
182+
self.enc_columns, self.feat_type = self._get_columns_to_encode(X)
180183

181184
if len(self.transformed_columns) > 0:
182185

@@ -246,29 +249,37 @@ def transform(
246249
X = self.numpy_array_to_pandas(X)
247250

248251
if hasattr(X, "iloc") and not issparse(X):
249-
if np.any(pd.isnull(X)):
250-
for column in X.columns:
251-
if X[column].isna().all():
252-
X[column] = pd.to_numeric(X[column])
252+
X = cast(pd.DataFrame, X)
253+
# If we had null columns in our fit call and we made them numeric, then:
254+
# - If the columns are null even in transform, apply the same procedure.
255+
# - Otherwise, substitute the values with np.NaN and then make the columns numeric.
256+
# If the column is null here, but it was not in fit, it does not matter.
257+
for column in self.null_columns:
258+
# The column is not null, make it null since it was null in fit.
259+
if not X[column].isna().all():
260+
X[column] = np.NaN
261+
X[column] = pd.to_numeric(X[column])
262+
263+
# for the test set, if we have columns with only null values
264+
# they will probably have a numeric type. If these columns were not
265+
# with only null values in the train set, they should be converted
266+
# to the type that they had during fitting.
267+
for column in X.columns:
268+
if X[column].isna().all():
269+
X[column] = X[column].astype(self.dtypes[list(X.columns).index(column)])
253270

254271
# Also remove the object dtype for new data
255272
if not X.select_dtypes(include='object').empty:
256273
X = self.infer_objects(X)
257274

258275
# Check the data here so we catch problems on new test data
259276
self._check_data(X)
277+
# We also need to fillna on the transformation
278+
# in case test data is provided
279+
X = self.impute_nan_in_categories(X)
260280

261-
# Pandas related transformations
262-
if hasattr(X, "iloc") and self.column_transformer is not None:
263-
if np.any(pd.isnull(X)):
264-
# After above check it means that if there is a NaN
265-
# the whole column must be NaN
266-
# Make sure it is numerical and let the pipeline handle it
267-
for column in X.columns:
268-
if X[column].isna().all():
269-
X[column] = pd.to_numeric(X[column])
270-
271-
X = self.column_transformer.transform(X)
281+
if self.encoder is not None:
282+
X = self.encoder.transform(X)
272283

273284
# Sparse related transformations
274285
# Not all sparse format support index sorting
@@ -525,7 +536,7 @@ def numpy_array_to_pandas(
525536
Returns:
526537
pd.DataFrame
527538
"""
528-
return pd.DataFrame(X).infer_objects().convert_dtypes()
539+
return pd.DataFrame(X).convert_dtypes()
529540

530541
def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
531542
"""
@@ -543,18 +554,13 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
543554
if hasattr(self, 'object_dtype_mapping'):
544555
# Mypy does not process the has attr. This dict is defined below
545556
for key, dtype in self.object_dtype_mapping.items(): # type: ignore[has-type]
546-
if 'int' in dtype.name:
547-
# In the case train data was interpreted as int
548-
# and test data was interpreted as float, because of 0.0
549-
# for example, honor training data
550-
X[key] = X[key].applymap(np.int64)
551-
else:
552-
try:
553-
X[key] = X[key].astype(dtype.name)
554-
except Exception as e:
555-
# Try inference if possible
556-
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
557-
pass
557+
# honor the training data types
558+
try:
559+
X[key] = X[key].astype(dtype.name)
560+
except Exception as e:
561+
# Try inference if possible
562+
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
563+
pass
558564
else:
559565
X = X.infer_objects()
560566
for column in X.columns:

test/test_data/test_feature_validator.py

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -287,9 +287,9 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
287287
if isinstance(input_data_featuretest, pd.DataFrame):
288288
pytest.skip("Column order change in pandas is not supported")
289289
elif isinstance(input_data_featuretest, np.ndarray):
290-
complementary_type = pd.DataFrame(input_data_featuretest)
290+
complementary_type = validator.numpy_array_to_pandas(input_data_featuretest)
291291
elif isinstance(input_data_featuretest, list):
292-
complementary_type = pd.DataFrame(input_data_featuretest)
292+
complementary_type, _ = validator.list_to_dataframe(input_data_featuretest)
293293
elif sparse.issparse(input_data_featuretest):
294294
complementary_type = sparse.csr_matrix(input_data_featuretest.todense())
295295
else:
@@ -479,8 +479,11 @@ def test_unknown_encode_value():
479479
)
480480
@pytest.mark.parametrize('train_data_type', ('numpy', 'pandas', 'list'))
481481
@pytest.mark.parametrize('test_data_type', ('numpy', 'pandas', 'list'))
482-
def test_featurevalidator_new_data_after_fit(openml_id,
483-
train_data_type, test_data_type):
482+
def test_feature_validator_new_data_after_fit(
483+
openml_id,
484+
train_data_type,
485+
test_data_type,
486+
):
484487

485488
# List is currently not supported as infer_objects
486489
# cast list objects to type objects
@@ -602,3 +605,65 @@ def test_featurevalidator_reduce_precision(input_data_featuretest):
602605
else:
603606
assert transformed_X_train.dtype == transformed_X_test.dtype
604607
assert transformed_X_test.dtype == validator._reduced_dtype
608+
609+
610+
def test_feature_validator_imbalanced_data():
611+
612+
# Null columns in the train split but not necessarily in the test split
613+
train_features = {
614+
'A': [np.NaN, np.NaN, np.NaN],
615+
'B': [1, 2, 3],
616+
'C': [np.NaN, np.NaN, np.NaN],
617+
'D': [np.NaN, np.NaN, np.NaN],
618+
}
619+
test_features = {
620+
'A': [3, 4, 5],
621+
'B': [6, 5, 7],
622+
'C': [np.NaN, np.NaN, np.NaN],
623+
'D': ['Blue', np.NaN, np.NaN],
624+
}
625+
626+
X_train = pd.DataFrame.from_dict(train_features)
627+
X_test = pd.DataFrame.from_dict(test_features)
628+
validator = TabularFeatureValidator()
629+
validator.fit(X_train)
630+
631+
train_feature_types = copy.deepcopy(validator.feat_type)
632+
assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical']
633+
# validator will throw an error if the column types are not the same
634+
transformed_X_test = validator.transform(X_test)
635+
transformed_X_test = pd.DataFrame(transformed_X_test)
636+
null_columns = []
637+
for column in transformed_X_test.columns:
638+
if transformed_X_test[column].isna().all():
639+
null_columns.append(column)
640+
assert null_columns == [0, 2, 3]
641+
642+
# Columns with not all null values in the train split and
643+
# completely null on the test split.
644+
train_features = {
645+
'A': [np.NaN, np.NaN, 4],
646+
'B': [1, 2, 3],
647+
'C': ['Blue', np.NaN, np.NaN],
648+
}
649+
test_features = {
650+
'A': [np.NaN, np.NaN, np.NaN],
651+
'B': [6, 5, 7],
652+
'C': [np.NaN, np.NaN, np.NaN],
653+
}
654+
655+
X_train = pd.DataFrame.from_dict(train_features)
656+
X_test = pd.DataFrame.from_dict(test_features)
657+
validator = TabularFeatureValidator()
658+
validator.fit(X_train)
659+
train_feature_types = copy.deepcopy(validator.feat_type)
660+
assert train_feature_types == ['categorical', 'numerical', 'numerical']
661+
662+
transformed_X_test = validator.transform(X_test)
663+
transformed_X_test = pd.DataFrame(transformed_X_test)
664+
null_columns = []
665+
for column in transformed_X_test.columns:
666+
if transformed_X_test[column].isna().all():
667+
null_columns.append(column)
668+
669+
assert null_columns == [1]

test/test_data/test_validation.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ def test_data_validation_for_classification(openmlid, as_frame):
3131
x, y, test_size=0.33, random_state=0)
3232

3333
validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
34-
3534
X_train_t, y_train_t = validator.transform(X_train, y_train)
3635
assert np.shape(X_train) == np.shape(X_train_t)
3736

0 commit comments

Comments
 (0)