-
Notifications
You must be signed in to change notification settings - Fork 300
[FIX] Passing checks #298
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[FIX] Passing checks #298
Changes from all commits
16cfb55
90ebe72
f965094
f7b4d70
3181ed1
4d69ff5
e8d1eb3
243573b
dad3a4b
6d94893
e049177
d079e04
245af31
0a18ab8
ec62e2e
f528698
b399dac
42ca211
296cc16
b4314f9
fefbdcf
042f478
aaefc83
8ebbc5e
e3c43ef
3564fa1
10aea66
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -204,7 +204,6 @@ def test_featurevalidator_supported_types(input_data_featuretest): | |||||
assert sparse.issparse(transformed_X) | ||||||
else: | ||||||
assert isinstance(transformed_X, np.ndarray) | ||||||
assert np.shape(input_data_featuretest) == np.shape(transformed_X) | ||||||
assert np.issubdtype(transformed_X.dtype, np.number) | ||||||
assert validator._is_fitted | ||||||
|
||||||
|
@@ -237,9 +236,10 @@ def test_featurevalidator_categorical_nan(input_data_featuretest): | |||||
validator.fit(input_data_featuretest) | ||||||
transformed_X = validator.transform(input_data_featuretest) | ||||||
assert any(pd.isna(input_data_featuretest)) | ||||||
assert any((-1 in categories) or ('-1' in categories) or ('Missing!' in categories) for categories in | ||||||
validator.encoder.named_transformers_['encoder'].categories_) | ||||||
assert np.shape(input_data_featuretest) == np.shape(transformed_X) | ||||||
ravinkohli marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
categories_ = validator.column_transformer.\ | ||||||
named_transformers_['categorical_pipeline'].named_steps['onehotencoder'].categories_ | ||||||
assert any(('0' in categories) or (0 in categories) or ('missing_value' in categories) for categories in | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These are the unique category values for the one-hot encoder, right? What would be the case where they are 0? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It will be 0 when the column is categorical but the dtype of the column is int. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. According to this: Auto-PyTorch/autoPyTorch/data/tabular_feature_validator.py Lines 380 to 381 in 10aea66
It will be a categorical column only when it has a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does it fail if you run it without the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
not string, but category. And category can be string or int. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It works fine without |
||||||
categories_) | ||||||
assert np.issubdtype(transformed_X.dtype, np.number) | ||||||
assert validator._is_fitted | ||||||
assert isinstance(transformed_X, np.ndarray) | ||||||
|
@@ -292,7 +292,6 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest): | |||||
else: | ||||||
raise ValueError(type(input_data_featuretest)) | ||||||
transformed_X = validator.transform(complementary_type) | ||||||
assert np.shape(input_data_featuretest) == np.shape(transformed_X) | ||||||
assert np.issubdtype(transformed_X.dtype, np.number) | ||||||
assert validator._is_fitted | ||||||
|
||||||
|
@@ -436,36 +435,29 @@ def test_features_unsupported_calls_are_raised(): | |||||
expected | ||||||
""" | ||||||
validator = TabularFeatureValidator() | ||||||
with pytest.raises(ValueError, match=r"AutoPyTorch does not support time"): | ||||||
with pytest.raises(TypeError, match=r".*?Convert the time information to a numerical value"): | ||||||
validator.fit( | ||||||
pd.DataFrame({'datetime': [pd.Timestamp('20180310')]}) | ||||||
) | ||||||
validator = TabularFeatureValidator() | ||||||
with pytest.raises(ValueError, match=r"AutoPyTorch only supports.*yet, the provided input"): | ||||||
validator.fit({'input1': 1, 'input2': 2}) | ||||||
with pytest.raises(ValueError, match=r"has unsupported dtype string"): | ||||||
validator = TabularFeatureValidator() | ||||||
with pytest.raises(TypeError, match=r".*?but input column A has an invalid type `string`.*"): | ||||||
validator.fit(pd.DataFrame([{'A': 1, 'B': 2}], dtype='string')) | ||||||
validator = TabularFeatureValidator() | ||||||
with pytest.raises(ValueError, match=r"The feature dimensionality of the train and test"): | ||||||
validator.fit(X_train=np.array([[1, 2, 3], [4, 5, 6]]), | ||||||
X_test=np.array([[1, 2, 3, 4], [4, 5, 6, 7]]), | ||||||
) | ||||||
validator = TabularFeatureValidator() | ||||||
with pytest.raises(ValueError, match=r"Cannot call transform on a validator that is not fit"): | ||||||
validator.transform(np.array([[1, 2, 3], [4, 5, 6]])) | ||||||
|
||||||
|
||||||
@pytest.mark.parametrize( | ||||||
'input_data_featuretest', | ||||||
( | ||||||
'numpy_numericalonly_nonan', | ||||||
'numpy_numericalonly_nan', | ||||||
'pandas_numericalonly_nonan', | ||||||
'pandas_numericalonly_nan', | ||||||
'list_numericalonly_nonan', | ||||||
'list_numericalonly_nan', | ||||||
# Category in numpy is handled via feat_type | ||||||
'numpy_categoricalonly_nonan', | ||||||
'numpy_mixed_nonan', | ||||||
'numpy_categoricalonly_nan', | ||||||
'numpy_mixed_nan', | ||||||
'sparse_bsr_nonan', | ||||||
'sparse_bsr_nan', | ||||||
'sparse_coo_nonan', | ||||||
|
@@ -483,14 +475,14 @@ def test_features_unsupported_calls_are_raised(): | |||||
), | ||||||
indirect=True | ||||||
) | ||||||
def test_no_encoder_created(input_data_featuretest): | ||||||
def test_no_column_transformer_created(input_data_featuretest): | ||||||
""" | ||||||
Makes sure that for numerical only features, no encoder is created | ||||||
""" | ||||||
validator = TabularFeatureValidator() | ||||||
validator.fit(input_data_featuretest) | ||||||
validator.transform(input_data_featuretest) | ||||||
assert validator.encoder is None | ||||||
assert validator.column_transformer is None | ||||||
|
||||||
|
||||||
@pytest.mark.parametrize( | ||||||
|
@@ -501,18 +493,18 @@ def test_no_encoder_created(input_data_featuretest): | |||||
), | ||||||
indirect=True | ||||||
) | ||||||
def test_encoder_created(input_data_featuretest): | ||||||
def test_column_transformer_created(input_data_featuretest): | ||||||
""" | ||||||
This test ensures an encoder is created if categorical data is provided | ||||||
This test ensures a column transformer is created if categorical data is provided | ||||||
""" | ||||||
validator = TabularFeatureValidator() | ||||||
validator.fit(input_data_featuretest) | ||||||
transformed_X = validator.transform(input_data_featuretest) | ||||||
assert validator.encoder is not None | ||||||
assert validator.column_transformer is not None | ||||||
|
||||||
# Make sure that the encoded features are actually encoded. Categorical columns are at | ||||||
# the start after transformation. In our fixtures, this is also honored prior encode | ||||||
enc_columns, feature_types = validator._get_columns_to_encode(input_data_featuretest) | ||||||
cat_columns, _, feature_types = validator._get_columns_info(input_data_featuretest) | ||||||
|
||||||
# At least one categorical | ||||||
assert 'categorical' in validator.feat_type | ||||||
|
@@ -521,20 +513,13 @@ def test_encoder_created(input_data_featuretest): | |||||
if np.any([pd.api.types.is_numeric_dtype(input_data_featuretest[col] | ||||||
) for col in input_data_featuretest.columns]): | ||||||
assert 'numerical' in validator.feat_type | ||||||
for i, feat_type in enumerate(feature_types): | ||||||
if 'numerical' in feat_type: | ||||||
np.testing.assert_array_equal( | ||||||
transformed_X[:, i], | ||||||
input_data_featuretest[input_data_featuretest.columns[i]].to_numpy() | ||||||
) | ||||||
elif 'categorical' in feat_type: | ||||||
np.testing.assert_array_equal( | ||||||
transformed_X[:, i], | ||||||
# Expect always 0, 1... because we use a ordinal encoder | ||||||
np.array([0, 1]) | ||||||
) | ||||||
else: | ||||||
raise ValueError(feat_type) | ||||||
# we expect this input to be the fixture 'pandas_mixed_nan' | ||||||
np.testing.assert_array_equal(transformed_X, np.array([[1., 0., -1.], [0., 1., 1.]])) | ||||||
else: | ||||||
np.testing.assert_array_equal(transformed_X, np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]])) | ||||||
|
||||||
if not all([feat_type in ['numerical', 'categorical'] for feat_type in feature_types]): | ||||||
raise ValueError("Expected only numerical and categorical feature types") | ||||||
|
||||||
|
||||||
def test_no_new_category_after_fit(): | ||||||
|
@@ -566,13 +551,12 @@ def test_unknown_encode_value(): | |||||
x['c'].cat.add_categories(['NA'], inplace=True) | ||||||
x.loc[0, 'c'] = 'NA' # unknown value | ||||||
x_t = validator.transform(x) | ||||||
# The first row should have a -1 as we added a new categorical there | ||||||
expected_row = [-1, -41, -3, -987.2] | ||||||
# The first row should have a 0, 0 as we added a | ||||||
# new categorical there and one hot encoder marks | ||||||
# it as all zeros for the transformed column | ||||||
expected_row = [0.0, 0.0, -0.5584294383572701, 0.5000000000000004, -1.5136598016833485] | ||||||
assert expected_row == x_t[0].tolist() | ||||||
|
||||||
# Notice how there is only one column 'c' to encode | ||||||
assert validator.categories == [list(range(2)) for i in range(1)] | ||||||
|
||||||
|
||||||
# Actual checks for the features | ||||||
@pytest.mark.parametrize( | ||||||
|
@@ -624,19 +608,20 @@ def test_feature_validator_new_data_after_fit( | |||||
assert sparse.issparse(transformed_X) | ||||||
else: | ||||||
assert isinstance(transformed_X, np.ndarray) | ||||||
assert np.shape(X_test) == np.shape(transformed_X) | ||||||
|
||||||
# And then check proper error messages | ||||||
if train_data_type == 'pandas': | ||||||
old_dtypes = copy.deepcopy(validator.dtypes) | ||||||
validator.dtypes = ['dummy' for dtype in X_train.dtypes] | ||||||
with pytest.raises(ValueError, match=r"Changing the dtype of the features after fit"): | ||||||
with pytest.raises(ValueError, | ||||||
match=r"The dtype of the features must not be changed after fit"): | ||||||
transformed_X = validator.transform(X_test) | ||||||
validator.dtypes = old_dtypes | ||||||
if test_data_type == 'pandas': | ||||||
columns = X_test.columns.tolist() | ||||||
X_test = X_test[reversed(columns)] | ||||||
with pytest.raises(ValueError, match=r"Changing the column order of the features"): | ||||||
with pytest.raises(ValueError, | ||||||
match=r"The column order of the features must not be changed after fit"): | ||||||
transformed_X = validator.transform(X_test) | ||||||
|
||||||
|
||||||
|
Uh oh!
There was an error while loading. Please reload this page.