@@ -204,7 +204,6 @@ def test_featurevalidator_supported_types(input_data_featuretest):
204
204
assert sparse .issparse (transformed_X )
205
205
else :
206
206
assert isinstance (transformed_X , np .ndarray )
207
- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
208
207
assert np .issubdtype (transformed_X .dtype , np .number )
209
208
assert validator ._is_fitted
210
209
@@ -237,11 +236,10 @@ def test_featurevalidator_categorical_nan(input_data_featuretest):
237
236
validator .fit (input_data_featuretest )
238
237
transformed_X = validator .transform (input_data_featuretest )
239
238
assert any (pd .isna (input_data_featuretest ))
240
- categories_ = validator .column_transformer .named_transformers_ [ 'categorical_pipeline' ]. \
241
- named_steps ['ordinalencoder ' ].categories_
239
+ categories_ = validator .column_transformer .\
240
+ named_transformers_ [ 'categorical_pipeline' ]. named_steps ['onehotencoder ' ].categories_
242
241
assert any (('0' in categories ) or (0 in categories ) or ('missing_value' in categories ) for categories in
243
242
categories_ )
244
- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
245
243
assert np .issubdtype (transformed_X .dtype , np .number )
246
244
assert validator ._is_fitted
247
245
assert isinstance (transformed_X , np .ndarray )
@@ -294,7 +292,6 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
294
292
else :
295
293
raise ValueError (type (input_data_featuretest ))
296
294
transformed_X = validator .transform (complementary_type )
297
- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
298
295
assert np .issubdtype (transformed_X .dtype , np .number )
299
296
assert validator ._is_fitted
300
297
@@ -314,12 +311,6 @@ def test_featurevalidator_get_columns_to_encode():
314
311
for col in df .columns :
315
312
df [col ] = df [col ].astype (col )
316
313
317
- < << << << HEAD
318
- transformed_columns , feature_types = validator ._get_columns_to_encode (df )
319
-
320
- assert transformed_columns == ['category' , 'bool' ]
321
- assert feature_types == ['numerical' , 'numerical' , 'categorical' , 'categorical' ]
322
- == == == =
323
314
validator .fit (df )
324
315
325
316
categorical_columns , numerical_columns , feat_type = validator ._get_columns_info (df )
@@ -435,7 +426,6 @@ def test_feature_validator_remove_nan_catcolumns():
435
426
)
436
427
ans_test = np .array ([[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ]], dtype = np .float64 )
437
428
feature_validator_remove_nan_catcolumns (df_train , df_test , ans_train , ans_test )
438
- >> >> >> > Bug fixes (#249)
439
429
440
430
441
431
def test_features_unsupported_calls_are_raised ():
@@ -445,36 +435,29 @@ def test_features_unsupported_calls_are_raised():
445
435
expected
446
436
"""
447
437
validator = TabularFeatureValidator ()
448
- with pytest .raises (ValueError , match = r"AutoPyTorch does not support time " ):
438
+ with pytest .raises (TypeError , match = r".*?Convert the time information to a numerical value " ):
449
439
validator .fit (
450
440
pd .DataFrame ({'datetime' : [pd .Timestamp ('20180310' )]})
451
441
)
442
+ validator = TabularFeatureValidator ()
452
443
with pytest .raises (ValueError , match = r"AutoPyTorch only supports.*yet, the provided input" ):
453
444
validator .fit ({'input1' : 1 , 'input2' : 2 })
454
- with pytest .raises (ValueError , match = r"has unsupported dtype string" ):
445
+ validator = TabularFeatureValidator ()
446
+ with pytest .raises (TypeError , match = r".*?but input column A has an invalid type `string`.*" ):
455
447
validator .fit (pd .DataFrame ([{'A' : 1 , 'B' : 2 }], dtype = 'string' ))
448
+ validator = TabularFeatureValidator ()
456
449
with pytest .raises (ValueError , match = r"The feature dimensionality of the train and test" ):
457
450
validator .fit (X_train = np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]),
458
451
X_test = np .array ([[1 , 2 , 3 , 4 ], [4 , 5 , 6 , 7 ]]),
459
452
)
453
+ validator = TabularFeatureValidator ()
460
454
with pytest .raises (ValueError , match = r"Cannot call transform on a validator that is not fit" ):
461
455
validator .transform (np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]))
462
456
463
457
464
458
@pytest .mark .parametrize (
465
459
'input_data_featuretest' ,
466
460
(
467
- 'numpy_numericalonly_nonan' ,
468
- 'numpy_numericalonly_nan' ,
469
- 'pandas_numericalonly_nonan' ,
470
- 'pandas_numericalonly_nan' ,
471
- 'list_numericalonly_nonan' ,
472
- 'list_numericalonly_nan' ,
473
- # Category in numpy is handled via feat_type
474
- 'numpy_categoricalonly_nonan' ,
475
- 'numpy_mixed_nonan' ,
476
- 'numpy_categoricalonly_nan' ,
477
- 'numpy_mixed_nan' ,
478
461
'sparse_bsr_nonan' ,
479
462
'sparse_bsr_nan' ,
480
463
'sparse_coo_nonan' ,
@@ -512,7 +495,7 @@ def test_no_column_transformer_created(input_data_featuretest):
512
495
)
513
496
def test_column_transformer_created (input_data_featuretest ):
514
497
"""
515
- This test ensures an encoder is created if categorical data is provided
498
+ This test ensures a column transformer is created if categorical data is provided
516
499
"""
517
500
validator = TabularFeatureValidator ()
518
501
validator .fit (input_data_featuretest )
@@ -521,7 +504,7 @@ def test_column_transformer_created(input_data_featuretest):
521
504
522
505
# Make sure that the encoded features are actually encoded. Categorical columns are at
523
506
# the start after transformation. In our fixtures, this is also honored prior to encoding
524
- transformed_columns , feature_types = validator ._get_columns_to_encode (input_data_featuretest )
507
+ cat_columns , _ , feature_types = validator ._get_columns_info (input_data_featuretest )
525
508
526
509
# At least one categorical
527
510
assert 'categorical' in validator .feat_type
@@ -530,20 +513,13 @@ def test_column_transformer_created(input_data_featuretest):
530
513
if np .any ([pd .api .types .is_numeric_dtype (input_data_featuretest [col ]
531
514
) for col in input_data_featuretest .columns ]):
532
515
assert 'numerical' in validator .feat_type
533
- for i , feat_type in enumerate (feature_types ):
534
- if 'numerical' in feat_type :
535
- np .testing .assert_array_equal (
536
- transformed_X [:, i ],
537
- input_data_featuretest [input_data_featuretest .columns [i ]].to_numpy ()
538
- )
539
- elif 'categorical' in feat_type :
540
- np .testing .assert_array_equal (
541
- transformed_X [:, i ],
542
- # Expect always 0, 1... because we use a ordinal encoder
543
- np .array ([0 , 1 ])
544
- )
545
- else :
546
- raise ValueError (feat_type )
516
+ # we expect this input to be the fixture 'pandas_mixed_nan'
517
+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , - 1. ], [0. , 1. , 1. ]]))
518
+ else :
519
+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , 1. , 0. ], [0. , 1. , 0. , 1. ]]))
520
+
521
+ if not all ([feat_type in ['numerical' , 'categorical' ] for feat_type in feature_types ]):
522
+ raise ValueError ("Expected only numerical and categorical feature types" )
547
523
548
524
549
525
def test_no_new_category_after_fit ():
@@ -575,13 +551,12 @@ def test_unknown_encode_value():
575
551
x ['c' ].cat .add_categories (['NA' ], inplace = True )
576
552
x .loc [0 , 'c' ] = 'NA' # unknown value
577
553
x_t = validator .transform (x )
578
- # The first row should have a -1 as we added a new categorical there
579
- expected_row = [- 1 , - 41 , - 3 , - 987.2 ]
554
+ # The first row should have a 0, 0 as we added a
555
+ # new category there and the one-hot encoder marks
556
+ # it as all zeros for the transformed column
557
+ expected_row = [0.0 , 0.0 , - 0.5584294383572701 , 0.5000000000000004 , - 1.5136598016833485 ]
580
558
assert expected_row == x_t [0 ].tolist ()
581
559
582
- # Notice how there is only one column 'c' to encode
583
- assert validator .categories == [list (range (2 )) for i in range (1 )]
584
-
585
560
586
561
# Actual checks for the features
587
562
@pytest .mark .parametrize (
@@ -633,19 +608,20 @@ def test_feature_validator_new_data_after_fit(
633
608
assert sparse .issparse (transformed_X )
634
609
else :
635
610
assert isinstance (transformed_X , np .ndarray )
636
- assert np .shape (X_test ) == np .shape (transformed_X )
637
611
638
612
# And then check proper error messages
639
613
if train_data_type == 'pandas' :
640
614
old_dtypes = copy .deepcopy (validator .dtypes )
641
615
validator .dtypes = ['dummy' for dtype in X_train .dtypes ]
642
- with pytest .raises (ValueError , match = r"Changing the dtype of the features after fit" ):
616
+ with pytest .raises (ValueError ,
617
+ match = r"The dtype of the features must not be changed after fit" ):
643
618
transformed_X = validator .transform (X_test )
644
619
validator .dtypes = old_dtypes
645
620
if test_data_type == 'pandas' :
646
621
columns = X_test .columns .tolist ()
647
622
X_test = X_test [reversed (columns )]
648
- with pytest .raises (ValueError , match = r"Changing the column order of the features" ):
623
+ with pytest .raises (ValueError ,
624
+ match = r"The column order of the features must not be changed after fit" ):
649
625
transformed_X = validator .transform (X_test )
650
626
651
627
0 commit comments