@@ -205,7 +205,6 @@ def test_featurevalidator_supported_types(input_data_featuretest):
205
205
assert sparse .issparse (transformed_X )
206
206
else :
207
207
assert isinstance (transformed_X , np .ndarray )
208
- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
209
208
assert np .issubdtype (transformed_X .dtype , np .number )
210
209
assert validator ._is_fitted
211
210
@@ -238,11 +237,10 @@ def test_featurevalidator_categorical_nan(input_data_featuretest):
238
237
validator .fit (input_data_featuretest )
239
238
transformed_X = validator .transform (input_data_featuretest )
240
239
assert any (pd .isna (input_data_featuretest ))
241
- categories_ = validator .column_transformer .named_transformers_ [ 'categorical_pipeline' ]. \
242
- named_steps ['ordinalencoder ' ].categories_
240
+ categories_ = validator .column_transformer .\
241
+ named_transformers_ [ 'categorical_pipeline' ]. named_steps ['onehotencoder ' ].categories_
243
242
assert any (('0' in categories ) or (0 in categories ) or ('missing_value' in categories ) for categories in
244
243
categories_ )
245
- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
246
244
assert np .issubdtype (transformed_X .dtype , np .number )
247
245
assert validator ._is_fitted
248
246
assert isinstance (transformed_X , np .ndarray )
@@ -295,7 +293,6 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
295
293
else :
296
294
raise ValueError (type (input_data_featuretest ))
297
295
transformed_X = validator .transform (complementary_type )
298
- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
299
296
assert np .issubdtype (transformed_X .dtype , np .number )
300
297
assert validator ._is_fitted
301
298
@@ -315,12 +312,6 @@ def test_featurevalidator_get_columns_to_encode():
315
312
for col in df .columns :
316
313
df [col ] = df [col ].astype (col )
317
314
318
- < << << << HEAD
319
- transformed_columns , feature_types = validator ._get_columns_to_encode (df )
320
-
321
- assert transformed_columns == ['category' , 'bool' ]
322
- assert feature_types == ['numerical' , 'numerical' , 'categorical' , 'categorical' ]
323
- == == == =
324
315
validator .fit (df )
325
316
326
317
categorical_columns , numerical_columns , feat_type = validator ._get_columns_info (df )
@@ -436,7 +427,6 @@ def test_feature_validator_remove_nan_catcolumns():
436
427
)
437
428
ans_test = np .array ([[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ]], dtype = np .float64 )
438
429
feature_validator_remove_nan_catcolumns (df_train , df_test , ans_train , ans_test )
439
- >> >> >> > Bug fixes (#249)
440
430
441
431
442
432
def test_features_unsupported_calls_are_raised ():
@@ -446,36 +436,29 @@ def test_features_unsupported_calls_are_raised():
446
436
expected
447
437
"""
448
438
validator = TabularFeatureValidator ()
449
- with pytest .raises (ValueError , match = r"AutoPyTorch does not support time " ):
439
+ with pytest .raises (TypeError , match = r".*?Convert the time information to a numerical value " ):
450
440
validator .fit (
451
441
pd .DataFrame ({'datetime' : [pd .Timestamp ('20180310' )]})
452
442
)
443
+ validator = TabularFeatureValidator ()
453
444
with pytest .raises (ValueError , match = r"AutoPyTorch only supports.*yet, the provided input" ):
454
445
validator .fit ({'input1' : 1 , 'input2' : 2 })
455
- with pytest .raises (ValueError , match = r"has unsupported dtype string" ):
446
+ validator = TabularFeatureValidator ()
447
+ with pytest .raises (TypeError , match = r".*?but input column A has an invalid type `string`.*" ):
456
448
validator .fit (pd .DataFrame ([{'A' : 1 , 'B' : 2 }], dtype = 'string' ))
449
+ validator = TabularFeatureValidator ()
457
450
with pytest .raises (ValueError , match = r"The feature dimensionality of the train and test" ):
458
451
validator .fit (X_train = np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]),
459
452
X_test = np .array ([[1 , 2 , 3 , 4 ], [4 , 5 , 6 , 7 ]]),
460
453
)
454
+ validator = TabularFeatureValidator ()
461
455
with pytest .raises (ValueError , match = r"Cannot call transform on a validator that is not fit" ):
462
456
validator .transform (np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]))
463
457
464
458
465
459
@pytest .mark .parametrize (
466
460
'input_data_featuretest' ,
467
461
(
468
- 'numpy_numericalonly_nonan' ,
469
- 'numpy_numericalonly_nan' ,
470
- 'pandas_numericalonly_nonan' ,
471
- 'pandas_numericalonly_nan' ,
472
- 'list_numericalonly_nonan' ,
473
- 'list_numericalonly_nan' ,
474
- # Category in numpy is handled via feat_type
475
- 'numpy_categoricalonly_nonan' ,
476
- 'numpy_mixed_nonan' ,
477
- 'numpy_categoricalonly_nan' ,
478
- 'numpy_mixed_nan' ,
479
462
'sparse_bsr_nonan' ,
480
463
'sparse_bsr_nan' ,
481
464
'sparse_coo_nonan' ,
@@ -513,7 +496,7 @@ def test_no_column_transformer_created(input_data_featuretest):
513
496
)
514
497
def test_column_transformer_created (input_data_featuretest ):
515
498
"""
516
- This test ensures an encoder is created if categorical data is provided
499
+ This test ensures a column transformer is created if categorical data is provided
517
500
"""
518
501
validator = TabularFeatureValidator ()
519
502
validator .fit (input_data_featuretest )
@@ -522,7 +505,7 @@ def test_column_transformer_created(input_data_featuretest):
522
505
523
506
# Make sure that the encoded features are actually encoded. Categorical columns are at
524
507
# the start after transformation. In our fixtures, this is also honored prior to encoding
525
- transformed_columns , feature_types = validator ._get_columns_to_encode (input_data_featuretest )
508
+ cat_columns , _ , feature_types = validator ._get_columns_info (input_data_featuretest )
526
509
527
510
# At least one categorical
528
511
assert 'categorical' in validator .feat_type
@@ -531,20 +514,13 @@ def test_column_transformer_created(input_data_featuretest):
531
514
if np .any ([pd .api .types .is_numeric_dtype (input_data_featuretest [col ]
532
515
) for col in input_data_featuretest .columns ]):
533
516
assert 'numerical' in validator .feat_type
534
- for i , feat_type in enumerate (feature_types ):
535
- if 'numerical' in feat_type :
536
- np .testing .assert_array_equal (
537
- transformed_X [:, i ],
538
- input_data_featuretest [input_data_featuretest .columns [i ]].to_numpy ()
539
- )
540
- elif 'categorical' in feat_type :
541
- np .testing .assert_array_equal (
542
- transformed_X [:, i ],
543
- # Expect always 0, 1... because we use a ordinal encoder
544
- np .array ([0 , 1 ])
545
- )
546
- else :
547
- raise ValueError (feat_type )
517
+ # we expect this input to be the fixture 'pandas_mixed_nan'
518
+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , - 1. ], [0. , 1. , 1. ]]))
519
+ else :
520
+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , 1. , 0. ], [0. , 1. , 0. , 1. ]]))
521
+
522
+ if not all ([feat_type in ['numerical' , 'categorical' ] for feat_type in feature_types ]):
523
+ raise ValueError ("Expected only numerical and categorical feature types" )
548
524
549
525
550
526
def test_no_new_category_after_fit ():
@@ -576,13 +552,12 @@ def test_unknown_encode_value():
576
552
x ['c' ].cat .add_categories (['NA' ], inplace = True )
577
553
x .loc [0 , 'c' ] = 'NA' # unknown value
578
554
x_t = validator .transform (x )
579
- # The first row should have a -1 as we added a new categorical there
580
- expected_row = [- 1 , - 41 , - 3 , - 987.2 ]
555
+ # The first row should have a 0, 0 as we added a
556
+ # new categorical there and one hot encoder marks
557
+ # it as all zeros for the transformed column
558
+ expected_row = [0.0 , 0.0 , - 0.5584294383572701 , 0.5000000000000004 , - 1.5136598016833485 ]
581
559
assert expected_row == x_t [0 ].tolist ()
582
560
583
- # Notice how there is only one column 'c' to encode
584
- assert validator .categories == [list (range (2 )) for i in range (1 )]
585
-
586
561
587
562
# Actual checks for the features
588
563
@pytest .mark .parametrize (
@@ -634,19 +609,20 @@ def test_feature_validator_new_data_after_fit(
634
609
assert sparse .issparse (transformed_X )
635
610
else :
636
611
assert isinstance (transformed_X , np .ndarray )
637
- assert np .shape (X_test ) == np .shape (transformed_X )
638
612
639
613
# And then check proper error messages
640
614
if train_data_type == 'pandas' :
641
615
old_dtypes = copy .deepcopy (validator .dtypes )
642
616
validator .dtypes = ['dummy' for dtype in X_train .dtypes ]
643
- with pytest .raises (ValueError , match = r"Changing the dtype of the features after fit" ):
617
+ with pytest .raises (ValueError ,
618
+ match = r"The dtype of the features must not be changed after fit" ):
644
619
transformed_X = validator .transform (X_test )
645
620
validator .dtypes = old_dtypes
646
621
if test_data_type == 'pandas' :
647
622
columns = X_test .columns .tolist ()
648
623
X_test = X_test [reversed (columns )]
649
- with pytest .raises (ValueError , match = r"Changing the column order of the features" ):
624
+ with pytest .raises (ValueError ,
625
+ match = r"The column order of the features must not be changed after fit" ):
650
626
transformed_X = validator .transform (X_test )
651
627
652
628
0 commit comments