diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index ea09798ce..3d7ca22b1 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -1,9 +1,7 @@ from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter -) +from ConfigSpace.hyperparameters import CategoricalHyperparameter import numpy as np @@ -15,92 +13,143 @@ class SimpleImputer(BaseImputer): - """ - Impute missing values for categorical columns with '!missing!' - (In case of numpy data, the constant value is set to -1, under - the assumption that categorical data is fit with an Ordinal Scaler) + """An imputer for categorical and numerical columns + + Impute missing values for categorical columns with 'constant_!missing!' + + Note: + In case of numpy data, the constant value is set to -1, under the assumption + that categorical data is fit with an Ordinal Scaler. + + Attributes: + random_state (Optional[np.random.RandomState]): + The random state to use for the imputer. + numerical_strategy (str: default='mean'): + The strategy to use for imputing numerical columns. + Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] + categorical_strategy (str: default='most_frequent'): + The strategy to use for imputing categorical columns. 
+ Can be one of ['most_frequent', 'constant_!missing!'] """ - def __init__(self, - random_state: Optional[Union[np.random.RandomState, int]] = None, - numerical_strategy: str = 'mean', - categorical_strategy: str = 'most_frequent'): + def __init__( + self, + random_state: Optional[np.random.RandomState] = None, + numerical_strategy: str = 'mean', + categorical_strategy: str = 'most_frequent' + ): + """ + Note: + 'constant_zero' as numerical_strategy uses 0 as the default fill_value while + 'constant_!missing!' uses a fill_value of -1. + This behaviour should probably be fixed. + """ super().__init__() self.random_state = random_state self.numerical_strategy = numerical_strategy self.categorical_strategy = categorical_strategy - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer: - """ - The fit function calls the fit function of the underlying model - and returns the transformed array. + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: + """Sets up the underlying imputers and returns self. + Args: - X (np.ndarray): input features - y (Optional[np.ndarray]): input labels + X (np.ndarray): + The input features to fit on + y (Optional[np.ndarray]): + The labels for the input features `X` Returns: - instance of self + SimpleImputer: + returns self """ self.check_requirements(X, y) - categorical_columns = X['dataset_properties']['categorical_columns'] \ - if isinstance(X['dataset_properties']['categorical_columns'], List) else [] - if len(categorical_columns) != 0: + + # Choose an imputer for any categorical columns + categorical_columns = X['dataset_properties']['categorical_columns'] + + if isinstance(categorical_columns, List) and len(categorical_columns) != 0: if self.categorical_strategy == 'constant_!missing!': - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant', - # Train data is numpy - # as of this point, where - # Ordinal Encoding is using - # for categorical. 
Only - # Numbers are allowed - # fill_value='!missing!', - fill_value=-1, - copy=False) + # Train data is numpy as of this point, where an Ordinal Encoding is used + # for categoricals. Only Numbers are allowed for `fill_value` + imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False) + self.preprocessor['categorical'] = imputer else: - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy, - copy=False) - numerical_columns = X['dataset_properties']['numerical_columns'] \ - if isinstance(X['dataset_properties']['numerical_columns'], List) else [] - if len(numerical_columns) != 0: + imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False) + self.preprocessor['categorical'] = imputer + + # Choose an imputer for any numerical columns + numerical_columns = X['dataset_properties']['numerical_columns'] + + if isinstance(numerical_columns, List) and len(numerical_columns) > 0: if self.numerical_strategy == 'constant_zero': - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy='constant', - fill_value=0, - copy=False) + imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False) + self.preprocessor['numerical'] = imputer else: - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) + imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) + self.preprocessor['numerical'] = imputer return self @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='numerical_strategy', - value_range=("mean", "median", - "most_frequent", - "constant_zero"), - default_value="mean", - ), + numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='numerical_strategy', + value_range=("mean", "median", "most_frequent", 
"constant_zero"), + default_value="mean", + ), categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter='categorical_strategy', - value_range=("most_frequent", - "constant_!missing!"), - default_value="most_frequent") + value_range=("most_frequent", "constant_!missing!"), + default_value="most_frequent" + ) ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) + The strategy to use for numerical imputation + caterogical_strategy (HyperparameterSearchSpace: default = ...) + The strategy to use for categorical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ cs = ConfigurationSpace() - assert dataset_properties is not None, "To create hyperparameter search space" \ - ", dataset_properties should not be None" - if len(dataset_properties['numerical_columns']) \ - if isinstance(dataset_properties['numerical_columns'], List) else 0 != 0: + + if dataset_properties is None: + raise ValueError("SimpleImputer requires `dataset_properties` for generating" + " a search space.") + + if ( + isinstance(dataset_properties['numerical_columns'], List) + and len(dataset_properties['numerical_columns']) != 0 + ): add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter) - if len(dataset_properties['categorical_columns']) \ - if isinstance(dataset_properties['categorical_columns'], List) else 0 != 0: + if ( + isinstance(dataset_properties['categorical_columns'], List) + and len(dataset_properties['categorical_columns']) + ): add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter) return cs @staticmethod - def get_properties(dataset_properties: 
Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + """Get the properties of the SimpleImputer class and what it can handle + + Returns: + Dict[str, Union[str, bool]]: + A dict from property names to values + """ return { 'shortname': 'SimpleImputer', 'name': 'Simple Imputer', diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py index 983737dfe..18b43bfa6 100644 --- a/test/test_pipeline/components/preprocessing/test_imputers.py +++ b/test/test_pipeline/components/preprocessing/test_imputers.py @@ -3,6 +3,8 @@ import numpy as np from numpy.testing import assert_array_equal +import pytest + from sklearn.base import BaseEstimator, clone from sklearn.compose import make_column_transformer @@ -213,6 +215,16 @@ def test_constant_imputation(self): [7.0, '0', 9], [4.0, '0', '0']], dtype=str)) + def test_imputation_without_dataset_properties_raises_error(self): + """Tests SimpleImputer checks for dataset properties when querying for + HyperparameterSearchSpace, even though the arg is marked `Optional`. + + Expects: + * Should raise a ValueError that no dataset_properties were passed + """ + with pytest.raises(ValueError): + SimpleImputer.get_hyperparameter_search_space() + if __name__ == '__main__': unittest.main()