From 61b1a2980a06c97436ed6b96c7dbf2277c38f19e Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 30 Nov 2021 11:38:31 +0100 Subject: [PATCH 1/7] cleanup of simple_imputer --- .../imputation/SimpleImputer.py | 170 ++++++++++++------ 1 file changed, 115 insertions(+), 55 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index ea09798ce..d0a05bc9b 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -1,9 +1,7 @@ from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter -) +from ConfigSpace.hyperparameters import CategoricalHyperparameter import numpy as np @@ -15,92 +13,154 @@ class SimpleImputer(BaseImputer): + """An imputer for categorical and numerical columns + + Impute missing values for categorical columns with 'constant_!missing!' + + Note: + In case of numpy data, the constant value is set to -1, under the assumption + that categorical data is fit with an Ordinal Scaler. """ - Impute missing values for categorical columns with '!missing!' 
- (In case of numpy data, the constant value is set to -1, under - the assumption that categorical data is fit with an Ordinal Scaler) - """ - def __init__(self, - random_state: Optional[Union[np.random.RandomState, int]] = None, - numerical_strategy: str = 'mean', - categorical_strategy: str = 'most_frequent'): + def __init__( + self, + random_state: Optional[Union[np.random.RandomState, int]] = None, + numerical_strategy: str = 'mean', + categorical_strategy: str = 'most_frequent' + ): + """ + Parameters + ---------- + random_state: Optional[Union[np.random.RandomState, int]] = None + The random state to use for the imputer + + numerical_strategy: str = 'mean', + The strategy to use for imputing numerical columns. + Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!'] + + Note: + Using 'constant' defaults to fill_value of 0 where 'constant_!missing!' + uses a fill_value of -1. This behaviour should probably be fixed. + + categorical_strategy: str = 'most_frequent' + The strategy to use for imputing categorical columns. + Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] + """ super().__init__() self.random_state = random_state self.numerical_strategy = numerical_strategy self.categorical_strategy = categorical_strategy - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer: + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: """ The fit function calls the fit function of the underlying model and returns the transformed array. 
- Args: - X (np.ndarray): input features - y (Optional[np.ndarray]): input labels - Returns: - instance of self + Parameters + ---------- + X: np.ndarray + The input features to fit on + + y: Optional[np.ndarray] + The labels for the input features `X` + + Returns + ------- + SimpleImputer + returns self """ self.check_requirements(X, y) - categorical_columns = X['dataset_properties']['categorical_columns'] \ - if isinstance(X['dataset_properties']['categorical_columns'], List) else [] - if len(categorical_columns) != 0: + + # Choose an imputer for any categorical columns + categorical_columns = X['dataset_properties']['categorical_columns'] + + if isinstance(categorical_columns, List) and len(categorical_columns) != 0: if self.categorical_strategy == 'constant_!missing!': - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant', - # Train data is numpy - # as of this point, where - # Ordinal Encoding is using - # for categorical. Only - # Numbers are allowed - # fill_value='!missing!', - fill_value=-1, - copy=False) + # Train data is numpy as of this point, where an Ordinal Encoding is used + # for categoricals. 
Only Numbers are allowed for `fill_value` + imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False) + self.preprocessor['categorical'] = imputer else: - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy, - copy=False) - numerical_columns = X['dataset_properties']['numerical_columns'] \ - if isinstance(X['dataset_properties']['numerical_columns'], List) else [] - if len(numerical_columns) != 0: + imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False) + self.preprocessor['categorical'] = imputer + + # Choose an imputer for any numerical columns + numerical_columns = X['dataset_properties']['numerical_columns'] + + if isinstance(numerical_columns, List) and len(numerical_columns) > 0: if self.numerical_strategy == 'constant_zero': - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy='constant', - fill_value=0, - copy=False) + imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False) + self.preprocessor['numerical'] = imputer else: - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) + imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) + self.preprocessor['numerical'] = imputer return self @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='numerical_strategy', - value_range=("mean", "median", - "most_frequent", - "constant_zero"), - default_value="mean", - ), + numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='numerical_strategy', + value_range=("mean", "median", "most_frequet", "constant_zero"), + default_value="mean", + ), categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter='categorical_strategy', - value_range=("most_frequent", - 
"constant_!missing!"), - default_value="most_frequent") + value_range=("most_frequent", "constant_!missing!"), + default_value="most_frequent" + ) ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Parameters + ---------- + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + Properties that describe the dataset + + numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(...) + The strategy to use for numerical imputation + + caterogical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(...) + The strategy to use for categorical imputation + + Returns + ------- + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ cs = ConfigurationSpace() - assert dataset_properties is not None, "To create hyperparameter search space" \ - ", dataset_properties should not be None" - if len(dataset_properties['numerical_columns']) \ - if isinstance(dataset_properties['numerical_columns'], List) else 0 != 0: + + if dataset_properties is None: + raise ValueError("SimpleImputer requires `dataset_properties` for generating" + " a search space.") + + if ( + isinstance(dataset_properties['numerical_columns'], List) + and len(dataset_properties['numerical_columns']) != 0 + ): add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter) - if len(dataset_properties['categorical_columns']) \ - if isinstance(dataset_properties['categorical_columns'], List) else 0 != 0: + if ( + isinstance(dataset_properties['categorical_columns'], List) + and len(dataset_properties['categorical_columns']) + ): add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter) return cs @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> 
Dict[str, Union[str, bool]]: + """Get the properties of the SimpleImputer class and what it can handle + + Returns + ------- + Dict[str, Union[str, bool]] + A dict from property names to values + """ return { 'shortname': 'SimpleImputer', 'name': 'Simple Imputer', From bbabad8f0f84a1e5cad4051a8ad44b646f38b8a3 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 30 Nov 2021 12:47:18 +0100 Subject: [PATCH 2/7] Fixed doc and typo --- .../imputation/SimpleImputer.py | 88 ++++++++----------- 1 file changed, 37 insertions(+), 51 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index d0a05bc9b..a13246fae 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -29,22 +29,19 @@ def __init__( categorical_strategy: str = 'most_frequent' ): """ - Parameters - ---------- - random_state: Optional[Union[np.random.RandomState, int]] = None - The random state to use for the imputer - - numerical_strategy: str = 'mean', - The strategy to use for imputing numerical columns. - Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!'] - - Note: - Using 'constant' defaults to fill_value of 0 where 'constant_!missing!' - uses a fill_value of -1. This behaviour should probably be fixed. - - categorical_strategy: str = 'most_frequent' - The strategy to use for imputing categorical columns. - Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] + Note: + Using 'constant' defaults to fill_value of 0 where 'constant_!missing!' + uses a fill_value of -1. This behaviour should probably be fixed. + + Args: + random_state (Optional[Union[np.random.RandomState, int]]): + The random state to use for the imputer. 
+ numerical_strategy (str: default='mean'): + The strategy to use for imputing numerical columns. + Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!'] + categorical_strategy (str: default='most_frequent') + The strategy to use for imputing categorical columns. + Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] """ super().__init__() self.random_state = random_state @@ -52,22 +49,16 @@ def __init__( self.categorical_strategy = categorical_strategy def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: - """ - The fit function calls the fit function of the underlying model - and returns the transformed array. + """ Fits the underlying model and returns self. - Parameters - ---------- - X: np.ndarray - The input features to fit on + Args: + X (np.ndarray): + The input features to fit on + y (Optional[np.ndarray]): + The labels for the input features `X` - y: Optional[np.ndarray] - The labels for the input features `X` - - Returns - ------- - SimpleImputer - returns self + Returns: + SimpleImputer: returns the object itself """ self.check_requirements(X, y) @@ -102,7 +93,7 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter='numerical_strategy', - value_range=("mean", "median", "most_frequet", "constant_zero"), + value_range=("mean", "median", "most_frequent", "constant_zero"), default_value="mean", ), categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( @@ -113,22 +104,19 @@ def get_hyperparameter_search_space( ) -> ConfigurationSpace: """Get the hyperparameter search space for the SimpleImputer - Parameters - ---------- - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - Properties that describe the dataset - - numerical_strategy: HyperparameterSearchSpace = 
HyperparameterSearchSpace(...) - The strategy to use for numerical imputation - - caterogical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(...) - The strategy to use for categorical imputation - - Returns - ------- - ConfigurationSpace - The space of possible configurations for a SimpleImputer with the given - `dataset_properties` + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) + The strategy to use for numerical imputation + categorical_strategy (HyperparameterSearchSpace: default = ...) + The strategy to use for categorical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` """ cs = ConfigurationSpace() @@ -156,10 +144,8 @@ def get_properties( ) -> Dict[str, Union[str, bool]]: """Get the properties of the SimpleImputer class and what it can handle - Returns - ------- - Dict[str, Union[str, bool]] - A dict from property names to values + Returns: + Dict[str, Union[str, bool]]: A dict from property names to values """ return { 'shortname': 'SimpleImputer', 'name': 'Simple Imputer', From c92d03913e11f381bd9528dc6a61e7d38a54d3c4 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 30 Nov 2021 14:28:02 +0100 Subject: [PATCH 3/7] Fixed docs --- .../tabular_preprocessing/imputation/SimpleImputer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index a13246fae..341964791 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py 
@@ -58,7 +58,8 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: The labels for the input features `X` Returns: - SimpleImputer: returns the object itself + SimpleImputer: + returns self """ self.check_requirements(X, y) @@ -145,7 +146,8 @@ def get_properties( """Get the properties of the SimpleImputer class and what it can handle Returns: - Dict[str, Union[str, bool]]: A dict from property names to values + Dict[str, Union[str, bool]]: + A dict from property names to values """ return { 'shortname': 'SimpleImputer', From 60b919468c597998cd0b41c067c9ba9a59c13457 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 1 Dec 2021 11:12:41 +0100 Subject: [PATCH 4/7] Made changes, added test --- .../imputation/SimpleImputer.py | 27 ++++++++++--------- .../components/preprocessing/test_imputers.py | 12 +++++++++ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index 341964791..cde459aae 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -20,28 +20,29 @@ class SimpleImputer(BaseImputer): Note: In case of numpy data, the constant value is set to -1, under the assumption that categorical data is fit with an Ordinal Scaler. + + Attributes: + random_state (Optional[np.random.RandomState]): + The random state to use for the imputer. + numerical_strategy (str: default='mean'): + The strategy to use for imputing numerical columns. + Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!'] + categorical_strategy (str: default='most_frequent') + The strategy to use for imputing categorical columns. 
+ Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] """ def __init__( - self, random_state: Optional[Union[np.random.RandomState, int]] = None, + random_state: Optional[np.random.RandomState] = None, numerical_strategy: str = 'mean', categorical_strategy: str = 'most_frequent' ): """ Note: - Using 'constant' defaults to fill_value of 0 where 'constant_!missing!' - uses a fill_value of -1. This behaviour should probably be fixed. - - Args: - random_state (Optional[Union[np.random.RandomState, int]]): - The random state to use for the imputer. - numerical_strategy (str: default='mean'): - The strategy to use for imputing numerical columns. - Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!'] - categorical_strategy (str: default='most_frequent') - The strategy to use for imputing categorical columns. - Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] + 'constant' as numerical_strategy uses 0 as the default fill_value while + 'constant_!missing!' uses a fill_value of -1. + This behaviour should probably be fixed. """ super().__init__() self.random_state = random_state diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py index 983737dfe..7fb71282f 100644 --- a/test/test_pipeline/components/preprocessing/test_imputers.py +++ b/test/test_pipeline/components/preprocessing/test_imputers.py @@ -1,4 +1,5 @@ import unittest +import pytest import numpy as np from numpy.testing import assert_array_equal @@ -213,6 +214,17 @@ def test_constant_imputation(self): [7.0, '0', 9], [4.0, '0', '0']], dtype=str)) + def test_imputation_without_dataset_properties_raises_error(self): + """Tests SimpleImputer checks for dataset properties when querying for + HyperparameterSearchSpace, even though the arg is marked `Optional`. 
+ + Expects: + * Should raise a ValueError that no dataset_properties were passed + """ + with pytest.raises(ValueError): + SimpleImputer.get_hyperparameter_search_space() + + if __name__ == '__main__': unittest.main() From 7a3e792481a179073ef17a6063e180d0c7250e52 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 1 Dec 2021 11:22:30 +0100 Subject: [PATCH 5/7] Fixed init statement --- .../tabular_preprocessing/imputation/SimpleImputer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index cde459aae..9b118cb61 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -33,7 +33,7 @@ class SimpleImputer(BaseImputer): """ def __init__( - random_state: Optional[Union[np.random.RandomState, int]] = None, + self, random_state: Optional[np.random.RandomState] = None, numerical_strategy: str = 'mean', categorical_strategy: str = 'most_frequent' From 549060415a63064b805596fc72e885821e43274f Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 1 Dec 2021 12:12:52 +0100 Subject: [PATCH 6/7] Fixed docs --- .../tabular_preprocessing/imputation/SimpleImputer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index 9b118cb61..3d7ca22b1 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -26,7 +26,7 @@ class SimpleImputer(BaseImputer): The random 
state to use for the imputer. numerical_strategy (str: default='mean'): The strategy to use for imputing numerical columns. - Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!'] + Can be one of ['most_frequent', 'constant_!missing!'] categorical_strategy (str: default='most_frequent') The strategy to use for imputing categorical columns. Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] From e790e71f2b0eb2ce24b88d6f97dd829efb562fe0 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 1 Dec 2021 12:14:56 +0100 Subject: [PATCH 7/7] Flake'd --- test/test_pipeline/components/preprocessing/test_imputers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py index 7fb71282f..18b43bfa6 100644 --- a/test/test_pipeline/components/preprocessing/test_imputers.py +++ b/test/test_pipeline/components/preprocessing/test_imputers.py @@ -1,9 +1,10 @@ import unittest -import pytest import numpy as np from numpy.testing import assert_array_equal +import pytest + from sklearn.base import BaseEstimator, clone from sklearn.compose import make_column_transformer @@ -225,6 +226,5 @@ def test_imputation_without_dataset_properties_raises_error(self): SimpleImputer.get_hyperparameter_search_space() - if __name__ == '__main__': unittest.main()