From d3df40fa47a2c0d1c93c20fc377aee35c67a5668 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Wed, 2 Feb 2022 20:22:42 +0100
Subject: [PATCH 1/3] add variance thresholding

---
 .../VarianceThreshold.py                    | 41 ++++++++++++++++
 .../variance_thresholding/__init__.py       |  0
 .../pipeline/tabular_classification.py      |  3 ++
 autoPyTorch/pipeline/tabular_regression.py  |  3 ++
 .../components/preprocessing/base.py        |  3 ++
 .../test_variance_thresholding.py           | 48 +++++++++++++++++++
 6 files changed, 98 insertions(+)
 create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
 create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py
 create mode 100644 test/test_pipeline/components/preprocessing/test_variance_thresholding.py

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
new file mode 100644
index 000000000..21d1a5667
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
@@ -0,0 +1,41 @@
+from typing import Any, Dict, Optional, Union
+import numpy as np
+from sklearn.feature_selection import VarianceThreshold as SklearnVarianceThreshold
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
+    autoPyTorchTabularPreprocessingComponent
+
+
+class VarianceThreshold(autoPyTorchTabularPreprocessingComponent):
+    """
+    Removes features that have the same value in the training data.
+    """
+    def __init__(self, random_state: Optional[np.random.RandomState] = None):
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'VarianceThreshold':
+
+        self.check_requirements(X, y)
+
+        self.preprocessor['numerical'] = SklearnVarianceThreshold(
+            threshold=0.0
+        )
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        if self.preprocessor['numerical'] is None:
+            raise ValueError("cant call transform on {} without fitting first."
+                             .format(self.__class__.__name__))
+        X.update({'variance_threshold': self.preprocessor})
+        return X
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    ) -> Dict[str, Union[str, bool]]:
+
+        return {
+            'shortname': 'Variance Threshold',
+            'name': 'Variance Threshold (constant feature removal)',
+            'handles_sparse': True,
+        }
\ No newline at end of file
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py
index b95de512e..92dc764bb 100644
--- a/autoPyTorch/pipeline/tabular_classification.py
+++ b/autoPyTorch/pipeline/tabular_classification.py
@@ -27,6 +27,8 @@
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
 from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice
 from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent
@@ -307,6 +309,7 @@ def _get_pipeline_steps(
 
         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py
index 57d0126d0..daee7f74a 100644
--- a/autoPyTorch/pipeline/tabular_regression.py
+++ b/autoPyTorch/pipeline/tabular_regression.py
@@ -27,6 +27,8 @@
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
 from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice
 from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent
@@ -257,6 +259,7 @@ def _get_pipeline_steps(
 
         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py
index ac16e286a..35f6ed271 100644
--- a/test/test_pipeline/components/preprocessing/base.py
+++ b/test/test_pipeline/components/preprocessing/base.py
@@ -6,6 +6,8 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 
 from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
 
@@ -28,6 +30,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]],
 
         steps.extend([
             ("imputer", SimpleImputer()),
+            ("variance_threshold", VarianceThreshold()),
             ("encoder", EncoderChoice(default_dataset_properties)),
             ("scaler", ScalerChoice(default_dataset_properties)),
             ("tabular_transformer", TabularColumnTransformer()),
diff --git a/test/test_pipeline/components/preprocessing/test_variance_thresholding.py b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
new file mode 100644
index 000000000..3b9fa3d3f
--- /dev/null
+++ b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
@@ -0,0 +1,48 @@
+import numpy as np
+from numpy.testing import assert_array_equal
+
+
+from sklearn.base import BaseEstimator
+from sklearn.compose import make_column_transformer
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding.VarianceThreshold import VarianceThreshold
+
+
+def test_variance_threshold():
+    data = np.array([[1, 2, 1],
+                     [7, 8, 9],
+                     [4, 5, 1],
+                     [11, 12, 1],
+                     [17, 18, 19],
+                     [14, 15, 16]])
+    numerical_columns = [0, 1, 2]
+    train_indices = np.array([0, 2, 3])
+    test_indices = np.array([1, 4, 5])
+    dataset_properties = {
+        'categorical_columns': [],
+        'numerical_columns': numerical_columns,
+    }
+    X = {
+        'X_train': data[train_indices],
+        'dataset_properties': dataset_properties
+    }
+    component = VarianceThreshold()
+
+    component = component.fit(X)
+    X = component.transform(X)
+    variance_threshold = X['variance_threshold']['numerical']
+
+    # check if the fit dictionary X is modified as expected
+    assert isinstance(X['variance_threshold'], dict)
+    assert isinstance(variance_threshold, BaseEstimator)
+
+    # make column transformer with returned encoder to fit on data
+    column_transformer = make_column_transformer((variance_threshold,
+                                                 X['dataset_properties']['numerical_columns']),
+                                                 remainder='passthrough')
+    column_transformer = column_transformer.fit(X['X_train'])
+    transformed = column_transformer.transform(data[test_indices])
+
+    assert_array_equal(transformed, np.array([[7, 8],
+                                              [17, 18],
+                                              [14, 15]]))

From 8955996b956e06f7e795f8132c50182cb6002fc1 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Thu, 3 Feb 2022 11:21:28 +0100
Subject: [PATCH 2/3] fix flake and mypy

---
 .../variance_thresholding/VarianceThreshold.py   | 9 ++++++---
 .../preprocessing/test_variance_thresholding.py  | 7 ++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
index 21d1a5667..e4abd2e58 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
@@ -1,8 +1,10 @@
 from typing import Any, Dict, Optional, Union
+
 import numpy as np
+
 from sklearn.feature_selection import VarianceThreshold as SklearnVarianceThreshold
-from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
     autoPyTorchTabularPreprocessingComponent
 
@@ -31,11 +33,12 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         return X
 
     @staticmethod
-    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    def get_properties(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
     ) -> Dict[str, Union[str, bool]]:
 
         return {
             'shortname': 'Variance Threshold',
             'name': 'Variance Threshold (constant feature removal)',
             'handles_sparse': True,
-        }
\ No newline at end of file
+        }
diff --git a/test/test_pipeline/components/preprocessing/test_variance_thresholding.py b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
index 3b9fa3d3f..3f22835b3 100644
--- a/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
+++ b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
@@ -5,7 +5,8 @@
 from sklearn.base import BaseEstimator
 from sklearn.compose import make_column_transformer
 
-from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding.VarianceThreshold import VarianceThreshold
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 
 
 def test_variance_threshold():
@@ -38,8 +39,8 @@ def test_variance_threshold():
 
     # make column transformer with returned encoder to fit on data
     column_transformer = make_column_transformer((variance_threshold,
-                                                 X['dataset_properties']['numerical_columns']),
-                                                 remainder='passthrough')
+                                                  X['dataset_properties']['numerical_columns']),
+                                                  remainder='passthrough')
     column_transformer = column_transformer.fit(X['X_train'])
     transformed = column_transformer.transform(data[test_indices])
 

From 06ed8965dc1a189efed29f4f4b5b8f670870e225 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com>
Date: Tue, 8 Feb 2022 20:36:37 +0100
Subject: [PATCH 3/3] Apply suggestions from code review

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
---
 .../variance_thresholding/VarianceThreshold.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
index e4abd2e58..e5e71ea1e 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
@@ -27,7 +27,7 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'VarianceThreshold'
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         if self.preprocessor['numerical'] is None:
-            raise ValueError("cant call transform on {} without fitting first."
+            raise ValueError("cannot call transform on {} without fitting first."
                              .format(self.__class__.__name__))
         X.update({'variance_threshold': self.preprocessor})
         return X
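
Note on the behaviour being added (illustration only, not part of the patch series): the new component wraps sklearn.feature_selection.VarianceThreshold with threshold=0.0, so exactly the features that are constant on the fitted data are removed. Below is a minimal standalone sketch in plain scikit-learn, reusing the toy fixture from test_variance_threshold above; only the array values come from that test, everything else is standard sklearn API. The third column is constant on the training split, so it is dropped from every later transform, even on rows where it does vary.

    import numpy as np
    from sklearn.feature_selection import VarianceThreshold

    # training split from the test above: column 2 is constant (all ones)
    X_train = np.array([[1, 2, 1],
                        [4, 5, 1],
                        [11, 12, 1]])
    # held-out rows: column 2 varies here, but the selection was decided at fit time
    X_test = np.array([[7, 8, 9],
                       [17, 18, 19],
                       [14, 15, 16]])

    selector = VarianceThreshold(threshold=0.0)  # drop zero-variance features only
    selector.fit(X_train)

    print(selector.get_support())      # [ True  True False] -> column 2 is removed
    print(selector.transform(X_test))  # [[ 7  8]
                                       #  [17 18]
                                       #  [14 15]]

In the pipeline itself, fit() only instantiates the sklearn selector and stores it under self.preprocessor['numerical'], and transform() exposes it through X['variance_threshold'] in the fit dictionary; the selector is actually fitted and applied later, when the stored preprocessors are assembled into a column transformer (the test mimics this with make_column_transformer, and the "tabular_transformer" step in test/base.py suggests the TabularColumnTransformer plays that role in the real pipeline).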