From d3df40fa47a2c0d1c93c20fc377aee35c67a5668 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Wed, 2 Feb 2022 20:22:42 +0100
Subject: [PATCH 1/3] add variance thresholding

---
 .../VarianceThreshold.py                    | 41 ++++++++++++++++
 .../variance_thresholding/__init__.py       |  0
 .../pipeline/tabular_classification.py      |  3 ++
 autoPyTorch/pipeline/tabular_regression.py  |  3 ++
 .../components/preprocessing/base.py        |  3 ++
 .../test_variance_thresholding.py           | 48 +++++++++++++++++++
 6 files changed, 98 insertions(+)
 create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
 create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py
 create mode 100644 test/test_pipeline/components/preprocessing/test_variance_thresholding.py

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
new file mode 100644
index 000000000..21d1a5667
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
@@ -0,0 +1,41 @@
+from typing import Any, Dict, Optional, Union
+import numpy as np
+from sklearn.feature_selection import VarianceThreshold as SklearnVarianceThreshold
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
+    autoPyTorchTabularPreprocessingComponent
+
+
+class VarianceThreshold(autoPyTorchTabularPreprocessingComponent):
+    """
+    Removes features that have the same value in the training data.
+    """
+    def __init__(self, random_state: Optional[np.random.RandomState] = None):
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'VarianceThreshold':
+
+        self.check_requirements(X, y)
+
+        self.preprocessor['numerical'] = SklearnVarianceThreshold(
+            threshold=0.0
+        )
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        if self.preprocessor['numerical'] is None:
+            raise ValueError("cant call transform on {} without fitting first."
+                             .format(self.__class__.__name__))
+        X.update({'variance_threshold': self.preprocessor})
+        return X
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    ) -> Dict[str, Union[str, bool]]:
+
+        return {
+            'shortname': 'Variance Threshold',
+            'name': 'Variance Threshold (constant feature removal)',
+            'handles_sparse': True,
+        }
\ No newline at end of file
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py
index b95de512e..92dc764bb 100644
--- a/autoPyTorch/pipeline/tabular_classification.py
+++ b/autoPyTorch/pipeline/tabular_classification.py
@@ -27,6 +27,8 @@
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
 from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice
 from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent
@@ -307,6 +309,7 @@ def _get_pipeline_steps(
 
         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py
index 57d0126d0..daee7f74a 100644
--- a/autoPyTorch/pipeline/tabular_regression.py
+++ b/autoPyTorch/pipeline/tabular_regression.py
@@ -27,6 +27,8 @@
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
 from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice
 from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent
@@ -257,6 +259,7 @@ def _get_pipeline_steps(
 
         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py
index ac16e286a..35f6ed271 100644
--- a/test/test_pipeline/components/preprocessing/base.py
+++ b/test/test_pipeline/components/preprocessing/base.py
@@ -6,6 +6,8 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 
 from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
 
@@ -28,6 +30,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]],
 
         steps.extend([
             ("imputer", SimpleImputer()),
+            ("variance_threshold", VarianceThreshold()),
             ("encoder", EncoderChoice(default_dataset_properties)),
             ("scaler", ScalerChoice(default_dataset_properties)),
             ("tabular_transformer", TabularColumnTransformer()),
diff --git a/test/test_pipeline/components/preprocessing/test_variance_thresholding.py b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
new file mode 100644
index 000000000..3b9fa3d3f
--- /dev/null
+++ b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
@@ -0,0 +1,48 @@
+import numpy as np
+from numpy.testing import assert_array_equal
+
+
+from sklearn.base import BaseEstimator
+from sklearn.compose import make_column_transformer
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding.VarianceThreshold import VarianceThreshold
+
+
+def test_variance_threshold():
+    data = np.array([[1, 2, 1],
+                     [7, 8, 9],
+                     [4, 5, 1],
+                     [11, 12, 1],
+                     [17, 18, 19],
+                     [14, 15, 16]])
+    numerical_columns = [0, 1, 2]
+    train_indices = np.array([0, 2, 3])
+    test_indices = np.array([1, 4, 5])
+    dataset_properties = {
+        'categorical_columns': [],
+        'numerical_columns': numerical_columns,
+    }
+    X = {
+        'X_train': data[train_indices],
+        'dataset_properties': dataset_properties
+    }
+    component = VarianceThreshold()
+
+    component = component.fit(X)
+    X = component.transform(X)
+    variance_threshold = X['variance_threshold']['numerical']
+
+    # check if the fit dictionary X is modified as expected
+    assert isinstance(X['variance_threshold'], dict)
+    assert isinstance(variance_threshold, BaseEstimator)
+
+    # make column transformer with returned encoder to fit on data
+    column_transformer = make_column_transformer((variance_threshold,
+                                                 X['dataset_properties']['numerical_columns']),
+                                                 remainder='passthrough')
+    column_transformer = column_transformer.fit(X['X_train'])
+    transformed = column_transformer.transform(data[test_indices])
+
+    assert_array_equal(transformed, np.array([[7, 8],
+                                              [17, 18],
+                                              [14, 15]]))

From 8955996b956e06f7e795f8132c50182cb6002fc1 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Thu, 3 Feb 2022 11:21:28 +0100
Subject: [PATCH 2/3] fix flake and mypy

---
 .../variance_thresholding/VarianceThreshold.py   | 9 ++++++---
 .../preprocessing/test_variance_thresholding.py  | 7 ++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
index 21d1a5667..e4abd2e58 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
@@ -1,8 +1,10 @@
 from typing import Any, Dict, Optional, Union
+
 import numpy as np
+
 from sklearn.feature_selection import VarianceThreshold as SklearnVarianceThreshold
-from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
     autoPyTorchTabularPreprocessingComponent
 
@@ -31,11 +33,12 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         return X
 
     @staticmethod
-    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    def get_properties(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
     ) -> Dict[str, Union[str, bool]]:
 
         return {
             'shortname': 'Variance Threshold',
             'name': 'Variance Threshold (constant feature removal)',
             'handles_sparse': True,
-        }
\ No newline at end of file
+        }
diff --git a/test/test_pipeline/components/preprocessing/test_variance_thresholding.py b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
index 3b9fa3d3f..3f22835b3 100644
--- a/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
+++ b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
@@ -5,7 +5,8 @@
 from sklearn.base import BaseEstimator
 from sklearn.compose import make_column_transformer
 
-from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding.VarianceThreshold import VarianceThreshold
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 
 
 def test_variance_threshold():
@@ -38,8 +39,8 @@ def test_variance_threshold():
 
     # make column transformer with returned encoder to fit on data
     column_transformer = make_column_transformer((variance_threshold,
-                                                 X['dataset_properties']['numerical_columns']),
-                                                 remainder='passthrough')
+                                                  X['dataset_properties']['numerical_columns']),
+                                                  remainder='passthrough')
     column_transformer = column_transformer.fit(X['X_train'])
     transformed = column_transformer.transform(data[test_indices])
 

From 06ed8965dc1a189efed29f4f4b5b8f670870e225 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com>
Date: Tue, 8 Feb 2022 20:36:37 +0100
Subject: [PATCH 3/3] Apply suggestions from code review

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
---
 .../variance_thresholding/VarianceThreshold.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
index e4abd2e58..e5e71ea1e 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
@@ -27,7 +27,7 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'VarianceThreshold'
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         if self.preprocessor['numerical'] is None:
-            raise ValueError("cant call transform on {} without fitting first."
+            raise ValueError("cannot call transform on {} without fitting first."
                              .format(self.__class__.__name__))
         X.update({'variance_threshold': self.preprocessor})
         return X
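
Note on the behaviour being added (illustration only, not part of the patch series): the new component wraps sklearn.feature_selection.VarianceThreshold with threshold=0.0, so exactly the features that are constant on the fitted data are removed. Below is a minimal standalone sketch in plain scikit-learn, reusing the toy fixture from test_variance_threshold above; only the array values come from that test, everything else is standard sklearn API. The third column is constant on the training split, so it is dropped from every later transform, even on rows where it does vary.

    import numpy as np
    from sklearn.feature_selection import VarianceThreshold

    # training split from the test above: column 2 is constant (all ones)
    X_train = np.array([[1, 2, 1],
                        [4, 5, 1],
                        [11, 12, 1]])
    # held-out rows: column 2 varies here, but the selection was decided at fit time
    X_test = np.array([[7, 8, 9],
                       [17, 18, 19],
                       [14, 15, 16]])

    selector = VarianceThreshold(threshold=0.0)  # drop zero-variance features only
    selector.fit(X_train)

    print(selector.get_support())      # [ True  True False] -> column 2 is removed
    print(selector.transform(X_test))  # [[ 7  8]
                                       #  [17 18]
                                       #  [14 15]]

In the pipeline itself, fit() only instantiates the sklearn selector and stores it under self.preprocessor['numerical'], and transform() exposes it through X['variance_threshold'] in the fit dictionary; the selector is actually fitted and applied later, when the stored preprocessors are assembled into a column transformer (the test mimics this with make_column_transformer, and the "tabular_transformer" step in test/base.py suggests the TabularColumnTransformer plays that role in the real pipeline).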