From 61b1a2980a06c97436ed6b96c7dbf2277c38f19e Mon Sep 17 00:00:00 2001
From: eddiebergman <eddiebergmanhs@gmail.com>
Date: Tue, 30 Nov 2021 11:38:31 +0100
Subject: [PATCH 1/7] cleanup of simple_imputer

---
 .../imputation/SimpleImputer.py               | 170 ++++++++++++------
 1 file changed, 115 insertions(+), 55 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
index ea09798ce..d0a05bc9b 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
@@ -1,9 +1,7 @@
 from typing import Any, Dict, List, Optional, Union
 
 from ConfigSpace.configuration_space import ConfigurationSpace
-from ConfigSpace.hyperparameters import (
-    CategoricalHyperparameter
-)
+from ConfigSpace.hyperparameters import CategoricalHyperparameter
 
 import numpy as np
 
@@ -15,92 +13,154 @@
 
 
 class SimpleImputer(BaseImputer):
+    """An imputer for categorical and numerical columns
+
+    Impute missing values for categorical columns with 'constant_!missing!'
+
+    Note:
+        In case of numpy data, the constant value is set to -1, under the assumption
+        that categorical data is fit with an Ordinal Scaler.
     """
-    Impute missing values for categorical columns with '!missing!'
-    (In case of numpy data, the constant value is set to -1, under
-    the assumption that categorical data is fit with an Ordinal Scaler)
-    """
 
-    def __init__(self,
-                 random_state: Optional[Union[np.random.RandomState, int]] = None,
-                 numerical_strategy: str = 'mean',
-                 categorical_strategy: str = 'most_frequent'):
+    def __init__(
+        self,
+        random_state: Optional[Union[np.random.RandomState, int]] = None,
+        numerical_strategy: str = 'mean',
+        categorical_strategy: str = 'most_frequent'
+    ):
+        """
+        Parameters
+        ----------
+        random_state: Optional[Union[np.random.RandomState, int]] = None
+            The random state to use for the imputer
+
+        numerical_strategy: str = 'mean',
+            The strategy to use for imputing numerical columns.
+            Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!']
+
+            Note:
+                Using 'constant' defaults to fill_value of 0 where 'constant_!missing!'
+                uses a fill_value of -1. This behaviour should probably be fixed.
+
+        categorical_strategy: str = 'most_frequent'
+            The strategy to use for imputing categorical columns.
+            Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
+        """
         super().__init__()
         self.random_state = random_state
         self.numerical_strategy = numerical_strategy
         self.categorical_strategy = categorical_strategy
 
-    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer:
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
         """
         The fit function calls the fit function of the underlying model
         and returns the transformed array.
-        Args:
-            X (np.ndarray): input features
-            y (Optional[np.ndarray]): input labels
 
-        Returns:
-            instance of self
+        Parameters
+        ----------
+        X: np.ndarray
+            The input features to fit on
+
+        y: Optional[np.ndarray]
+            The labels for the input features `X`
+
+        Returns
+        -------
+        SimpleImputer
+            returns self
         """
         self.check_requirements(X, y)
-        categorical_columns = X['dataset_properties']['categorical_columns'] \
-            if isinstance(X['dataset_properties']['categorical_columns'], List) else []
-        if len(categorical_columns) != 0:
+
+        # Choose an imputer for any categorical columns
+        categorical_columns = X['dataset_properties']['categorical_columns']
+
+        if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
             if self.categorical_strategy == 'constant_!missing!':
-                self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant',
-                                                                        # Train data is numpy
-                                                                        # as of this point, where
-                                                                        # Ordinal Encoding is using
-                                                                        # for categorical. Only
-                                                                        # Numbers are allowed
-                                                                        # fill_value='!missing!',
-                                                                        fill_value=-1,
-                                                                        copy=False)
+                # Train data is numpy as of this point, where an Ordinal Encoding is used
+                # for categoricals. Only Numbers are allowed for `fill_value`
+                imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
+                self.preprocessor['categorical'] = imputer
             else:
-                self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy,
-                                                                        copy=False)
-        numerical_columns = X['dataset_properties']['numerical_columns'] \
-            if isinstance(X['dataset_properties']['numerical_columns'], List) else []
-        if len(numerical_columns) != 0:
+                imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
+                self.preprocessor['categorical'] = imputer
+
+        # Choose an imputer for any numerical columns
+        numerical_columns = X['dataset_properties']['numerical_columns']
+
+        if isinstance(numerical_columns, List) and len(numerical_columns) > 0:
             if self.numerical_strategy == 'constant_zero':
-                self.preprocessor['numerical'] = SklearnSimpleImputer(strategy='constant',
-                                                                      fill_value=0,
-                                                                      copy=False)
+                imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False)
+                self.preprocessor['numerical'] = imputer
             else:
-                self.preprocessor['numerical'] = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False)
+                imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False)
+                self.preprocessor['numerical'] = imputer
 
         return self
 
     @staticmethod
     def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
-        numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='numerical_strategy',
-                                                                                  value_range=("mean", "median",
-                                                                                               "most_frequent",
-                                                                                               "constant_zero"),
-                                                                                  default_value="mean",
-                                                                                  ),
+        numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter='numerical_strategy',
+            value_range=("mean", "median", "most_frequet", "constant_zero"),
+            default_value="mean",
+        ),
         categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter='categorical_strategy',
-            value_range=("most_frequent",
-                         "constant_!missing!"),
-            default_value="most_frequent")
+            value_range=("most_frequent", "constant_!missing!"),
+            default_value="most_frequent"
+        )
     ) -> ConfigurationSpace:
+        """Get the hyperparameter search space for the SimpleImputer
+
+        Parameters
+        ----------
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+            Properties that describe the dataset
+
+        numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(...)
+            The strategy to use for numerical imputation
+
+        caterogical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(...)
+            The strategy to use for categorical imputation
+
+        Returns
+        -------
+        ConfigurationSpace
+            The space of possible configurations for a SimpleImputer with the given
+            `dataset_properties`
+        """
         cs = ConfigurationSpace()
-        assert dataset_properties is not None, "To create hyperparameter search space" \
-                                               ", dataset_properties should not be None"
-        if len(dataset_properties['numerical_columns']) \
-                if isinstance(dataset_properties['numerical_columns'], List) else 0 != 0:
+
+        if dataset_properties is None:
+            raise ValueError("SimpleImputer requires `dataset_properties` for generating"
+                             " a search space.")
+
+        if (
+            isinstance(dataset_properties['numerical_columns'], List)
+            and len(dataset_properties['numerical_columns']) != 0
+        ):
             add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)
 
-        if len(dataset_properties['categorical_columns']) \
-                if isinstance(dataset_properties['categorical_columns'], List) else 0 != 0:
+        if (
+            isinstance(dataset_properties['categorical_columns'], List)
+            and len(dataset_properties['categorical_columns'])
+        ):
             add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)
 
         return cs
 
     @staticmethod
-    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
-                       ) -> Dict[str, Union[str, bool]]:
+    def get_properties(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    ) -> Dict[str, Union[str, bool]]:
+        """Get the properties of the SimpleImputer class and what it can handle
+
+        Returns
+        -------
+        Dict[str, Union[str, bool]]
+            A dict from property names to values
+        """
         return {
             'shortname': 'SimpleImputer',
             'name': 'Simple Imputer',

From bbabad8f0f84a1e5cad4051a8ad44b646f38b8a3 Mon Sep 17 00:00:00 2001
From: eddiebergman <eddiebergmanhs@gmail.com>
Date: Tue, 30 Nov 2021 12:47:18 +0100
Subject: [PATCH 2/7] Fixed doc and typo

---
 .../imputation/SimpleImputer.py               | 88 ++++++++-----------
 1 file changed, 37 insertions(+), 51 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
index d0a05bc9b..a13246fae 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
@@ -29,22 +29,19 @@ def __init__(
         categorical_strategy: str = 'most_frequent'
     ):
         """
-        Parameters
-        ----------
-        random_state: Optional[Union[np.random.RandomState, int]] = None
-            The random state to use for the imputer
-
-        numerical_strategy: str = 'mean',
-            The strategy to use for imputing numerical columns.
-            Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!']
-
-            Note:
-                Using 'constant' defaults to fill_value of 0 where 'constant_!missing!'
-                uses a fill_value of -1. This behaviour should probably be fixed.
-
-        categorical_strategy: str = 'most_frequent'
-            The strategy to use for imputing categorical columns.
-            Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
+        Note:
+            Using 'constant' defaults to fill_value of 0 where 'constant_!missing!'
+            uses a fill_value of -1. This behaviour should probably be fixed.
+
+        Args:
+            random_state (Optional[Union[np.random.RandomState, int]]):
+                The random state to use for the imputer.
+            numerical_strategy (str: default='mean'):
+                The strategy to use for imputing numerical columns.
+                Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!']
+            categorical_strategy (str: default='most_frequent')
+                The strategy to use for imputing categorical columns.
+                Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
         """
         super().__init__()
         self.random_state = random_state
@@ -52,22 +49,16 @@ def __init__(
         self.categorical_strategy = categorical_strategy
 
     def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
-        """
-        The fit function calls the fit function of the underlying model
-        and returns the transformed array.
+        """Chooses the imputers for the categorical and numerical columns and returns self.
 
-        Parameters
-        ----------
-        X: np.ndarray
-            The input features to fit on
+        Args:
+            X (Dict[str, Any]):
+                The fit dictionary, its 'dataset_properties' entry lists the columns to impute
+            y (Optional[np.ndarray]):
+                The labels for the input features `X`
 
-        y: Optional[np.ndarray]
-            The labels for the input features `X`
-
-        Returns
-        -------
-        SimpleImputer
-            returns self
+        Returns:
+            SimpleImputer: returns the object itself
         """
         self.check_requirements(X, y)
 
@@ -102,7 +93,7 @@ def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
         numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter='numerical_strategy',
-            value_range=("mean", "median", "most_frequet", "constant_zero"),
+            value_range=("mean", "median", "most_frequent", "constant_zero"),
             default_value="mean",
         ),
         categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
@@ -113,22 +104,19 @@ def get_hyperparameter_search_space(
     ) -> ConfigurationSpace:
         """Get the hyperparameter search space for the SimpleImputer
 
-        Parameters
-        ----------
-        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
-            Properties that describe the dataset
-
-        numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(...)
-            The strategy to use for numerical imputation
-
-        caterogical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(...)
-            The strategy to use for categorical imputation
-
-        Returns
-        -------
-        ConfigurationSpace
-            The space of possible configurations for a SimpleImputer with the given
-            `dataset_properties`
+        Args:
+            dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]):
+                Properties that describe the dataset
+                Note: Not actually Optional, just adhering to its supertype
+            numerical_strategy (HyperparameterSearchSpace: default = ...):
+                The strategy to use for numerical imputation
+            categorical_strategy (HyperparameterSearchSpace: default = ...):
+                The strategy to use for categorical imputation
+
+        Returns:
+            ConfigurationSpace:
+                The space of possible configurations for a SimpleImputer with the given
+                `dataset_properties`
         """
         cs = ConfigurationSpace()
 
@@ -156,10 +144,8 @@ def get_properties(
     ) -> Dict[str, Union[str, bool]]:
         """Get the properties of the SimpleImputer class and what it can handle
 
-        Returns
-        -------
-        Dict[str, Union[str, bool]]
-            A dict from property names to values
+        Returns:
+            Dict[str, Union[str, bool]]: A dict from property names to values
         """
         return {
             'shortname': 'SimpleImputer',

From c92d03913e11f381bd9528dc6a61e7d38a54d3c4 Mon Sep 17 00:00:00 2001
From: eddiebergman <eddiebergmanhs@gmail.com>
Date: Tue, 30 Nov 2021 14:28:02 +0100
Subject: [PATCH 3/7] Fixed docs

---
 .../tabular_preprocessing/imputation/SimpleImputer.py       | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
index a13246fae..341964791 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
@@ -58,7 +58,8 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
                 The labels for the input features `X`
 
         Returns:
-            SimpleImputer: returns the object itself
+            SimpleImputer:
+                returns self
         """
         self.check_requirements(X, y)
 
@@ -145,7 +146,8 @@ def get_properties(
         """Get the properties of the SimpleImputer class and what it can handle
 
         Returns:
-            Dict[str, Union[str, bool]]: A dict from property names to values
+            Dict[str, Union[str, bool]]:
+                A dict from property names to values
         """
         return {
             'shortname': 'SimpleImputer',

From 60b919468c597998cd0b41c067c9ba9a59c13457 Mon Sep 17 00:00:00 2001
From: eddiebergman <eddiebergmanhs@gmail.com>
Date: Wed, 1 Dec 2021 11:12:41 +0100
Subject: [PATCH 4/7] Made changes, added test

---
 .../imputation/SimpleImputer.py               | 27 ++++++++++---------
 .../components/preprocessing/test_imputers.py | 12 +++++++++
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
index 341964791..cde459aae 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
@@ -20,28 +20,29 @@ class SimpleImputer(BaseImputer):
     Note:
         In case of numpy data, the constant value is set to -1, under the assumption
         that categorical data is fit with an Ordinal Scaler.
+
+    Attributes:
+        random_state (Optional[np.random.RandomState]):
+            The random state to use for the imputer.
+        numerical_strategy (str: default='mean'):
+            The strategy to use for imputing numerical columns.
+            Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!']
+        categorical_strategy (str: default='most_frequent')
+            The strategy to use for imputing categorical columns.
+            Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
     """
 
     def __init__(
-        self,
         random_state: Optional[Union[np.random.RandomState, int]] = None,
+        random_state: Optional[np.random.RandomState] = None,
         numerical_strategy: str = 'mean',
         categorical_strategy: str = 'most_frequent'
     ):
         """
         Note:
-            Using 'constant' defaults to fill_value of 0 where 'constant_!missing!'
-            uses a fill_value of -1. This behaviour should probably be fixed.
-
-        Args:
-            random_state (Optional[Union[np.random.RandomState, int]]):
-                The random state to use for the imputer.
-            numerical_strategy (str: default='mean'):
-                The strategy to use for imputing numerical columns.
-                Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!']
-            categorical_strategy (str: default='most_frequent')
-                The strategy to use for imputing categorical columns.
-                Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
+            'constant' as numerical_strategy uses 0 as the default fill_value while
+            'constant_!missing!' uses a fill_value of -1.
+            This behaviour should probably be fixed.
         """
         super().__init__()
         self.random_state = random_state
diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py
index 983737dfe..7fb71282f 100644
--- a/test/test_pipeline/components/preprocessing/test_imputers.py
+++ b/test/test_pipeline/components/preprocessing/test_imputers.py
@@ -1,4 +1,5 @@
 import unittest
+import pytest
 
 import numpy as np
 from numpy.testing import assert_array_equal
@@ -213,6 +214,17 @@ def test_constant_imputation(self):
                                                              [7.0, '0', 9],
                                                              [4.0, '0', '0']], dtype=str))
 
+    def test_imputation_without_dataset_properties_raises_error(self):
+        """Tests SimpleImputer checks for dataset properties when querying for
+        HyperparameterSearchSpace, even though the arg is marked `Optional`.
+
+        Expects:
+            * Should raise a ValueError that no dataset_properties were passed
+        """
+        with pytest.raises(ValueError):
+            SimpleImputer.get_hyperparameter_search_space()
+
+
 
 if __name__ == '__main__':
     unittest.main()

From 7a3e792481a179073ef17a6063e180d0c7250e52 Mon Sep 17 00:00:00 2001
From: eddiebergman <eddiebergmanhs@gmail.com>
Date: Wed, 1 Dec 2021 11:22:30 +0100
Subject: [PATCH 5/7] Fixed init statement

---
 .../tabular_preprocessing/imputation/SimpleImputer.py           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
index cde459aae..9b118cb61 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
@@ -33,7 +33,7 @@ class SimpleImputer(BaseImputer):
     """
 
     def __init__(
-        random_state: Optional[Union[np.random.RandomState, int]] = None,
+        self,
         random_state: Optional[np.random.RandomState] = None,
         numerical_strategy: str = 'mean',
         categorical_strategy: str = 'most_frequent'

From 549060415a63064b805596fc72e885821e43274f Mon Sep 17 00:00:00 2001
From: eddiebergman <eddiebergmanhs@gmail.com>
Date: Wed, 1 Dec 2021 12:12:52 +0100
Subject: [PATCH 6/7] Fixed docs

---
 .../tabular_preprocessing/imputation/SimpleImputer.py           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
index 9b118cb61..3d7ca22b1 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
@@ -26,7 +26,7 @@ class SimpleImputer(BaseImputer):
             The random state to use for the imputer.
         numerical_strategy (str: default='mean'):
             The strategy to use for imputing numerical columns.
-            Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_!missing!']
+            Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
         categorical_strategy (str: default='most_frequent')
             The strategy to use for imputing categorical columns.
             Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']

From e790e71f2b0eb2ce24b88d6f97dd829efb562fe0 Mon Sep 17 00:00:00 2001
From: eddiebergman <eddiebergmanhs@gmail.com>
Date: Wed, 1 Dec 2021 12:14:56 +0100
Subject: [PATCH 7/7] Flake'd

---
 test/test_pipeline/components/preprocessing/test_imputers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py
index 7fb71282f..18b43bfa6 100644
--- a/test/test_pipeline/components/preprocessing/test_imputers.py
+++ b/test/test_pipeline/components/preprocessing/test_imputers.py
@@ -1,9 +1,10 @@
 import unittest
-import pytest
 
 import numpy as np
 from numpy.testing import assert_array_equal
 
+import pytest
+
 from sklearn.base import BaseEstimator, clone
 from sklearn.compose import make_column_transformer
 
@@ -225,6 +226,5 @@ def test_imputation_without_dataset_properties_raises_error(self):
             SimpleImputer.get_hyperparameter_search_space()
 
 
-
 if __name__ == '__main__':
     unittest.main()