[ADD] Minority Coalescer #242

Closed
wants to merge 10 commits into from
42 changes: 21 additions & 21 deletions autoPyTorch/api/base_task.py
@@ -1072,11 +1072,17 @@ def refit(
self
"""

self.dataset_name = dataset.dataset_name

if self._logger is None:
self._logger = self._get_logger(str(self.dataset_name))

if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
self._load_models()

# Refit is not applicable when ensemble_size is set to zero.
if self.ensemble_ is None:
raise ValueError("Refit can only be called if 'ensemble_size != 0'")

self.dataset_name = dataset.dataset_name
dataset_requirements = get_dataset_requirements(
info=dataset.get_required_dataset_info(),
include=self.include_components,
@@ -1085,26 +1091,20 @@ def refit(
dataset_properties = dataset.get_dataset_properties(dataset_requirements)
self._backend.save_datamanager(dataset)

X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
'backend': self._backend,
'X_train': dataset.train_tensors[0],
'y_train': dataset.train_tensors[1],
'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
'train_indices': dataset.splits[split_id][0],
'val_indices': dataset.splits[split_id][1],
'split_id': split_id,
'num_run': self._backend.get_next_num_run(),
})
X.update({**self.pipeline_options, **budget_config})
if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
self._load_models()

# Refit is not applicable when ensemble_size is set to zero.
if self.ensemble_ is None:
raise ValueError("Refit can only be called if 'ensemble_size != 0'")

for identifier in self.models_:
X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
'backend': self._backend,
'X_train': dataset.train_tensors[0].copy(),
'y_train': dataset.train_tensors[1].copy(),
'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
'train_indices': dataset.splits[split_id][0],
'val_indices': dataset.splits[split_id][1],
'split_id': split_id,
'num_run': self._backend.get_next_num_run(),
})
X.update({**self.pipeline_options, **budget_config})

model = self.models_[identifier]
# this updates the model inplace, it can then later be used in
# predict method
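Note on the refit change above: the fit dictionary X is now built inside the per-model loop, the sanity checks run before any work is done, and the train tensors are handed to each pipeline as copies. A minimal sketch of the aliasing hazard the copies avoid (illustrative only, not autoPyTorch code):

import numpy as np

def fit_inplace(data: np.ndarray) -> None:
    # stand-in for a pipeline step that normalizes its input in place
    data -= data.mean(axis=0)

X_train = np.array([[1.0, 2.0], [3.0, 4.0]])

fit_inplace(X_train)            # the first model mutates the shared buffer
print(X_train)                  # the original data is gone for the next model

fit_inplace(X_train.copy())     # a per-model copy keeps each refit isolated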
3 changes: 2 additions & 1 deletion autoPyTorch/configs/greedy_portfolio.json
@@ -1,5 +1,6 @@
[{"data_loader:batch_size": 60,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
@@ -506,4 +507,4 @@
"network_backbone:ShapedResNetBackbone:max_shake_drop_probability": 0.034431265307095615,
"network_head:fully_connected:activation": "relu",
"network_head:fully_connected:units_layer_1": 128,
"network_backbone:ShapedResNetBackbone:max_dropout": 0.6296079567189131}]
"network_backbone:ShapedResNetBackbone:max_dropout": 0.6296079567189131}]
New file: autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py
@@ -0,0 +1,52 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
UniformFloatHyperparameter,
)

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
from autoPyTorch.utils.implementations import MinorityCoalescing


class MinorityCoalescer(BaseCoalescer):
"""
Groups together categories of a categorical feature whose frequency
of occurrence is less than minimum_fraction
"""
def __init__(self, minimum_fraction: float, random_state: np.random.RandomState):
super().__init__()
self.minimum_fraction = minimum_fraction
Collaborator:
minimum_fraction -> min_fraction (convention)

Contributor (author):
I don't think we have a convention for this, and in this case having the complete word is clearer. If possible I would like to preserve it.

Collaborator:
PyTorch uses min rather than minimum; when it does use minimum, it is only for the element-wise minimum. But if you would like to stick with it, that is also fine.

self.random_state = random_state

def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:

self.check_requirements(X, y)

self.preprocessor['categorical'] = MinorityCoalescing(minimum_fraction=self.minimum_fraction)
return self

@staticmethod
def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
return {
'shortname': 'MinorityCoalescer',
'name': 'Minority Feature-class coalescer',
'handles_sparse': False
}

@staticmethod
def get_hyperparameter_search_space(
dataset_properties: Optional[Dict] = None,
minimum_fraction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="minimum_fraction",
value_range=(0.0001, 0.5),
default_value=0.01,
log=True),
) -> ConfigurationSpace:
cs = ConfigurationSpace()

add_hyperparameter(cs, minimum_fraction, UniformFloatHyperparameter)

return cs
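
The actual grouping is delegated to MinorityCoalescing from autoPyTorch.utils.implementations, which this diff does not show. A minimal numpy sketch of the underlying idea, assuming rare categories collapse into one reserved bucket:

import numpy as np

def coalesce_minorities(column: np.ndarray, minimum_fraction: float) -> np.ndarray:
    # map categories rarer than minimum_fraction to a reserved 'other' label (-1)
    values, counts = np.unique(column, return_counts=True)
    rare = values[counts / column.size < minimum_fraction]
    return np.where(np.isin(column, rare), -1, column)

col = np.array([0, 0, 0, 0, 0, 1, 1, 1, 2, 3])
print(coalesce_minorities(col, minimum_fraction=0.2))
# categories 2 and 3 (frequency 0.1 each) collapse into -1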
New file: autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py
@@ -0,0 +1,53 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


class NoCoalescer(BaseCoalescer):
"""
Don't perform NoCoalescer on categorical features
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a NoCoalescer class. This allows the BO model to enable/disable coalescing.

The choice object selects between MinorityCoalescer and NoCoalescer depending on what gives better performance.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean i did not get if you mean Do not perform NO coalescer or Do not perform coalescer.

"""
def __init__(self,
random_state: np.random.RandomState,
):
super().__init__()
self.random_state = random_state

def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer:
"""
As no coalescing happens, the input fit dictionary is unchanged.

Args:
X (Dict[str, Any]):
input fit dictionary
y (Optional[Any]):
Parameter to comply with scikit-learn API. Not used.

Returns:
instance of self
"""
self.check_requirements(X, y)

return self

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
"""
Add self into the 'X' dictionary and return the modified dict.
Args:
X (Dict[str, Any]): 'X' dictionary
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is X dictionary?
fit_dictionary? (Ravin says the fit_dictionary will deprecate soon, but do you have any idea when?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Scikit-learn supports passing a dictionary alongside the data. See here

It makes a lot of sense to use it instead of X as a fit_dictionary.

From all of the refactoring changes, this is to me the most important.

When** depends on when there is a contributor that wants to do this change :)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I did not get you, so please add your ideas to the doc-string as well?
Especially, I do not get why the meaning behind 'X' in the sentence.
But still it is a bit confusing for me.
Do you know why sklearn uses X for both fit_dictionary and feature_data?


Returns:
(Dict[str, Any]): the updated 'X' dictionary
"""
X.update({'coalescer': self.preprocessor})
return X

@staticmethod
def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
return {
'shortname': 'NoCoalescer',
'name': 'No Coalescer',
'handles_sparse': True
}
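
A hedged usage sketch of the fit/transform contract discussed above. The dataset_properties keys mirror the FitRequirements declared in BaseCoalescer; the import path and the exact set of required keys are assumptions:

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.NoCoalescer import NoCoalescer

X = {
    'dataset_properties': {
        'categorical_columns': [0],        # required FitRequirement
        'categories': [['a', 'b', 'c']],   # required FitRequirement
    },
}
component = NoCoalescer(random_state=np.random.RandomState(1))
X = component.fit(X).transform(X)
print('coalescer' in X)  # True: transform() registered the preprocessor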
New file: autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -0,0 +1,143 @@
import os
from collections import OrderedDict
from typing import Any, Dict, List, Optional

import ConfigSpace.hyperparameters as CSH
from ConfigSpace.configuration_space import ConfigurationSpace

from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
from autoPyTorch.pipeline.components.base_component import (
ThirdPartyComponents,
autoPyTorchComponent,
find_components,
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


coalescer_directory = os.path.split(__file__)[0]
_coalescer = find_components(__package__,
coalescer_directory,
BaseCoalescer)
_addons = ThirdPartyComponents(BaseCoalescer)


def add_coalescer(coalescer: BaseCoalescer) -> None:
_addons.add_component(coalescer)
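
A hedged sketch of registering a third-party coalescer through this hook. MyCoalescer and its module are hypothetical; ThirdPartyComponents.add_component may also enforce interface checks:

from my_package.my_coalescer import MyCoalescer  # hypothetical BaseCoalescer subclass

add_coalescer(MyCoalescer)
# once registered, the component appears in CoalescerChoice.get_components()
# alongside the built-in NoCoalescer and MinorityCoalescer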


class CoalescerChoice(autoPyTorchChoice):
"""
Allows for dynamically choosing coalescer component at runtime
"""

def get_components(self) -> Dict[str, autoPyTorchComponent]:
"""Returns the available coalescer components

Args:
None

Returns:
Dict[str, autoPyTorchComponent]: all BaseCoalescer components available
as choices for coalescing the categorical columns
"""
components = OrderedDict()
components.update(_coalescer)
components.update(_addons.components)
return components

def get_hyperparameter_search_space(self,
dataset_properties: Optional[Dict[str, Any]] = None,
default: Optional[str] = None,
include: Optional[List[str]] = None,
exclude: Optional[List[str]] = None) -> ConfigurationSpace:
cs = ConfigurationSpace()

if dataset_properties is None:
dataset_properties = dict()

dataset_properties = {**self.dataset_properties, **dataset_properties}

available_preprocessors = self.get_available_components(dataset_properties=dataset_properties,
include=include,
exclude=exclude)

if len(available_preprocessors) == 0:
raise ValueError("No coalescer found, please add a coalescer via the include "
"argument of the pipeline. Additionally, coalescer as a step "
"can be removed as a pipeline step. ")

if default is None:
defaults = ['NoCoalescer', 'MinorityCoalescer']
for default_ in defaults:
if default_ in available_preprocessors:
if include is not None and default_ not in include:
continue
if exclude is not None and default_ in exclude:
continue
default = default_
break

updates = self._get_search_space_updates()
if '__choice__' in updates.keys():
choice_hyperparameter = updates['__choice__']
if not set(choice_hyperparameter.value_range).issubset(available_preprocessors):
raise ValueError("The update for {} was expected to be a subset of {} "
"but was {}".format(self.__class__.__name__,
available_preprocessors,
choice_hyperparameter.value_range))
if len(dataset_properties['categorical_columns']) == 0:
    assert len(choice_hyperparameter.value_range) == 1
    assert 'NoCoalescer' in choice_hyperparameter.value_range, \
        "Provided {} in choices, however, a dataset without categorical " \
        "columns is only compatible with NoCoalescer".format(choice_hyperparameter.value_range)

preprocessor = CSH.CategoricalHyperparameter('__choice__',
choice_hyperparameter.value_range,
default_value=choice_hyperparameter.default_value)
else:
# add only NoCoalescer to the choice hyperparameters when the dataset has no categorical columns
if len(dataset_properties['categorical_columns']) == 0:
default = 'NoCoalescer'
if include is not None and default not in include:
raise ValueError("Provided coalescer {} are incompatible with "
"the dataset without categorical columns.".format(include))
preprocessor = CSH.CategoricalHyperparameter('__choice__',
['NoCoalescer'],
default_value=default)
else:
preprocessor = CSH.CategoricalHyperparameter('__choice__',
list(available_preprocessors.keys()),
default_value=default)

cs.add_hyperparameter(preprocessor)

# add only child hyperparameters of the chosen coalescer components
for name in preprocessor.choices:
updates = self._get_search_space_updates(prefix=name)
# Call arg is ignored on mypy as the search space dynamically
# provides different args
preprocessor_configuration_space = available_preprocessors[ # type:ignore[call-arg]
name # type:ignore[call-arg]
].get_hyperparameter_search_space(dataset_properties, **updates) # type:ignore[call-arg]
parent_hyperparameter = {'parent': preprocessor, 'value': name}
cs.add_configuration_space(name, preprocessor_configuration_space,
parent_hyperparameter=parent_hyperparameter)

self.configuration_space = cs
self.dataset_properties = dataset_properties
return cs

def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None:
"""
A mechanism in code to ensure the correctness of the fit dictionary
It recursively makes sure that the children and parent level requirements
are honored before fit.
Args:
dataset_properties:

"""
super()._check_dataset_properties(dataset_properties)
assert 'numerical_columns' in dataset_properties.keys(), \
"Dataset properties must contain information about numerical columns"
assert 'categorical_columns' in dataset_properties.keys(), \
"Dataset properties must contain information about categorical columns"
New file: autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py
@@ -0,0 +1,36 @@
from typing import Any, Dict, List

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import (
autoPyTorchTabularPreprocessingComponent
)
from autoPyTorch.utils.common import FitRequirement


class BaseCoalescer(autoPyTorchTabularPreprocessingComponent):
"""
Base class for coalescing
"""
def __init__(self) -> None:
super().__init__()
self.add_fit_requirements([
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
FitRequirement('categories', (List,), user_defined=True, dataset_property=True)])

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
"""
The input X is the fit dictionary, which contains both the train data
and fit directives. For example, it indicates whether to use the GPU
or to perform a CPU-only run.

This method adds self into the 'X' dictionary and returns it.

Args:
    X (Dict[str, Any]): fit dictionary

Returns:
    (Dict[str, Any]): the updated fit dictionary
"""
if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
raise ValueError("Cannot call transform() on {} without calling fit() first."
.format(self.__class__.__name__))
X.update({'coalescer': self.preprocessor})
return X
4 changes: 4 additions & 0 deletions autoPyTorch/pipeline/tabular_classification.py
@@ -20,6 +20,9 @@
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import (
TabularColumnTransformer
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
CoalescerChoice
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
EncoderChoice
)
@@ -277,6 +280,7 @@ def _get_pipeline_steps(

steps.extend([
("imputer", SimpleImputer(random_state=self.random_state)),
("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
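A hedged sketch of the effect on the assembled pipeline; the coalescer step should now sit between the imputer and the encoder (the dataset_properties shown are assumptions and a real pipeline may require more keys):

from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline

pipeline = TabularClassificationPipeline(
    dataset_properties={'numerical_columns': [0],
                        'categorical_columns': [1],
                        'issparse': False})
print([name for name, _ in pipeline.steps])
# expected to include: [..., 'imputer', 'coalescer', 'encoder', 'scaler', ...]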
4 changes: 4 additions & 0 deletions autoPyTorch/pipeline/tabular_regression.py
@@ -19,6 +19,9 @@
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import (
TabularColumnTransformer
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
CoalescerChoice
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
EncoderChoice
)
@@ -219,6 +222,7 @@ def _get_pipeline_steps(

steps.extend([
("imputer", SimpleImputer(random_state=self.random_state)),
("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,