[ADD] Minority Coalescer #242

Closed
wants to merge 10 commits into from
42 changes: 21 additions & 21 deletions autoPyTorch/api/base_task.py
@@ -1072,11 +1072,17 @@ def refit(
self
"""

self.dataset_name = dataset.dataset_name

if self._logger is None:
self._logger = self._get_logger(str(self.dataset_name))

if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
self._load_models()

# Refit is not applicable when ensemble_size is set to zero.
if self.ensemble_ is None:
raise ValueError("Refit can only be called if 'ensemble_size != 0'")

self.dataset_name = dataset.dataset_name
dataset_requirements = get_dataset_requirements(
info=dataset.get_required_dataset_info(),
include=self.include_components,
@@ -1085,26 +1091,20 @@ def refit(
dataset_properties = dataset.get_dataset_properties(dataset_requirements)
self._backend.save_datamanager(dataset)

X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
'backend': self._backend,
'X_train': dataset.train_tensors[0],
'y_train': dataset.train_tensors[1],
'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
'train_indices': dataset.splits[split_id][0],
'val_indices': dataset.splits[split_id][1],
'split_id': split_id,
'num_run': self._backend.get_next_num_run(),
})
X.update({**self.pipeline_options, **budget_config})
if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
self._load_models()

# Refit is not applicable when ensemble_size is set to zero.
if self.ensemble_ is None:
raise ValueError("Refit can only be called if 'ensemble_size != 0'")

for identifier in self.models_:
X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
'backend': self._backend,
'X_train': dataset.train_tensors[0].copy(),
'y_train': dataset.train_tensors[1].copy(),
'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
'train_indices': dataset.splits[split_id][0],
'val_indices': dataset.splits[split_id][1],
'split_id': split_id,
'num_run': self._backend.get_next_num_run(),
})
X.update({**self.pipeline_options, **budget_config})

model = self.models_[identifier]
# this updates the model inplace, it can then later be used in
# predict method
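Note on the refit change above: the fit dictionary X is now built inside the per-model loop, the sanity checks run before any work is done, and the train tensors are handed to each pipeline as copies. A minimal sketch of the aliasing hazard the copies avoid (illustrative only, not autoPyTorch code):

import numpy as np

def fit_inplace(data: np.ndarray) -> None:
    # stand-in for a pipeline step that normalizes its input in place
    data -= data.mean(axis=0)

X_train = np.array([[1.0, 2.0], [3.0, 4.0]])

fit_inplace(X_train)            # the first model mutates the shared buffer
print(X_train)                  # the original data is gone for the next model

fit_inplace(X_train.copy())     # a per-model copy keeps each refit isolated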
3 changes: 2 additions & 1 deletion autoPyTorch/configs/greedy_portfolio.json
@@ -1,5 +1,6 @@
[{"data_loader:batch_size": 60,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
@@ -506,4 +507,4 @@
"network_backbone:ShapedResNetBackbone:max_shake_drop_probability": 0.034431265307095615,
"network_head:fully_connected:activation": "relu",
"network_head:fully_connected:units_layer_1": 128,
"network_backbone:ShapedResNetBackbone:max_dropout": 0.6296079567189131}]
"network_backbone:ShapedResNetBackbone:max_dropout": 0.6296079567189131}]
New file: autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py
@@ -0,0 +1,52 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
UniformFloatHyperparameter,
)

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
from autoPyTorch.utils.implementations import MinorityCoalescing


class MinorityCoalescer(BaseCoalescer):
"""
Groups together categories of a categorical feature whose frequency
of occurrence is less than minimum_fraction
"""
def __init__(self, minimum_fraction: float, random_state: np.random.RandomState):
super().__init__()
self.minimum_fraction = minimum_fraction
Collaborator:
minimum_fraction -> min_fraction (convention)

Contributor (author):
I don't think we have a convention for this, and in this case having the complete word is clearer. If possible I would like to preserve it.

Collaborator:
PyTorch uses min rather than minimum; when it does use minimum, it is only for the element-wise minimum. But if you would like to stick with it, that is also fine.

self.random_state = random_state

def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:

self.check_requirements(X, y)

self.preprocessor['categorical'] = MinorityCoalescing(minimum_fraction=self.minimum_fraction)
return self

@staticmethod
def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
return {
'shortname': 'MinorityCoalescer',
'name': 'Minority Feature-class coalescer',
'handles_sparse': False
}

@staticmethod
def get_hyperparameter_search_space(
dataset_properties: Optional[Dict] = None,
minimum_fraction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="minimum_fraction",
value_range=(0.0001, 0.5),
default_value=0.01,
log=True),
) -> ConfigurationSpace:
cs = ConfigurationSpace()

add_hyperparameter(cs, minimum_fraction, UniformFloatHyperparameter)

return cs
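
The actual grouping is delegated to MinorityCoalescing from autoPyTorch.utils.implementations, which this diff does not show. A minimal numpy sketch of the underlying idea, assuming rare categories collapse into one reserved bucket:

import numpy as np

def coalesce_minorities(column: np.ndarray, minimum_fraction: float) -> np.ndarray:
    # map categories rarer than minimum_fraction to a reserved 'other' label (-1)
    values, counts = np.unique(column, return_counts=True)
    rare = values[counts / column.size < minimum_fraction]
    return np.where(np.isin(column, rare), -1, column)

col = np.array([0, 0, 0, 0, 0, 1, 1, 1, 2, 3])
print(coalesce_minorities(col, minimum_fraction=0.2))
# categories 2 and 3 (frequency 0.1 each) collapse into -1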
New file: autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py
@@ -0,0 +1,53 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


class NoCoalescer(BaseCoalescer):
"""
Don't perform NoCoalescer on categorical features
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a NoCoalescer class. This allows the BO model to enable/disable coalescing.

The choice object selects between MinorityCoalescer and NoCoalescer depending on what gives better performance.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean i did not get if you mean Do not perform NO coalescer or Do not perform coalescer.

"""
def __init__(self,
random_state: np.random.RandomState,
):
super().__init__()
self.random_state = random_state

def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer:
"""
As no coalescing happens, the input fit dictionary is unchanged.

Args:
X (Dict[str, Any]):
input fit dictionary
y (Optional[Any]):
Parameter to comply with scikit-learn API. Not used.

Returns:
instance of self
"""
self.check_requirements(X, y)

return self

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
"""
Add self into the 'X' dictionary and return the modified dict.
Args:
X (Dict[str, Any]): 'X' dictionary
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is X dictionary?
fit_dictionary? (Ravin says the fit_dictionary will deprecate soon, but do you have any idea when?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Scikit-learn supports passing a dictionary alongside the data. See here

It makes a lot of sense to use it instead of X as a fit_dictionary.

From all of the refactoring changes, this is to me the most important.

When** depends on when there is a contributor that wants to do this change :)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I did not get you, so please add your ideas to the doc-string as well?
Especially, I do not get why the meaning behind 'X' in the sentence.
But still it is a bit confusing for me.
Do you know why sklearn uses X for both fit_dictionary and feature_data?


Returns:
(Dict[str, Any]): the updated 'X' dictionary
"""
X.update({'coalescer': self.preprocessor})
return X

@staticmethod
def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
return {
'shortname': 'NoCoalescer',
'name': 'No Coalescer',
'handles_sparse': True
}
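
A hedged usage sketch of the fit/transform contract discussed above. The dataset_properties keys mirror the FitRequirements declared in BaseCoalescer; the import path and the exact set of required keys are assumptions:

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.NoCoalescer import NoCoalescer

X = {
    'dataset_properties': {
        'categorical_columns': [0],        # required FitRequirement
        'categories': [['a', 'b', 'c']],   # required FitRequirement
    },
}
component = NoCoalescer(random_state=np.random.RandomState(1))
X = component.fit(X).transform(X)
print('coalescer' in X)  # True: transform() registered the preprocessor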
New file: autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -0,0 +1,143 @@
import os
from collections import OrderedDict
from typing import Any, Dict, List, Optional

import ConfigSpace.hyperparameters as CSH
from ConfigSpace.configuration_space import ConfigurationSpace

from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
from autoPyTorch.pipeline.components.base_component import (
ThirdPartyComponents,
autoPyTorchComponent,
find_components,
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


coalescer_directory = os.path.split(__file__)[0]
_coalescer = find_components(__package__,
coalescer_directory,
BaseCoalescer)
_addons = ThirdPartyComponents(BaseCoalescer)


def add_coalescer(coalescer: BaseCoalescer) -> None:
_addons.add_component(coalescer)
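
A hedged sketch of registering a third-party coalescer through this hook. MyCoalescer and its module are hypothetical; ThirdPartyComponents.add_component may also enforce interface checks:

from my_package.my_coalescer import MyCoalescer  # hypothetical BaseCoalescer subclass

add_coalescer(MyCoalescer)
# once registered, the component appears in CoalescerChoice.get_components()
# alongside the built-in NoCoalescer and MinorityCoalescer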


class CoalescerChoice(autoPyTorchChoice):
"""
Allows for dynamically choosing coalescer component at runtime
"""

def get_components(self) -> Dict[str, autoPyTorchComponent]:
"""Returns the available coalescer components

Args:
None

Returns:
Dict[str, autoPyTorchComponent]: all BaseCoalescer components available
as choices for coalescing the categorical columns
"""
components = OrderedDict()
components.update(_coalescer)
components.update(_addons.components)
return components

def get_hyperparameter_search_space(self,
dataset_properties: Optional[Dict[str, Any]] = None,
default: Optional[str] = None,
include: Optional[List[str]] = None,
exclude: Optional[List[str]] = None) -> ConfigurationSpace:
cs = ConfigurationSpace()

if dataset_properties is None:
dataset_properties = dict()

dataset_properties = {**self.dataset_properties, **dataset_properties}

available_preprocessors = self.get_available_components(dataset_properties=dataset_properties,
include=include,
exclude=exclude)

if len(available_preprocessors) == 0:
raise ValueError("No coalescer found, please add a coalescer via the include "
"argument of the pipeline. Additionally, coalescer as a step "
"can be removed as a pipeline step. ")

if default is None:
defaults = ['NoCoalescer', 'MinorityCoalescer']
for default_ in defaults:
if default_ in available_preprocessors:
if include is not None and default_ not in include:
continue
if exclude is not None and default_ in exclude:
continue
default = default_
break

updates = self._get_search_space_updates()
if '__choice__' in updates.keys():
choice_hyperparameter = updates['__choice__']
if not set(choice_hyperparameter.value_range).issubset(available_preprocessors):
raise ValueError("The update for {} was expected to be a subset of {} "
"but was {}".format(self.__class__.__name__,
available_preprocessors,
choice_hyperparameter.value_range))
if len(dataset_properties['categorical_columns']) == 0:
    assert len(choice_hyperparameter.value_range) == 1
    assert 'NoCoalescer' in choice_hyperparameter.value_range, \
        "Provided {} in choices, however, a dataset without categorical " \
        "columns is only compatible with NoCoalescer".format(choice_hyperparameter.value_range)

preprocessor = CSH.CategoricalHyperparameter('__choice__',
choice_hyperparameter.value_range,
default_value=choice_hyperparameter.default_value)
else:
# add only NoCoalescer to the choice hyperparameters when the dataset has no categorical columns
if len(dataset_properties['categorical_columns']) == 0:
default = 'NoCoalescer'
if include is not None and default not in include:
raise ValueError("Provided coalescer {} are incompatible with "
"the dataset without categorical columns.".format(include))
preprocessor = CSH.CategoricalHyperparameter('__choice__',
['NoCoalescer'],
default_value=default)
else:
preprocessor = CSH.CategoricalHyperparameter('__choice__',
list(available_preprocessors.keys()),
default_value=default)

cs.add_hyperparameter(preprocessor)

# add only child hyperparameters of the chosen coalescer components
for name in preprocessor.choices:
updates = self._get_search_space_updates(prefix=name)
# Call arg is ignored on mypy as the search space dynamically
# provides different args
preprocessor_configuration_space = available_preprocessors[ # type:ignore[call-arg]
name # type:ignore[call-arg]
].get_hyperparameter_search_space(dataset_properties, **updates) # type:ignore[call-arg]
parent_hyperparameter = {'parent': preprocessor, 'value': name}
cs.add_configuration_space(name, preprocessor_configuration_space,
parent_hyperparameter=parent_hyperparameter)

self.configuration_space = cs
self.dataset_properties = dataset_properties
return cs

def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None:
"""
A mechanism in code to ensure the correctness of the fit dictionary
It recursively makes sure that the children and parent level requirements
are honored before fit.
Args:
dataset_properties:

"""
super()._check_dataset_properties(dataset_properties)
assert 'numerical_columns' in dataset_properties.keys(), \
"Dataset properties must contain information about numerical columns"
assert 'categorical_columns' in dataset_properties.keys(), \
"Dataset properties must contain information about categorical columns"
New file: autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py
@@ -0,0 +1,36 @@
from typing import Any, Dict, List

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import (
autoPyTorchTabularPreprocessingComponent
)
from autoPyTorch.utils.common import FitRequirement


class BaseCoalescer(autoPyTorchTabularPreprocessingComponent):
"""
Base class for coalescing
"""
def __init__(self) -> None:
super().__init__()
self.add_fit_requirements([
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
FitRequirement('categories', (List,), user_defined=True, dataset_property=True)])

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
"""
The input X is the fit dictionary, which contains both the train data
and fit directives. For example, it indicates whether to use the GPU
or to perform a CPU-only run.

This method adds self into the 'X' dictionary and returns it.

Args:
    X (Dict[str, Any]): fit dictionary

Returns:
    (Dict[str, Any]): the updated fit dictionary
"""
if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
raise ValueError("Cannot call transform() on {} without calling fit() first."
.format(self.__class__.__name__))
X.update({'coalescer': self.preprocessor})
return X
4 changes: 4 additions & 0 deletions autoPyTorch/pipeline/tabular_classification.py
@@ -20,6 +20,9 @@
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import (
TabularColumnTransformer
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
CoalescerChoice
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
EncoderChoice
)
@@ -277,6 +280,7 @@ def _get_pipeline_steps(

steps.extend([
("imputer", SimpleImputer(random_state=self.random_state)),
("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
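A hedged sketch of the effect on the assembled pipeline; the coalescer step should now sit between the imputer and the encoder (the dataset_properties shown are assumptions and a real pipeline may require more keys):

from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline

pipeline = TabularClassificationPipeline(
    dataset_properties={'numerical_columns': [0],
                        'categorical_columns': [1],
                        'issparse': False})
print([name for name, _ in pipeline.steps])
# expected to include: [..., 'imputer', 'coalescer', 'encoder', 'scaler', ...]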
4 changes: 4 additions & 0 deletions autoPyTorch/pipeline/tabular_regression.py
@@ -19,6 +19,9 @@
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import (
TabularColumnTransformer
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
CoalescerChoice
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
EncoderChoice
)
@@ -219,6 +222,7 @@ def _get_pipeline_steps(

steps.extend([
("imputer", SimpleImputer(random_state=self.random_state)),
("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,