[refactor] Getting dataset properties from the dataset object #164

Merged
28 changes: 16 additions & 12 deletions autoPyTorch/api/base_task.py
@@ -196,14 +196,6 @@ def __init__(
raise ValueError("Expected search space updates to be of instance"
" HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates)))

@abstractmethod
def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]:
"""
given a pipeline type, this function returns the
dataset properties required by the dataset object
"""
raise NotImplementedError

@abstractmethod
def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline:
"""
@@ -267,7 +259,10 @@ def get_search_space(self, dataset: BaseDataset = None) -> ConfigurationSpace:
return self.search_space
elif dataset is not None:
dataset_requirements = get_dataset_requirements(
info=self._get_required_dataset_properties(dataset))
info=dataset.get_required_dataset_info(),
include=self.include_components,
exclude=self.exclude_components,
search_space_updates=self.search_space_updates)
return get_configuration_space(info=dataset.get_dataset_properties(dataset_requirements),
include=self.include_components,
exclude=self.exclude_components,
@@ -785,7 +780,10 @@ def _search(
# Initialise information needed for the experiment
experiment_task_name: str = 'runSearch'
dataset_requirements = get_dataset_requirements(
info=self._get_required_dataset_properties(dataset))
info=dataset.get_required_dataset_info(),
include=self.include_components,
exclude=self.exclude_components,
search_space_updates=self.search_space_updates)
self._dataset_requirements = dataset_requirements
dataset_properties = dataset.get_dataset_properties(dataset_requirements)
self._stopwatch.start_task(experiment_task_name)
@@ -1049,7 +1047,10 @@ def refit(
self._logger = self._get_logger(str(self.dataset_name))

dataset_requirements = get_dataset_requirements(
info=self._get_required_dataset_properties(dataset))
info=dataset.get_required_dataset_info(),
include=self.include_components,
exclude=self.exclude_components,
search_space_updates=self.search_space_updates)
dataset_properties = dataset.get_dataset_properties(dataset_requirements)
self._backend.save_datamanager(dataset)

@@ -1119,7 +1120,10 @@ def fit(self,

# get dataset properties
dataset_requirements = get_dataset_requirements(
info=self._get_required_dataset_properties(dataset))
info=dataset.get_required_dataset_info(),
include=self.include_components,
exclude=self.exclude_components,
search_space_updates=self.search_space_updates)
dataset_properties = dataset.get_dataset_properties(dataset_requirements)
self._backend.save_datamanager(dataset)

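The change in base_task.py is the same at every call site (get_search_space, _search, refit, and fit): instead of asking the API class for the required properties through the removed `_get_required_dataset_properties`, the dataset reports them itself, and `get_dataset_requirements` now also receives the include/exclude components and the search-space updates. A minimal sketch of the resulting call chain, assuming `dataset` is any `BaseDataset` subclass; the helper name `build_dataset_properties` is purely illustrative:

```python
from autoPyTorch.utils.pipeline import get_dataset_requirements


def build_dataset_properties(dataset, include=None, exclude=None,
                             search_space_updates=None):
    # the dataset itself reports the task-specific info it can provide
    info = dataset.get_required_dataset_info()
    # get_dataset_requirements turns that info (plus component include/exclude
    # lists and search-space updates) into a list of FitRequirement objects
    requirements = get_dataset_requirements(
        info=info,
        include=include,
        exclude=exclude,
        search_space_updates=search_space_updates)
    # the dataset then fills in a concrete value for every requirement
    return dataset.get_dataset_properties(requirements)
```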
12 changes: 0 additions & 12 deletions autoPyTorch/api/tabular_classification.py
@@ -13,7 +13,6 @@
TASK_TYPES_TO_STRING,
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
@@ -97,17 +96,6 @@ def __init__(
task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
)

def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]:
if not isinstance(dataset, TabularDataset):
raise ValueError("Dataset is incompatible for the given task,: {}".format(
type(dataset)
))
return {'task_type': dataset.task_type,
'output_type': dataset.output_type,
'issparse': dataset.issparse,
'numerical_columns': dataset.numerical_columns,
'categorical_columns': dataset.categorical_columns}

def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline:
return TabularClassificationPipeline(dataset_properties=dataset_properties)

12 changes: 0 additions & 12 deletions autoPyTorch/api/tabular_regression.py
@@ -13,7 +13,6 @@
TASK_TYPES_TO_STRING
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
@@ -89,17 +88,6 @@ def __init__(
task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION],
)

def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]:
if not isinstance(dataset, TabularDataset):
raise ValueError("Dataset is incompatible for the given task,: {}".format(
type(dataset)
))
return {'task_type': dataset.task_type,
'output_type': dataset.output_type,
'issparse': dataset.issparse,
'numerical_columns': dataset.numerical_columns,
'categorical_columns': dataset.categorical_columns}

def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline:
return TabularRegressionPipeline(dataset_properties=dataset_properties)

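Both tabular API classes previously carried identical `_get_required_dataset_properties` overrides (removed above). After the refactor the same five keys come from the dataset itself, so all that remains is something like the following stand-in; the function name is hypothetical and not part of the API:

```python
from autoPyTorch.datasets.tabular_dataset import TabularDataset


def required_info(dataset: TabularDataset) -> dict:
    # Stand-in for the removed per-task override: the hand-built dict with
    # 'task_type', 'output_type', 'issparse', 'numerical_columns' and
    # 'categorical_columns' is now assembled by the dataset itself.
    return dataset.get_required_dataset_info()
```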
24 changes: 13 additions & 11 deletions autoPyTorch/datasets/base_dataset.py
@@ -348,11 +348,17 @@ def replace_data(self, X_train: BaseDatasetInputType,

def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]:
"""
Gets the dataset properties required in the fit dictionary
Gets the dataset properties required in the fit dictionary.
The required properties depend on the components that are active
in the pipeline; the returned dictionary contains the properties
those components need about the dataset. Information on the
properties each component requires can be found in its documentation.
Args:
dataset_requirements (List[FitRequirement]): List of
fit requirements that the dataset properties must
contain.
contain. This is created using the `get_dataset_requirements`
function in `autoPyTorch/utils/pipeline.py`
(<https://github.com/automl/Auto-PyTorch/blob/refactor_development/autoPyTorch/utils/pipeline.py#L25>).

Returns:
dataset_properties (Dict[str, Any]):
@@ -362,19 +368,15 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) ->
for dataset_requirement in dataset_requirements:
dataset_properties[dataset_requirement.name] = getattr(self, dataset_requirement.name)

# Add task type, output type and issparse to dataset properties as
# they are not a dataset requirement in the pipeline
dataset_properties.update({'task_type': self.task_type,
'output_type': self.output_type,
'issparse': self.issparse,
'input_shape': self.input_shape,
'output_shape': self.output_shape
})
# Add the required dataset info to the dataset properties, as
# these keys might not be listed as dataset requirements by the pipeline
dataset_properties.update(self.get_required_dataset_info())
return dataset_properties

def get_required_dataset_info(self) -> Dict[str, Any]:
"""
Returns a dictionary containing required dataset properties to instantiate a pipeline,
Returns a dictionary containing required dataset
properties to instantiate a pipeline.
"""
info = {'output_type': self.output_type,
'issparse': self.issparse}
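The updated `get_dataset_properties` reads one value per requirement straight off the dataset and then merges in whatever `get_required_dataset_info()` reports, so task-specific keys are no longer hard-coded in the base class. A condensed, self-contained sketch of that logic; the `FitRequirement` import path is assumed:

```python
from typing import Any, Dict, List

from autoPyTorch.utils.common import FitRequirement  # assumed location


def dataset_properties_sketch(dataset, requirements: List[FitRequirement]) -> Dict[str, Any]:
    # one entry per requirement, read directly from the dataset object
    properties = {req.name: getattr(dataset, req.name) for req in requirements}
    # merge in the task-specific info (e.g. output_type, issparse, task_type)
    # even when no active pipeline component listed it as a requirement
    properties.update(dataset.get_required_dataset_info())
    return properties
```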
19 changes: 18 additions & 1 deletion autoPyTorch/datasets/tabular_dataset.py
@@ -112,7 +112,24 @@ def __init__(self,

def get_required_dataset_info(self) -> Dict[str, Any]:
"""
Returns a dictionary containing required dataset properties to instantiate a pipeline,
Returns a dictionary containing required dataset
properties to instantiate a pipeline.
For a TabularDataset, this includes:
1. 'output_type': Enum indicating the type of the output for this problem.
We currently use sklearn's `type_of_target
<https://scikit-learn.org/stable/modules/generated/sklearn.utils.multiclass.type_of_target.html>`
to infer the output type from the data, and encode it as an
Enum; more information can be found in `autoPyTorch/constants.py
<https://github.com/automl/Auto-PyTorch/blob/refactor_development/autoPyTorch/constants.py>`.
2. 'issparse': A flag indicating whether the input is a sparse matrix.
3. 'numerical_columns': A list containing the column numbers
of the numerical columns in the input dataset.
4. 'categorical_columns': A list containing the column numbers
of the categorical columns in the input dataset.
5. 'task_type': Enum indicating the type of task. For tabular datasets,
we currently support 'tabular_classification' and 'tabular_regression',
encoded as an Enum; more information can be found in `autoPyTorch/constants.py
<https://github.com/automl/Auto-PyTorch/blob/refactor_development/autoPyTorch/constants.py>`.
"""
info = super().get_required_dataset_info()
info.update({
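For orientation, the dictionary described in the docstring above might look roughly like this for a small, dense tabular classification dataset; every value here is hypothetical, and the exact encodings are defined in `autoPyTorch/constants.py`:

```python
example_info = {
    'task_type': 'tabular_classification',  # one of the supported tabular tasks
    'output_type': 'multiclass',            # hypothetical sklearn type_of_target result
    'issparse': False,                      # dense input
    'numerical_columns': [0, 2, 3],         # hypothetical column indices
    'categorical_columns': [1, 4],
}
```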
34 changes: 17 additions & 17 deletions autoPyTorch/evaluation/abstract_evaluator.py
@@ -31,7 +31,6 @@
TABULAR_TASKS,
)
from autoPyTorch.datasets.base_dataset import BaseDataset
from autoPyTorch.datasets.tabular_dataset import TabularDataset
from autoPyTorch.evaluation.utils import (
VotingRegressorWrapper,
convert_multioutput_multiclass_to_multilabel
@@ -71,6 +70,7 @@ class MyTraditionalTabularClassificationPipeline(BaseEstimator):
An optional dictionary that is passed to the pipeline's steps. It complies
a similar function as the kwargs
"""

def __init__(self, config: str,
dataset_properties: Dict[str, Any],
random_state: Optional[Union[int, np.random.RandomState]] = None,
@@ -141,6 +141,7 @@ class DummyClassificationPipeline(DummyClassifier):
An optional dictionary that is passed to the pipeline's steps. It complies
a similar function as the kwargs
"""

def __init__(self, config: Configuration,
random_state: Optional[Union[int, np.random.RandomState]] = None,
init_params: Optional[Dict] = None
@@ -208,6 +209,7 @@ class DummyRegressionPipeline(DummyRegressor):
An optional dictionary that is passed to the pipeline's steps. It complies
a similar function as the kwargs
"""

def __init__(self, config: Configuration,
random_state: Optional[Union[int, np.random.RandomState]] = None,
init_params: Optional[Dict] = None) -> None:
@@ -394,12 +396,9 @@ def __init__(self, backend: Backend,
raise ValueError('disable_file_output should be either a bool or a list')

self.pipeline_class: Optional[Union[BaseEstimator, BasePipeline]] = None
info: Dict[str, Any] = {'task_type': self.datamanager.task_type,
'output_type': self.datamanager.output_type,
'issparse': self.issparse}
if self.task_type in REGRESSION_TASKS:
if isinstance(self.configuration, int):
self.pipeline_class = DummyClassificationPipeline
self.pipeline_class = DummyRegressionPipeline
elif isinstance(self.configuration, str):
raise ValueError("Only tabular classifications tasks "
"are currently supported with traditional methods")
@@ -425,11 +424,12 @@ def __init__(self, backend: Backend,
else:
raise ValueError('task {} not available'.format(self.task_type))
self.predict_function = self._predict_proba
if self.task_type in TABULAR_TASKS:
assert isinstance(self.datamanager, TabularDataset)
info.update({'numerical_columns': self.datamanager.numerical_columns,
'categorical_columns': self.datamanager.categorical_columns})
self.dataset_properties = self.datamanager.get_dataset_properties(get_dataset_requirements(info))
self.dataset_properties = self.datamanager.get_dataset_properties(
get_dataset_requirements(info=self.datamanager.get_required_dataset_info(),
include=self.include,
exclude=self.exclude,
search_space_updates=self.search_space_updates
))

self.additional_metrics: Optional[List[autoPyTorchMetric]] = None
if all_supported_metrics:
@@ -630,9 +630,9 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float],
return None

def calculate_auxiliary_losses(
self,
Y_valid_pred: np.ndarray,
Y_test_pred: np.ndarray,
self,
Y_valid_pred: np.ndarray,
Y_test_pred: np.ndarray,
) -> Tuple[Optional[float], Optional[float]]:
"""
A helper function to calculate the performance estimate of the
@@ -670,10 +670,10 @@ def calculate_auxiliary_losses(
return validation_loss, test_loss

def file_output(
self,
Y_optimization_pred: np.ndarray,
Y_valid_pred: np.ndarray,
Y_test_pred: np.ndarray
self,
Y_optimization_pred: np.ndarray,
Y_valid_pred: np.ndarray,
Y_test_pred: np.ndarray
) -> Tuple[Optional[float], Dict]:
"""
This method decides what file outputs are written to disk.
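Besides switching the evaluator to `self.datamanager.get_required_dataset_info()` (which removes the hand-built info dict and the TabularDataset-specific branch), this file also fixes a bug: dummy runs on regression tasks used to be assigned `DummyClassificationPipeline`. A small sketch of the corrected mapping, assuming the listed imports resolve as in the current module layout:

```python
from autoPyTorch.constants import REGRESSION_TASKS
from autoPyTorch.evaluation.abstract_evaluator import (
    DummyClassificationPipeline,
    DummyRegressionPipeline,
)


def dummy_pipeline_for(task_type: int) -> type:
    # Maps the encoded task type to the matching dummy (baseline) pipeline.
    # Before this change, regression tasks received the classification dummy.
    if task_type in REGRESSION_TASKS:
        return DummyRegressionPipeline
    return DummyClassificationPipeline
```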