[ADD] fit pipeline honoring API constraints with tests #348

Merged: 32 commits, Dec 20, 2021
Commits (32)
cdcf766
Add fit pipeline with tests
ravinkohli Nov 30, 2021
bc5b469
Add documentation for get dataset
ravinkohli Nov 30, 2021
0359c8c
update documentation
ravinkohli Nov 30, 2021
75eb604
fix tests
ravinkohli Nov 30, 2021
136f619
remove permutation importance from visualisation example
ravinkohli Nov 30, 2021
4731363
change disable_file_output
ravinkohli Nov 30, 2021
af48ebf
add
ravinkohli Dec 6, 2021
3df4e06
fix flake
ravinkohli Dec 6, 2021
e8289e4
fix test and examples
ravinkohli Dec 6, 2021
4018d02
change type of disable_file_output
ravinkohli Dec 6, 2021
add8890
Address comments from eddie
ravinkohli Dec 6, 2021
d8739cd
fix docstring in api
ravinkohli Dec 6, 2021
f1ea974
fix tests for base api
ravinkohli Dec 6, 2021
38471f1
fix tests for base api
ravinkohli Dec 6, 2021
02ac9de
fix tests after rebase
ravinkohli Dec 6, 2021
fd32939
reduce dataset size in example
ravinkohli Dec 7, 2021
3958750
remove optional from doc string
ravinkohli Dec 7, 2021
c33381a
Handle unsuccessful fitting of pipeline better
ravinkohli Dec 7, 2021
dff0e5c
fix flake in tests
ravinkohli Dec 7, 2021
eb648e5
change to default configuration for documentation
ravinkohli Dec 7, 2021
974ea1c
add warning for no ensemble created when y_optimization in disable_fi…
ravinkohli Dec 7, 2021
cc19e4c
reduce budget for single configuration
ravinkohli Dec 7, 2021
ab93ee6
address comments from eddie
ravinkohli Dec 7, 2021
c246b20
address comments from shuhei
ravinkohli Dec 9, 2021
a0a4e75
Add autoPyTorchEnum
ravinkohli Dec 9, 2021
a0fef77
fix flake in tests
ravinkohli Dec 10, 2021
8094ff1
address comments from shuhei
ravinkohli Dec 19, 2021
4d90706
Apply suggestions from code review
ravinkohli Dec 19, 2021
c7cc712
fix flake
ravinkohli Dec 19, 2021
14113f9
use **dataset_kwargs
ravinkohli Dec 20, 2021
5b2f75f
fix flake
ravinkohli Dec 20, 2021
24aac05
change to enforce keyword args
ravinkohli Dec 20, 2021
447 changes: 397 additions & 50 deletions autoPyTorch/api/base_task.py

Large diffs are not rendered by default.

176 changes: 128 additions & 48 deletions autoPyTorch/api/tabular_classification.py
@@ -1,6 +1,4 @@
import os
import uuid
from typing import Any, Callable, Dict, List, Optional, Union
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np

@@ -13,11 +11,13 @@
TASK_TYPES_TO_STRING,
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
)
from autoPyTorch.datasets.tabular_dataset import TabularDataset
from autoPyTorch.evaluation.utils import DisableFileOutputParameters
from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates

@@ -54,13 +54,16 @@ class TabularClassificationTask(BaseTask):
delete_tmp_folder_after_terminate (bool):
Determines whether to delete the temporary directory,
when finished
include_components (Optional[Dict]):
If None, all possible components are used.
Otherwise specifies set of components to use.
exclude_components (Optional[Dict]):
If None, all possible components are used.
Otherwise specifies set of components not to use.
Incompatible with include components.
include_components (Optional[Dict[str, Any]]):
Dictionary containing components to include. Key is the node
name and Value is an Iterable of the names of the components
to include. Only these components will be present in the
search space.
exclude_components (Optional[Dict[str, Any]]):
Dictionary containing components to exclude. Key is the node
name and Value is an Iterable of the names of the components
to exclude. All except these components will be present in
the search space.
search_space_updates (Optional[HyperparameterSearchSpaceUpdates]):
search space updates that can be used to modify the search
space of particular components or choice modules of the pipeline
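A minimal sketch of the dictionary shape these two arguments expect; the node and component names below are illustrative assumptions, not values taken from this pull request:

    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    # Hypothetical node/component names, used only to show the expected shape.
    api = TabularClassificationTask(
        # Keep only the listed components for this pipeline node.
        include_components={'network_backbone': ['MLPBackbone']},
    )
    # exclude_components takes the same {node_name: iterable_of_component_names}
    # shape and removes the listed components from the search space instead.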
@@ -78,8 +81,8 @@ def __init__(
output_directory: Optional[str] = None,
delete_tmp_folder_after_terminate: bool = True,
delete_output_folder_after_terminate: bool = True,
include_components: Optional[Dict] = None,
exclude_components: Optional[Dict] = None,
include_components: Optional[Dict[str, Any]] = None,
exclude_components: Optional[Dict[str, Any]] = None,
resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
backend: Optional[Backend] = None,
@@ -106,18 +109,109 @@ def __init__(
task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
)

def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline:
def build_pipeline(
self,
dataset_properties: Dict[str, BaseDatasetPropertiesType],
include_components: Optional[Dict[str, Any]] = None,
exclude_components: Optional[Dict[str, Any]] = None,
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
) -> TabularClassificationPipeline:
"""
Build pipeline according to current task and for the passed dataset properties
Build pipeline according to current task
and for the passed dataset properties

Args:
dataset_properties (Dict[str,Any])
dataset_properties (Dict[str, Any]):
Characteristics of the dataset to guide the pipeline
choices of components
include_components (Optional[Dict[str, Any]]):
Dictionary containing components to include. Key is the node
name and Value is an Iterable of the names of the components
to include. Only these components will be present in the
search space.
exclude_components (Optional[Dict[str, Any]]):
Dictionary containing components to exclude. Key is the node
name and Value is an Iterable of the names of the components
to exclude. All except these components will be present in
the search space.
search_space_updates (Optional[HyperparameterSearchSpaceUpdates]):
Search space updates that can be used to modify the search
space of particular components or choice modules of the pipeline

Returns:
TabularClassificationPipeline:
Pipeline compatible with the given dataset properties.
TabularClassificationPipeline

"""
return TabularClassificationPipeline(dataset_properties=dataset_properties,
include=include_components,
exclude=exclude_components,
search_space_updates=search_space_updates)

def _get_dataset_input_validator(
self,
X_train: Union[List, pd.DataFrame, np.ndarray],
y_train: Union[List, pd.DataFrame, np.ndarray],
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
) -> Tuple[TabularDataset, TabularInputValidator]:
"""
return TabularClassificationPipeline(dataset_properties=dataset_properties)
Returns an object of `TabularDataset` and an object of
`TabularInputValidator` according to the current task.

Args:
X_train (Union[List, pd.DataFrame, np.ndarray]):
Training feature set.
y_train (Union[List, pd.DataFrame, np.ndarray]):
Training target set.
X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]):
Testing feature set
y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]):
Testing target set
resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]):
Strategy to split the training data. if None, uses
HoldoutValTypes.holdout_validation.
resampling_strategy_args (Optional[Dict[str, Any]]):
arguments required for the chosen resampling strategy. If None, uses
the default values provided in DEFAULT_RESAMPLING_PARAMETERS
in ```datasets/resampling_strategy.py```.
dataset_name (Optional[str]):
name of the dataset, used as experiment name.
Returns:
TabularDataset:
the dataset object.
TabularInputValidator:
the input validator fitted on the data.
"""

resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
InputValidator = TabularInputValidator(
is_classification=True,
logger_port=self._logger_port,
)

# Fit a input validator to check the provided data
# Also, an encoder is fit to both train and test data,
# to prevent unseen categories during inference
InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

dataset = TabularDataset(
X=X_train, Y=y_train,
X_test=X_test, Y_test=y_test,
validator=InputValidator,
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
dataset_name=dataset_name
)

return dataset, InputValidator
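A hedged sketch of the refactoring above: `_get_dataset_input_validator` is a private helper that `search()` (and the fit-pipeline path this PR adds) call internally, so the direct call below is only meant to show what it returns; the arrays are illustrative.

    import numpy as np
    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    X = np.random.random((100, 4))
    y = np.random.randint(0, 2, 100)

    api = TabularClassificationTask()
    # Fits a TabularInputValidator on the data and wraps it in a TabularDataset,
    # falling back to the task's resampling strategy when none is passed.
    dataset, validator = api._get_dataset_input_validator(X_train=X, y_train=y)

    # build_pipeline() now also accepts include_components, exclude_components and
    # search_space_updates explicitly instead of reading them from the task object.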

def search(
self,
@@ -138,7 +232,7 @@ def search(
get_smac_object_callback: Optional[Callable] = None,
all_supported_metrics: bool = True,
precision: int = 32,
disable_file_output: List = [],
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
load_models: bool = True,
portfolio_selection: Optional[str] = None,
) -> 'BaseTask':
@@ -237,10 +331,10 @@ def search(
precision (int: default=32):
Numeric precision used when loading ensemble data.
Can be either '16', '32' or '64'.
disable_file_output (Union[bool, List]):
If True, disable model and prediction output.
Can also be used as a list to pass more fine-grained
information on what to save. Allowed elements in the list are:
disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]):
Used as a list to pass more fine-grained
information on what to save. Must be a member of `DisableFileOutputParameters`.
Allowed elements in the list are:

+ `y_optimization`:
do not save the predictions for the optimization set,
@@ -253,6 +347,9 @@
pipelines fit on each fold.
+ `y_test`:
do not save the predictions for the test set.
+ `all`:
do not save any of the above.
For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`.
load_models (bool: default=True):
Whether to load the models after fitting AutoPyTorch.
portfolio_selection (Optional[str]):
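A short sketch of the new `disable_file_output` argument to `search()`; the data and the time budget are illustrative, and, per the commit history, disabling `y_optimization` triggers a warning because no ensemble can be built without those predictions:

    import numpy as np
    from autoPyTorch.api.tabular_classification import TabularClassificationTask
    from autoPyTorch.evaluation.utils import DisableFileOutputParameters

    X = np.random.random((100, 4))
    y = np.random.randint(0, 2, 100)

    api = TabularClassificationTask()
    api.search(
        X_train=X, y_train=y,
        optimize_metric='accuracy',
        total_walltime_limit=60,
        # Do not persist optimization-set predictions.
        disable_file_output=[DisableFileOutputParameters.y_optimization],
    )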
@@ -269,32 +366,15 @@
self

"""
if dataset_name is None:
dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))

# we have to create a logger for at this point for the validator
self._logger = self._get_logger(dataset_name)

# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
self.InputValidator = TabularInputValidator(
is_classification=True,
logger_port=self._logger_port,
)

# Fit a input validator to check the provided data
# Also, an encoder is fit to both train and test data,
# to prevent unseen categories during inference
self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

self.dataset = TabularDataset(
X=X_train, Y=y_train,
X_test=X_test, Y_test=y_test,
validator=self.InputValidator,
dataset_name=dataset_name,
self.dataset, self.InputValidator = self._get_dataset_input_validator(
X_train=X_train,
y_train=y_train,
X_test=X_test,
y_test=y_test,
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
)
dataset_name=dataset_name)

return self._search(
dataset=self.dataset,
@@ -333,7 +413,7 @@ def predict(
"""
if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator fit() method.")
"the estimator search() method.")

X_test = self.InputValidator.feature_validator.transform(X_test)
predicted_probabilities = super().predict(X_test, batch_size=batch_size,
@@ -353,6 +433,6 @@ def predict_proba(self,
batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator fit() method.")
"the estimator search() method.")
X_test = self.InputValidator.feature_validator.transform(X_test)
return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs)
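An end-to-end sketch of the search/predict flow touched at the bottom of this file, with illustrative data and budget; the error raised before a successful search now points users to `search()` rather than `fit()`:

    import numpy as np
    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    X = np.random.random((200, 4))
    y = np.random.randint(0, 2, 200)

    api = TabularClassificationTask()
    api.search(X_train=X, y_train=y, optimize_metric='accuracy', total_walltime_limit=60)

    # Calling these before search() raises ValueError("predict() is only supported
    # after calling search. ...").
    predictions = api.predict(X)
    probabilities = api.predict_proba(X)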