diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index b4d20165e..531125bff 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -11,7 +11,7 @@ import typing import unittest.mock import warnings -from abc import abstractmethod +from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace @@ -27,7 +27,7 @@ import pandas as pd -from smac.runhistory.runhistory import DataOrigin, RunHistory +from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue from smac.stats.stats import Stats from smac.tae import StatusType @@ -45,6 +45,7 @@ from autoPyTorch.ensemble.singlebest_ensemble import SingleBest from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners @@ -104,7 +105,7 @@ def send_warnings_to_log( return prediction -class BaseTask: +class BaseTask(ABC): """ Base class for the tasks that serve as API to the pipelines. @@ -134,13 +135,16 @@ class BaseTask: delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
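A concrete value makes the include/exclude contract documented above easier to read (the parameter list continues below). A minimal sketch, where the node and component names are illustrative and must match entries in the search space of the concrete task:

```python
# Hypothetical node/component names for illustration; the valid names
# come from the pipeline search space of the task at hand.
include_components = {'network_backbone': ['MLPBackbone', 'ResNetBackbone']}
exclude_components = {'feature_preprocessor': ['Nystroem']}
```

With the first dict, only the listed backbones are searched; with the second, every feature preprocessor except the excluded one stays available.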
search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): Search space updates that can be used to modify the search space of particular components or choice modules of the pipeline @@ -159,8 +163,8 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, @@ -233,19 +237,132 @@ def __init__( " HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates))) @abstractmethod - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline: + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> BasePipeline: """ Build pipeline according to current task and for the passed dataset properties Args: - dataset_properties (Dict[str,Any]) + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline Returns: + BasePipeline + + """ + raise NotImplementedError("Function called on BaseTask, this can only be called by " + "specific task which is a child of the BaseTask") + @abstractmethod + def _get_dataset_input_validator( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + ) -> Tuple[BaseDataset, BaseInputValidator]: + """ + Returns an object of a child class of `BaseDataset` and + an object of a child class of `BaseInputValidator` according + to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. 
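As a quick orientation for the resampling parameters documented in this hunk, a hedged sketch of how strategy and arguments pair up; the keys `val_share` and `num_splits` follow DEFAULT_RESAMPLING_PARAMETERS and also appear in the tests at the end of this diff:

```python
from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes

# Holdout with a 33% validation split; `estimator` is any task object,
# e.g. a TabularClassificationTask.
dataset = estimator.get_dataset(
    X_train, y_train,
    resampling_strategy=HoldoutValTypes.holdout_validation,
    resampling_strategy_args={'val_share': 0.33},
)

# Or 5-fold cross-validation instead.
dataset = estimator.get_dataset(
    X_train, y_train,
    resampling_strategy=CrossValTypes.k_fold_cross_validation,
    resampling_strategy_args={'num_splits': 5},
)
```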
+ resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + + Returns: + BaseDataset: + the dataset object + BaseInputValidator: + fitted input validator """ raise NotImplementedError + def get_dataset( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + ) -> BaseDataset: + """ + Returns an object of a child class of `BaseDataset` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + + Returns: + BaseDataset: + the dataset object + """ + dataset, _ = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name) + + return dataset + @property def run_history(self) -> RunHistory: return self._results_manager.run_history @@ -563,7 +680,7 @@ def _do_dummy_prediction(self) -> None: initial_num_run=num_run, stats=stats, memory_limit=memory_limit, - disable_file_output=True if len(self._disable_file_output) > 0 else False, + disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics ) @@ -647,7 +764,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: initial_num_run=self._backend.get_next_num_run(), stats=stats, memory_limit=memory_limit, - disable_file_output=True if len(self._disable_file_output) > 0 else False, + disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics ) dask_futures.append([ @@ -743,7 +860,7 @@ def _search( tae_func: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None @@ -844,10 +961,10 @@ def _search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. 
- Can also be used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -860,6 +977,9 @@ def _search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): @@ -901,7 +1021,14 @@ def _search( self._backend.setup_logger(port=self._logger_port) self._all_supported_metrics = all_supported_metrics - self._disable_file_output = disable_file_output + self._disable_file_output = disable_file_output if disable_file_output is not None else [] + if ( + DisableFileOutputParameters.y_optimization in self._disable_file_output + and self.ensemble_size > 1 + ): + self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}" + f" is in disable_file_output") + self._memory_limit = memory_limit self._time_for_task = total_walltime_limit # Save start time to backend @@ -1223,10 +1350,30 @@ def refit( return self - def fit(self, - dataset: BaseDataset, - pipeline_config: Optional[Configuration] = None, - split_id: int = 0) -> BasePipeline: + def fit_pipeline( + self, + configuration: Configuration, + *, + dataset: Optional[BaseDataset] = None, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + run_time_limit_secs: int = 60, + memory_limit: Optional[int] = None, + eval_metric: Optional[str] = None, + all_supported_metrics: bool = False, + budget_type: Optional[str] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + budget: Optional[float] = None, + pipeline_options: Optional[Dict] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue, BaseDataset]: """ Fit a pipeline on the given task for the budget. A pipeline configuration can be specified if None, @@ -1237,24 +1384,130 @@ def fit(self, methods. Args: - dataset (Dataset): - The argument that will provide the dataset splits. It can either - be a dictionary with the splits, or the dataset object which can - generate the splits based on different restrictions. - split_id (int: default=0): - split id to fit on. - pipeline_config (Optional[Configuration]): - configuration to fit the pipeline with. If None, - uses default + configuration (Configuration): + configuration to fit the pipeline with. 
+ dataset (BaseDataset): + An object of the appropriate child class of `BaseDataset`, + that will be used to fit the pipeline + X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] + A pair of features (X_train) and targets (y_train) used to fit a + pipeline. Additionally, a holdout of these pairs (X_test, y_test) can + be provided to track the generalization performance of each stage. + dataset_name (Optional[str]): + Name of the dataset, used as experiment name. If None, a + random value is used. + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): + Strategy to split the training data. If None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + Arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + run_time_limit_secs (int: default=60): + Time limit for a single call to the machine learning model. + Model fitting will be terminated if the machine learning algorithm + runs over the time limit. Set this value high enough so that + typical machine learning algorithms can be fit on the training + data. + memory_limit (Optional[int]): + Memory limit in MB for the machine learning algorithm. autoPyTorch + will stop fitting the machine learning algorithm if it tries + to allocate more than memory_limit MB. If None is provided, + no memory limit is set. In case of multi-processing, memory_limit + will be per job. This memory limit also applies to the ensemble + creation process. + eval_metric (Optional[str]): + Name of the metric that is used to evaluate a pipeline. + all_supported_metrics (bool: default=False): + If True, all metrics supporting current task will be calculated + for each pipeline and results will be available via cv_results + budget_type (str): + Type of budget to be used when fitting the pipeline. + It can be one of: + + + `epochs`: The training of each pipeline will be terminated after + a number of epochs have passed. This number of epochs is determined by the + budget argument of this method. + + `runtime`: The training of each pipeline will be terminated after + a number of seconds have passed. This number of seconds is determined by the + budget argument of this method. The overall fitting time of a pipeline is + controlled by func_eval_time_limit_secs. 'runtime' only controls the allocated + time to train a pipeline, but it does not consider the overall time it takes + to create a pipeline (data loading and preprocessing, other i/o operations, etc.). + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Updates to be made to the hyperparameter search space of the pipeline + budget (Optional[float]): + Budget to fit a single run of the pipeline.
If not + provided, uses the default in the pipeline config + pipeline_options (Optional[Dict]): + Valid config options include "device", + "torch_num_threads", "early_stopping", "use_tensorboard_logger", + "metrics_during_training" + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. Returns: - BasePipeline: + (BasePipeline): fitted pipeline + (RunInfo): + Run information + (RunValue): + Result of fitting the pipeline + (BaseDataset): + Dataset created from the given tensors """ - self.dataset_name = dataset.dataset_name - if self._logger is None: - self._logger = self._get_logger(str(self.dataset_name)) + if dataset is None: + if ( + X_train is None + or y_train is None + ): + raise ValueError("No dataset provided, must provide X_train, y_train tensors") + dataset = self.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + # dataset_name is created inside the constructor of BaseDataset + # we expect it to be not None. This is for mypy + assert dataset.dataset_name is not None + + # TAE expects each configuration to have a config_id. + # For fitting a pipeline as it is not part of the + # search process, it makes sense to set it to 0 + configuration.__setattr__('config_id', 0) # get dataset properties dataset_requirements = get_dataset_requirements( @@ -1265,21 +1518,115 @@ def fit(self, dataset_properties = dataset.get_dataset_properties(dataset_requirements) self._backend.save_datamanager(dataset) - # build pipeline - pipeline = self.build_pipeline(dataset_properties) - if pipeline_config is not None: - pipeline.set_hyperparameters(pipeline_config) + if self._logger is None: + self._logger = self._get_logger(dataset.dataset_name) + + include_components = self.include_components if include_components is None else include_components + exclude_components = self.exclude_components if exclude_components is None else exclude_components + search_space_updates = self.search_space_updates if search_space_updates is None else search_space_updates - # initialise fit dictionary - X = self._get_fit_dictionary( - dataset_properties=dataset_properties, - dataset=dataset, - split_id=split_id) + scenario_mock = unittest.mock.Mock() + scenario_mock.wallclock_limit = run_time_limit_secs + # This stats object is a hack - maybe the SMAC stats object should + already be generated here!
+ stats = Stats(scenario_mock) + + if memory_limit is None and getattr(self, '_memory_limit', None) is not None: + memory_limit = self._memory_limit + + metric = get_metrics(dataset_properties=dataset_properties, + names=[eval_metric] if eval_metric is not None else None, + all_supported_metrics=False).pop() + + pipeline_options = {**self.pipeline_options, **pipeline_options} if pipeline_options is not None \ + else self.pipeline_options.copy() + + assert pipeline_options is not None + + if budget_type is not None: + pipeline_options.update({'budget_type': budget_type}) + else: + budget_type = pipeline_options['budget_type'] - fit_and_suppress_warnings(self._logger, pipeline, X, y=None) + budget = budget if budget is not None else pipeline_options[budget_type] + + if disable_file_output is None: + disable_file_output = getattr(self, '_disable_file_output', []) + + stats.start_timing() + + tae = ExecuteTaFuncWithQueue( + backend=self._backend, + seed=self.seed, + metric=metric, + logger_port=self._logger_port, + cost_for_crash=get_cost_of_crash(metric), + abort_on_first_run_crash=False, + initial_num_run=self._backend.get_next_num_run(), + stats=stats, + memory_limit=memory_limit, + disable_file_output=disable_file_output, + all_supported_metrics=all_supported_metrics, + budget_type=budget_type, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates, + pipeline_config=pipeline_options, + pynisher_context=self._multiprocessing_context + ) + + run_info, run_value = tae.run_wrapper( + RunInfo(config=configuration, + budget=budget, + seed=self.seed, + cutoff=run_time_limit_secs, + capped=False, + instance_specific=None, + instance=None) + ) + + fitted_pipeline = self._get_fitted_pipeline( + dataset_name=dataset.dataset_name, + pipeline_idx=run_info.config.config_id + tae.initial_num_run, + run_info=run_info, + run_value=run_value, + disable_file_output=disable_file_output + ) self._clean_logger() - return pipeline + + return fitted_pipeline, run_info, run_value, dataset + + def _get_fitted_pipeline( + self, + dataset_name: str, + pipeline_idx: int, + run_info: RunInfo, + run_value: RunValue, + disable_file_output: List[Union[str, DisableFileOutputParameters]] + ) -> Optional[BasePipeline]: + + if self._logger is None: + self._logger = self._get_logger(str(dataset_name)) + + if run_value.status != StatusType.SUCCESS: + warnings.warn(f"Fitting pipeline failed with status: {run_value.status}" + f", additional_info: {run_value.additional_info}") + return None + elif any(c in disable_file_output for c in ['all', 'pipeline']): + self._logger.warning("File output is disabled.
No pipeline can returned") + return None + + if self.resampling_strategy in CrossValTypes: + load_function = self._backend.load_cv_model_by_seed_and_id_and_budget + else: + load_function = self._backend.load_model_by_seed_and_id_and_budget + + return load_function( # type: ignore[no-any-return] + seed=self.seed, + idx=pipeline_idx, + budget=float(run_info.budget), + ) def predict( self, diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index d83f1dc01..aeb69277c 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -1,6 +1,4 @@ -import os -import uuid -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -13,11 +11,13 @@ TASK_TYPES_TO_STRING, ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, ) from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -54,13 +54,16 @@ class TabularClassificationTask(BaseTask): delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components. + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): search space updates that can be used to modify the search space of particular components or choice modules of the pipeline @@ -78,8 +81,8 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, @@ -106,18 +109,109 @@ def __init__( task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION], ) - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline: + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> TabularClassificationPipeline: """ - Build pipeline according to current task and for the passed dataset properties + Build pipeline according to current task + and for the passed dataset properties Args: - dataset_properties (Dict[str,Any]) + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline Returns: - TabularClassificationPipeline: - Pipeline compatible with the given dataset properties. + TabularClassificationPipeline + + """ + return TabularClassificationPipeline(dataset_properties=dataset_properties, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + + def _get_dataset_input_validator( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + ) -> Tuple[TabularDataset, TabularInputValidator]: """ - return TabularClassificationPipeline(dataset_properties=dataset_properties) + Returns an object of `TabularDataset` and an object of + `TabularInputValidator` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. 
+ X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + Returns: + TabularDataset: + the dataset object. + TabularInputValidator: + the input validator fitted on the data. + """ + + resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy + resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ + self.resampling_strategy_args + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + InputValidator = TabularInputValidator( + is_classification=True, + logger_port=self._logger_port, + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + dataset = TabularDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=InputValidator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + return dataset, InputValidator def search( self, @@ -138,7 +232,7 @@ def search( get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, ) -> 'BaseTask': @@ -237,10 +331,10 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -253,6 +347,9 @@ def search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. 
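Because `disable_file_output` changes from `Union[bool, List]` to a typed list in this PR, a hedged usage sketch of the new form may help here; string literals and `DisableFileOutputParameters` members can be mixed, since the enum compares equal to its string value (budget numbers below are illustrative):

```python
from autoPyTorch.evaluation.utils import DisableFileOutputParameters

estimator.search(
    X_train=X_train, y_train=y_train,
    optimize_metric='accuracy',
    total_walltime_limit=300,            # illustrative budgets
    func_eval_time_limit_secs=50,
    disable_file_output=[
        DisableFileOutputParameters.y_test,  # enum member ...
        'pipelines',                         # ... or its string value
    ],
)
```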
portfolio_selection (Optional[str]): @@ -269,32 +366,15 @@ def search( self """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) - - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TabularInputValidator( - is_classification=True, - logger_port=self._logger_port, - ) - - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - - self.dataset = TabularDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=self.InputValidator, - dataset_name=dataset_name, + self.dataset, self.InputValidator = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - ) + dataset_name=dataset_name) return self._search( dataset=self.dataset, @@ -333,7 +413,7 @@ def predict( """ if self.InputValidator is None or not self.InputValidator._is_fitted: raise ValueError("predict() is only supported after calling search. Kindly call first " - "the estimator fit() method.") + "the estimator search() method.") X_test = self.InputValidator.feature_validator.transform(X_test) predicted_probabilities = super().predict(X_test, batch_size=batch_size, @@ -353,6 +433,6 @@ def predict_proba(self, batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray: if self.InputValidator is None or not self.InputValidator._is_fitted: raise ValueError("predict() is only supported after calling search. Kindly call first " - "the estimator fit() method.") + "the estimator search() method.") X_test = self.InputValidator.feature_validator.transform(X_test) return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index a68990732..f429b210c 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -1,6 +1,4 @@ -import os -import uuid -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -13,11 +11,13 @@ TASK_TYPES_TO_STRING ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, ) from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -54,13 +54,16 @@ class TabularRegressionTask(BaseTask): delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components not to use. 
- Incompatible with include components. + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): search space updates that can be used to modify the search space of particular components or choice modules of the pipeline @@ -79,8 +82,8 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, @@ -107,18 +110,109 @@ def __init__( task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION], ) - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline: + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> TabularRegressionPipeline: """ - Build pipeline according to current task and for the passed dataset properties + Build pipeline according to current task + and for the passed dataset properties Args: - dataset_properties (Dict[str,Any]) + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline Returns: TabularRegressionPipeline: - Pipeline compatible with the given dataset properties. 
+ + """ + return TabularRegressionPipeline(dataset_properties=dataset_properties, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + + def _get_dataset_input_validator( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + ) -> Tuple[TabularDataset, TabularInputValidator]: + """ + Returns an object of `TabularDataset` and an object of + `TabularInputValidator` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + Returns: + TabularDataset: + the dataset object. + TabularInputValidator: + the input validator fitted on the data. """ - return TabularRegressionPipeline(dataset_properties=dataset_properties) + + resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy + resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ + self.resampling_strategy_args + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + InputValidator = TabularInputValidator( + is_classification=False, + logger_port=self._logger_port, + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + dataset = TabularDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=InputValidator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + return dataset, InputValidator def search( self, @@ -139,7 +233,7 @@ def search( get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, ) -> 'BaseTask': @@ -155,8 +249,8 @@ def search( A pair of features (X_train) and targets (y_train) used to fit a pipeline. Additionally, a holdout of this pairs (X_test, y_test) can be provided to track the generalization performance of each stage. - optimize_metric (str): name of the metric that is used to - evaluate a pipeline. 
+ optimize_metric (str): + Name of the metric that is used to evaluate a pipeline. budget_type (str): Type of budget to be used when fitting the pipeline. It can be one of: @@ -238,10 +332,10 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -254,6 +348,9 @@ def search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): @@ -270,32 +367,14 @@ def search( self """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) - - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TabularInputValidator( - is_classification=False, - logger_port=self._logger_port, - ) - - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - - self.dataset = TabularDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=self.InputValidator, - dataset_name=dataset_name, + self.dataset, self.InputValidator = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - ) + dataset_name=dataset_name) return self._search( dataset=self.dataset, @@ -324,7 +403,7 @@ def predict( ) -> np.ndarray: if self.InputValidator is None or not self.InputValidator._is_fitted: raise ValueError("predict() is only supported after calling search. Kindly call first " - "the estimator fit() method.") + "the estimator search() method.") X_test = self.InputValidator.feature_validator.transform(X_test) predicted_values = super().predict(X_test, batch_size=batch_size, diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index c2e229868..16335dfbb 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -35,8 +35,8 @@ class TabularDataset(BaseDataset): resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), (default=HoldoutValTypes.holdout_validation): strategy to split the training data. - resampling_strategy_args (Optional[Dict[str, Any]]): arguments - required for the chosen resampling strategy. If None, uses + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. 
If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS in ```datasets/resampling_strategy.py```. shuffle: Whether to shuffle the data before performing splits diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 027c7211a..2f792b7a8 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -33,8 +33,9 @@ ) from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType from autoPyTorch.evaluation.utils import ( + DisableFileOutputParameters, VotingRegressorWrapper, - convert_multioutput_multiclass_to_multilabel + convert_multioutput_multiclass_to_multilabel, ) from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric @@ -375,10 +376,25 @@ class AbstractEvaluator(object): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (Union[bool, List[str]]): - By default, the model, it's predictions and other metadata is stored on disk - for each finished configuration. This argument allows the user to skip - saving certain file type, for example the model, from being written to disk. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. 
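The constructor change below replaces the old bool-or-list handling with a single validated list. A small sketch of the `check_compatibility` guard it relies on (added in `evaluation/utils.py` further down this diff):

```python
from autoPyTorch.evaluation.utils import DisableFileOutputParameters

# Mixed str / enum entries pass, since both resolve to valid members.
DisableFileOutputParameters.check_compatibility(
    ['pipeline', DisableFileOutputParameters.y_test]
)

# A value outside the enum raises immediately, instead of being
# silently ignored later at file-output time.
DisableFileOutputParameters.check_compatibility(['model'])  # ValueError
```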
@@ -404,7 +420,7 @@ def __init__(self, backend: Backend, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Union[bool, List[str]] = False, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, all_supported_metrics: bool = True, @@ -448,12 +464,11 @@ def __init__(self, backend: Backend, # Flag to save target for ensemble self.output_y_hat_optimization = output_y_hat_optimization - if isinstance(disable_file_output, bool): - self.disable_file_output: bool = disable_file_output - elif isinstance(disable_file_output, List): - self.disabled_file_outputs: List[str] = disable_file_output - else: - raise ValueError('disable_file_output should be either a bool or a list') + disable_file_output = disable_file_output if disable_file_output is not None else [] + # check compatibility of disable file output + DisableFileOutputParameters.check_compatibility(disable_file_output) + + self.disable_file_output = disable_file_output self.pipeline_class: Optional[Union[BaseEstimator, BasePipeline]] = None if self.task_type in REGRESSION_TASKS: @@ -834,20 +849,17 @@ def file_output( ) # Abort if we don't want to output anything. - if hasattr(self, 'disable_file_output'): - if self.disable_file_output: - return None, {} - else: - self.disabled_file_outputs = [] + if 'all' in self.disable_file_output: + return None, {} # This file can be written independently of the others down below - if 'y_optimization' not in self.disabled_file_outputs: + if 'y_optimization' not in self.disable_file_output: if self.output_y_hat_optimization: self.backend.save_targets_ensemble(self.Y_optimization) - if hasattr(self, 'pipelines') and self.pipelines is not None: - if self.pipelines[0] is not None and len(self.pipelines) > 0: - if 'pipelines' not in self.disabled_file_outputs: + if getattr(self, 'pipelines', None) is not None: + if len(self.pipelines) > 0 and self.pipelines[0] is not None: # type: ignore[index, arg-type] + if 'pipelines' not in self.disable_file_output: if self.task_type in CLASSIFICATION_TASKS: pipelines = VotingClassifier(estimators=None, voting='soft', ) else: @@ -860,8 +872,8 @@ def file_output( else: pipelines = None - if hasattr(self, 'pipeline') and self.pipeline is not None: - if 'pipeline' not in self.disabled_file_outputs: + if getattr(self, 'pipeline', None) is not None: + if 'pipeline' not in self.disable_file_output: pipeline = self.pipeline else: pipeline = None @@ -877,15 +889,15 @@ def file_output( cv_model=pipelines, ensemble_predictions=( Y_optimization_pred if 'y_optimization' not in - self.disabled_file_outputs else None + self.disable_file_output else None ), valid_predictions=( Y_valid_pred if 'y_valid' not in - self.disabled_file_outputs else None + self.disable_file_output else None ), test_predictions=( Y_test_pred if 'y_test' not in - self.disabled_file_outputs else None + self.disable_file_output else None ), ) diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index d99251d3d..683870304 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -24,7 +24,12 @@ import autoPyTorch.evaluation.train_evaluator from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.evaluation.utils import empty_queue, extract_learning_curve, read_queue +from autoPyTorch.evaluation.utils import ( + DisableFileOutputParameters, +
empty_queue, + extract_learning_curve, + read_queue +) from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, replace_string_bool_to_bool from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -109,7 +114,7 @@ def __init__( include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, memory_limit: Optional[int] = None, - disable_file_output: bool = False, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Dict[str, Any] = None, budget_type: str = None, ta: Optional[Callable] = None, diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 37926a8c0..1bf1bce4c 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -18,6 +18,7 @@ AbstractEvaluator, fit_and_suppress_warnings ) +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -79,10 +80,25 @@ class TrainEvaluator(AbstractEvaluator): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (Union[bool, List[str]]): - By default, the model, it's predictions and other metadata is stored on disk - for each finished configuration. This argument allows the user to skip - saving certain file type, for example the model, from being written to disk. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. 
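The membership tests in the hunks above (e.g. `'y_optimization' not in self.disable_file_output`) only work with plain strings because `autoPyTorchEnum`, added in `utils/common.py` below, is a `str` subclass whose equality and hash are defined on the member value. A short demonstration of the intended semantics:

```python
from autoPyTorch.evaluation.utils import DisableFileOutputParameters

assert DisableFileOutputParameters.y_test == 'y_test'        # compares to str
assert 'y_test' in [DisableFileOutputParameters.y_test]      # membership via ==
assert hash(DisableFileOutputParameters.y_test) == hash('y_test')  # dict/set safe
# Comparing against any other type (e.g. an int) raises RuntimeError.
```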
@@ -107,7 +123,7 @@ def __init__(self, backend: Backend, queue: Queue, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Union[bool, List] = False, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, keep_models: Optional[bool] = None, @@ -397,7 +413,7 @@ def eval_function( num_run: int, include: Optional[Dict[str, Any]], exclude: Optional[Dict[str, Any]], - disable_file_output: Union[bool, List], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, pipeline_config: Optional[Dict[str, Any]] = None, budget_type: str = None, init_params: Optional[Dict[str, Any]] = None, diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py index 1bf93fa84..37e5fa36d 100644 --- a/autoPyTorch/evaluation/utils.py +++ b/autoPyTorch/evaluation/utils.py @@ -8,6 +8,9 @@ from smac.runhistory.runhistory import RunValue +from autoPyTorch.utils.common import autoPyTorchEnum + + __all__ = [ 'read_queue', 'convert_multioutput_multiclass_to_multilabel', @@ -102,3 +105,40 @@ def _predict(self, X: np.ndarray) -> np.ndarray: predictions.append(pred.ravel()) return np.asarray(predictions).T + + +class DisableFileOutputParameters(autoPyTorchEnum): + """ + Contains literals that can be passed in to `disable_file_output` list. + These include: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + """ + pipeline = 'pipeline' + pipelines = 'pipelines' + y_optimization = 'y_optimization' + y_test = 'y_test' + all = 'all' + + @classmethod + def check_compatibility( + cls, + disable_file_output: List[Union[str, 'DisableFileOutputParameters']] + ) -> None: + for item in disable_file_output: + if item not in cls.__members__ and not isinstance(item, cls): + raise ValueError(f"Expected {item} to be in the members (" + f"{list(cls.__members__.keys())}) of {cls.__name__}" + f" or as string value of a member.") diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index 7be8a233c..1488d5fcd 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -1,3 +1,4 @@ +from enum import Enum from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union from ConfigSpace.configuration_space import ConfigurationSpace @@ -75,6 +76,27 @@ def __str__(self) -> str: self.hyperparameter, self.value_range, self.default_value, self.log) +class autoPyTorchEnum(str, Enum): + """ + Utility class for enums in autoPyTorch. + Allows users to use strings, while we internally use + this enum + """ + def __eq__(self, other: Any) -> bool: + if isinstance(other, autoPyTorchEnum): + return type(self) == type(other) and self.value == other.value + elif isinstance(other, str): + return bool(self.value == other) + else: + enum_name = self.__class__.__name__ + raise RuntimeError(f"Unsupported type {type(other)}. 
" + f"{enum_name} only supports `str` and" + f"`{enum_name}`") + + def __hash__(self) -> int: + return hash(self.value) + + def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: """ In the case of not providing a y tensor, in a diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py new file mode 100644 index 000000000..453ac4636 --- /dev/null +++ b/examples/40_advanced/example_single_configuration.py @@ -0,0 +1,81 @@ +# -*- encoding: utf-8 -*- +""" +========================== +Fit a single configuration +========================== +*Auto-PyTorch* searches for the best combination of machine learning algorithms +and their hyper-parameter configuration for a given task. +This example shows how one can fit one of these pipelines, both, with a user defined +configuration, and a randomly sampled one form the configuration space. +The pipelines that Auto-PyTorch fits are compatible with Scikit-Learn API. You can +get further documentation about Scikit-Learn models here: _ +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import sklearn.datasets +import sklearn.metrics + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes + + +############################################################################ +# Data Loading +# ============ + +X, y = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, test_size=0.5, random_state=3 +) + +############################################################################ +# Define an estimator +# =================== + +estimator = TabularClassificationTask( + resampling_strategy=HoldoutValTypes.holdout_validation, + resampling_strategy_args={'val_share': 0.5}, +) + +############################################################################ +# Get a configuration of the pipeline for current dataset +# =============================================================== + +dataset = estimator.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + dataset_name='kr-vs-kp') +configuration = estimator.get_search_space(dataset).get_default_configuration() + +print("Passed Configuration:", configuration) +########################################################################### +# Fit the configuration +# ===================== + +pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, + configuration=configuration, + budget_type='epochs', + budget=10, + run_time_limit_secs=100 + ) + +# The fit_pipeline command also returns a named tuple with the pipeline constraints +print(run_info) + +# The fit_pipeline command also returns a named tuple with train/test performance +print(run_value) + +# This object complies with Scikit-Learn Pipeline API. 
diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py
new file mode 100644
index 000000000..453ac4636
--- /dev/null
+++ b/examples/40_advanced/example_single_configuration.py
@@ -0,0 +1,83 @@
+# -*- encoding: utf-8 -*-
+"""
+==========================
+Fit a single configuration
+==========================
+*Auto-PyTorch* searches for the best combination of machine learning algorithms
+and their hyper-parameter configuration for a given task.
+This example shows how one can fit one of these pipelines, both with a user-defined
+configuration and with one randomly sampled from the configuration space.
+The pipelines that Auto-PyTorch fits are compatible with the Scikit-Learn API.
+You can get further documentation about Scikit-Learn models here: _
+"""
+import os
+import tempfile as tmp
+import warnings
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import sklearn.datasets
+import sklearn.metrics
+import sklearn.model_selection
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes
+
+
+############################################################################
+# Data Loading
+# ============
+
+X, y = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+    X, y, test_size=0.5, random_state=3
+)
+
+############################################################################
+# Define an estimator
+# ===================
+
+estimator = TabularClassificationTask(
+    resampling_strategy=HoldoutValTypes.holdout_validation,
+    resampling_strategy_args={'val_share': 0.5},
+)
+
+############################################################################
+# Get a configuration of the pipeline for the current dataset
+# ===========================================================
+
+dataset = estimator.get_dataset(X_train=X_train,
+                                y_train=y_train,
+                                X_test=X_test,
+                                y_test=y_test,
+                                dataset_name='kr-vs-kp')
+configuration = estimator.get_search_space(dataset).get_default_configuration()
+
+print("Passed Configuration:", configuration)
+###########################################################################
+# Fit the configuration
+# =====================
+
+pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset,
+                                                                configuration=configuration,
+                                                                budget_type='epochs',
+                                                                budget=10,
+                                                                run_time_limit_secs=100
+                                                                )
+
+# fit_pipeline returns a named tuple (RunInfo) with the run's configuration
+# and budget constraints
+print(run_info)
+
+# It also returns a named tuple (RunValue) with train/test performance
+print(run_value)
+
+# This object complies with the Scikit-Learn Pipeline API.
+# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
+print(pipeline.named_steps)
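As a hedged aside, continuing from the example above: `run_info` and `run_value` are SMAC's `RunInfo` and `RunValue` named tuples (the tests below assert exactly this), so the fields those tests rely on can be inspected directly:

```python
# Only attributes that the new tests exercise are used here; everything
# else about RunInfo/RunValue is standard SMAC API.
print(run_info.seed, run_info.config)        # seed and the fitted Configuration
print(run_value.status, run_value.cost)      # e.g. StatusType.SUCCESS and the validation loss
print(run_value.additional_info['num_run'])  # num_run locating the backend's output folder
```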
diff --git a/examples/40_advanced/example_visualization.py b/examples/40_advanced/example_visualization.py
index 37c1c6dc3..a88899e81 100644
--- a/examples/40_advanced/example_visualization.py
+++ b/examples/40_advanced/example_visualization.py
@@ -149,18 +149,3 @@
     grid=True,
 )
 plt.show()
-
-# We then can understand the importance of each input feature using
-# a permutation importance analysis. This is done as a proof of concept, to
-# showcase that we can leverage of scikit-learn API.
-result = permutation_importance(estimator, X_train, y_train, n_repeats=5,
-                                scoring='accuracy',
-                                random_state=seed)
-sorted_idx = result.importances_mean.argsort()
-
-fig, ax = plt.subplots()
-ax.boxplot(result.importances[sorted_idx].T,
-           vert=False, labels=X_test.columns[sorted_idx])
-ax.set_title("Permutation Importances (Train set)")
-fig.tight_layout()
-plt.show()
diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py
index 5cb271eb0..fda013612 100644
--- a/test/test_api/test_api.py
+++ b/test/test_api/test_api.py
@@ -2,6 +2,7 @@
 import os
 import pathlib
 import pickle
+import tempfile
 import unittest
 
 from test.test_api.utils import dummy_do_dummy_prediction, dummy_eval_function
@@ -17,14 +18,14 @@
 import sklearn
 import sklearn.datasets
-from sklearn.base import BaseEstimator
-from sklearn.base import clone
+from sklearn.base import BaseEstimator, clone
 from sklearn.ensemble import VotingClassifier, VotingRegressor
 
-from smac.runhistory.runhistory import RunHistory
+from smac.runhistory.runhistory import RunHistory, RunInfo, RunValue
 
 from autoPyTorch.api.tabular_classification import TabularClassificationTask
 from autoPyTorch.api.tabular_regression import TabularRegressionTask
+from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.resampling_strategy import (
     CrossValTypes,
     HoldoutValTypes,
@@ -216,9 +217,6 @@ def test_tabular_classification(openml_id, resampling_strategy, backend, resampl
     # Make sure that a configuration space is stored in the estimator
     assert isinstance(estimator.get_search_space(), CS.ConfigurationSpace)
 
-    # test fit on dummy data
-    assert isinstance(estimator.fit(dataset=backend.load_datamanager()), BasePipeline)
-
 
 @pytest.mark.parametrize('openml_name', ("boston", ))
 @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function',
@@ -645,3 +643,150 @@ def test_build_pipeline(api_type, fit_dictionary_tabular):
     pipeline = api.build_pipeline(fit_dictionary_tabular['dataset_properties'])
     assert isinstance(pipeline, BaseEstimator)
     assert len(pipeline.steps) > 0
+
+
+@pytest.mark.parametrize("disable_file_output", [['all'], None])
+@pytest.mark.parametrize('openml_id', (40984,))
+@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args',
+                         ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}),
+                          (CrossValTypes.k_fold_cross_validation, {'num_splits': 2})
+                          )
+                         )
+@pytest.mark.parametrize("budget", [15, 20])
+def test_pipeline_fit(openml_id,
+                      resampling_strategy,
+                      resampling_strategy_args,
+                      backend,
+                      disable_file_output,
+                      budget,
+                      n_samples):
+    # Get the data and check that the contents of the data-manager make sense
+    X, y = sklearn.datasets.fetch_openml(
+        data_id=int(openml_id),
+        return_X_y=True, as_frame=True
+    )
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+        X[:n_samples], y[:n_samples], random_state=1)
+
+    # Build the estimator
+    estimator = TabularClassificationTask(
+        backend=backend,
+        resampling_strategy=resampling_strategy,
+    )
+
+    dataset = estimator.get_dataset(X_train=X_train,
+                                    y_train=y_train,
+                                    X_test=X_test,
+                                    y_test=y_test,
+                                    resampling_strategy=resampling_strategy,
+                                    resampling_strategy_args=resampling_strategy_args)
+
+    configuration = estimator.get_search_space(dataset).get_default_configuration()
+    pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset,
+                                                                    configuration=configuration,
+                                                                    run_time_limit_secs=50,
+                                                                    disable_file_output=disable_file_output,
+                                                                    budget_type='epochs',
+                                                                    budget=budget
+                                                                    )
+    assert isinstance(dataset, BaseDataset)
+    assert isinstance(run_info, RunInfo)
+    assert isinstance(run_info.config, Configuration)
+
+    assert isinstance(run_value, RunValue)
+    assert 'SUCCESS' in str(run_value.status)
+
+    if disable_file_output is None:
+        if resampling_strategy in CrossValTypes:
+            assert isinstance(pipeline, BaseEstimator)
+            X_test = dataset.test_tensors[0]
+            preds = pipeline.predict_proba(X_test)
+            assert isinstance(preds, np.ndarray)
+
+            score = accuracy(dataset.test_tensors[1], preds)
+            assert isinstance(score, float)
+            assert score > 0.7
+        else:
+            assert isinstance(pipeline, BasePipeline)
+            # To make sure the model was fitted, there should be a
+            # run summary object with accuracy
+            run_summary = pipeline.named_steps['trainer'].run_summary
+            assert run_summary is not None
+            X_test = dataset.test_tensors[0]
+            preds = pipeline.predict(X_test)
+            assert isinstance(preds, np.ndarray)
+
+            score = accuracy(dataset.test_tensors[1], preds)
+            assert isinstance(score, float)
+            assert score > 0.7
+    else:
+        assert pipeline is None
+        assert run_value.cost < 0.3
+
+    # Make sure that the pipeline can be pickled
+    dump_file = os.path.join(tempfile.gettempdir(), 'automl.dump.pkl')
+    with open(dump_file, 'wb') as f:
+        pickle.dump(pipeline, f)
+
+    num_run_dir = estimator._backend.get_numrun_directory(
+        run_info.seed, run_value.additional_info['num_run'], budget=float(budget))
+
+    cv_model_path = os.path.join(num_run_dir, estimator._backend.get_cv_model_filename(
+        run_info.seed, run_value.additional_info['num_run'], budget=float(budget)))
+    model_path = os.path.join(num_run_dir, estimator._backend.get_model_filename(
+        run_info.seed, run_value.additional_info['num_run'], budget=float(budget)))
+
+    if disable_file_output:
+        # No file output is expected
+        assert not os.path.exists(num_run_dir)
+    else:
+        # We always expect the model path,
+        # and the cv model only with cross validation
+        assert os.path.exists(model_path)
+        if resampling_strategy in CrossValTypes:
+            assert os.path.exists(cv_model_path)
+        elif resampling_strategy in HoldoutValTypes:
+            assert not os.path.exists(cv_model_path)
+
+
+@pytest.mark.parametrize('openml_id', (40984,))
+@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args',
+                         ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}),
+                          )
+                         )
+def test_pipeline_fit_error(
+    openml_id,
+    resampling_strategy,
+    resampling_strategy_args,
+    backend,
+    n_samples
+):
+    # Get the data and check that the contents of the data-manager make sense
+    X, y = sklearn.datasets.fetch_openml(
+        data_id=int(openml_id),
+        return_X_y=True, as_frame=True
+    )
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+        X[:n_samples], y[:n_samples], random_state=1)
+
+    # Build the estimator
+    estimator = TabularClassificationTask(
+        backend=backend,
+        resampling_strategy=resampling_strategy,
+    )
+
+    dataset = estimator.get_dataset(X_train=X_train,
+                                    y_train=y_train,
+                                    X_test=X_test,
+                                    y_test=y_test,
+                                    resampling_strategy=resampling_strategy,
+                                    resampling_strategy_args=resampling_strategy_args)
+
+    configuration = estimator.get_search_space(dataset).get_default_configuration()
+    pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset,
+                                                                    configuration=configuration,
+                                                                    run_time_limit_secs=7,
+                                                                    )
+
+    assert 'TIMEOUT' in str(run_value.status)
+    assert pipeline is None
diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py
index 126b702e6..3b379dbd6 100644
--- a/test/test_api/test_base_api.py
+++ b/test/test_api/test_base_api.py
@@ -20,6 +20,7 @@
 # ====
 @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True)
 def test_nonsupported_arguments(fit_dictionary_tabular):
+    BaseTask.__abstractmethods__ = set()
     with pytest.raises(ValueError, match=r".*Expected search space updates to be of instance.*"):
         api = BaseTask(search_space_updates='None')
 
@@ -82,6 +83,7 @@
 @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True)
 def test_show_models(fit_dictionary_tabular):
+    BaseTask.__abstractmethods__ = set()
     api = BaseTask()
     api.ensemble_ = MagicMock()
     api.models_ = [TabularClassificationPipeline(dataset_properties=fit_dictionary_tabular['dataset_properties'])]
@@ -94,6 +96,7 @@
 
 def test_set_pipeline_config():
     # checks if we can correctly change the pipeline options
+    BaseTask.__abstractmethods__ = set()
     estimator = BaseTask()
     pipeline_options = {"device": "cuda",
                        "budget_type": "epochs",
@@ -110,6 +113,7 @@
     (3, 50, 'runtime', {'budget_type': 'runtime', 'runtime': 50}),
 ])
 def test_pipeline_get_budget(fit_dictionary_tabular, min_budget, max_budget, budget_type, expected):
+    BaseTask.__abstractmethods__ = set()
     estimator = BaseTask(task_type='tabular_classification', ensemble_size=0)
 
     # Fixture pipeline config
diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py
index 6cec57fb4..a0be2c3f3 100644
--- a/test/test_evaluation/test_abstract_evaluator.py
+++ b/test/test_evaluation/test_abstract_evaluator.py
@@ -13,6 +13,7 @@
 from autoPyTorch.automl_common.common.utils.backend import Backend, BackendContext
 from autoPyTorch.evaluation.abstract_evaluator import AbstractEvaluator
+from autoPyTorch.evaluation.utils import DisableFileOutputParameters
 from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy
 
 this_directory = os.path.dirname(__file__)
@@ -129,7 +130,7 @@ def test_disable_file_output(self):
         ae = AbstractEvaluator(
             backend=self.backend_mock,
             queue=queue_mock,
-            disable_file_output=True,
+            disable_file_output=[DisableFileOutputParameters.all],
             metric=accuracy,
             logger_port=unittest.mock.Mock(),
             budget=0,
@@ -314,3 +315,35 @@ def test_error_unsupported_budget_type(self):
             self.assertIsInstance(e, ValueError)
 
         shutil.rmtree(self.working_directory, ignore_errors=True)
+
+    def test_error_unsupported_disable_file_output_parameters(self):
+        shutil.rmtree(self.working_directory, ignore_errors=True)
+        os.mkdir(self.working_directory)
+
+        queue_mock = unittest.mock.Mock()
+
+        context = BackendContext(
+            prefix='autoPyTorch',
+            temporary_directory=os.path.join(self.working_directory, 'tmp'),
+            output_directory=os.path.join(self.working_directory, 'out'),
+            delete_tmp_folder_after_terminate=True,
+            delete_output_folder_after_terminate=True,
+        )
+        with unittest.mock.patch.object(Backend, 'load_datamanager') as load_datamanager_mock:
+            load_datamanager_mock.return_value = get_multiclass_classification_datamanager()
+
+            backend = Backend(context, prefix='autoPyTorch')
+
+            try:
+                AbstractEvaluator(
+                    backend=backend,
+                    output_y_hat_optimization=False,
+                    queue=queue_mock,
+                    metric=accuracy,
+                    budget=0,
+                    configuration=1,
+                    disable_file_output=['model'])
+            except Exception as e:
+                self.assertIsInstance(e, ValueError)
+
+        shutil.rmtree(self.working_directory, ignore_errors=True)
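A small aside on the new test: it mirrors the try/except pattern of the neighbouring `test_error_unsupported_budget_type`, which passes silently if no exception is raised at all. A `unittest`-native alternative (illustrative only, not part of the patch; it reuses `backend`, `queue_mock`, and `accuracy` from the test body above) would make the expectation explicit:

```python
# Sketch: assertRaises fails the test if AbstractEvaluator stops raising,
# instead of silently passing.
with self.assertRaises(ValueError):
    AbstractEvaluator(
        backend=backend,
        output_y_hat_optimization=False,
        queue=queue_mock,
        metric=accuracy,
        budget=0,
        configuration=1,
        disable_file_output=['model'])
```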
diff --git a/test/test_evaluation/test_utils.py b/test/test_evaluation/test_utils.py
new file mode 100644
index 000000000..e81eea38b
--- /dev/null
+++ b/test/test_evaluation/test_utils.py
@@ -0,0 +1,35 @@
+"""
+Tests the functionality in autoPyTorch.evaluation.utils
+"""
+import pytest
+
+from autoPyTorch.evaluation.utils import DisableFileOutputParameters
+
+
+@pytest.mark.parametrize('disable_file_output',
+                         [['pipeline', 'pipelines'],
+                          [DisableFileOutputParameters.pipelines, DisableFileOutputParameters.pipeline]])
+def test_disable_file_output_no_error(disable_file_output):
+    """
+    Checks that `DisableFileOutputParameters.check_compatibility`
+    does not raise an error for the parameterized values of `disable_file_output`.
+
+    Args:
+        disable_file_output (List[Union[str, DisableFileOutputParameters]]):
+            Options that should be compatible with the `DisableFileOutputParameters`
+            defined in `autoPyTorch`.
+    """
+    DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output)
+
+
+def test_disable_file_output_error():
+    """
+    Checks that `DisableFileOutputParameters.check_compatibility` raises a
+    ValueError for a value not present in `DisableFileOutputParameters`, with
+    the expected error message.
+    """
+    disable_file_output = ['model']
+    with pytest.raises(ValueError, match=r"Expected .*? to be in the members (.*?) of"
+                                         r" DisableFileOutputParameters or as string value"
+                                         r" of a member."):
+        DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output)
diff --git a/test/test_utils/test_common.py b/test/test_utils/test_common.py
new file mode 100644
index 000000000..ea3dec563
--- /dev/null
+++ b/test/test_utils/test_common.py
@@ -0,0 +1,72 @@
+"""
+This tests the functionality in autoPyTorch/utils/common.
+"""
+from enum import Enum
+
+import pytest
+
+from autoPyTorch.utils.common import autoPyTorchEnum
+
+
+class SubEnum(autoPyTorchEnum):
+    x = "x"
+    y = "y"
+
+
+class DummyEnum(Enum):
+    x = "x"
+
+
+@pytest.mark.parametrize('iter',
+                         ([SubEnum.x],
+                          ["x"],
+                          {SubEnum.x: "hello"},
+                          {'x': 'hello'},
+                          SubEnum,
+                          ["x", "y"]))
+def test_autopytorch_enum(iter):
+    """
+    This test ensures that a subclass of `autoPyTorchEnum`
+    can be used interchangeably with strings.
+    Args:
+        iter (Iterable):
+            iterable to check for compatibility
+    """
+
+    e = SubEnum.x
+
+    assert e in iter
+
+
+@pytest.mark.parametrize('iter',
+                         [[SubEnum.y],
+                          ["y"],
+                          {SubEnum.y: "hello"},
+                          {'y': 'hello'}])
+def test_autopytorch_enum_false(iter):
+    """
+    This test ensures that membership checks with a subclass of
+    `autoPyTorchEnum` evaluate to False for non-matching values.
+    Args:
+        iter (Iterable):
+            iterable to check for compatibility
+    """
+
+    e = SubEnum.x
+
+    assert e not in iter
+
+
+@pytest.mark.parametrize('others', (1, 2.0, SubEnum, DummyEnum.x))
+def test_raise_errors_autopytorch_enum(others):
+    """
+    This test ensures that comparing a subclass of `autoPyTorchEnum`
+    with an unsupported type raises a RuntimeError.
+    Args:
+        others (Any):
+            Variable to compare with SubEnum.
+ """ + + with pytest.raises(RuntimeError): + SubEnum.x == others diff --git a/test/test_utils/test_results_manager.py b/test/test_utils/test_results_manager.py index 8998009a4..496aec7fa 100644 --- a/test/test_utils/test_results_manager.py +++ b/test/test_utils/test_results_manager.py @@ -352,6 +352,7 @@ def test_metric_results(metric, scores, ensemble_ends_later): def test_search_results_sprint_statistics(): + BaseTask.__abstractmethods__ = set() api = BaseTask() for method in ['get_search_results', 'sprint_statistics', 'get_incumbent_results']: with pytest.raises(RuntimeError): diff --git a/test/test_utils/test_results_visualizer.py b/test/test_utils/test_results_visualizer.py index c463fa063..e31571ef0 100644 --- a/test/test_utils/test_results_visualizer.py +++ b/test/test_utils/test_results_visualizer.py @@ -146,6 +146,7 @@ def test_set_plot_args(params): # TODO @pytest.mark.parametrize('metric_name', ('unknown', 'accuracy')) def test_raise_error_in_plot_perf_over_time_in_base_task(metric_name): + BaseTask.__abstractmethods__ = set() api = BaseTask() if metric_name == 'unknown': @@ -159,6 +160,7 @@ def test_raise_error_in_plot_perf_over_time_in_base_task(metric_name): @pytest.mark.parametrize('metric_name', ('balanced_accuracy', 'accuracy')) def test_plot_perf_over_time(metric_name): # TODO dummy_history = [{'Timestamp': datetime(2022, 1, 1), 'train_accuracy': 1, 'test_accuracy': 1}] + BaseTask.__abstractmethods__ = set() api = BaseTask() run_history_data = json.load(open(os.path.join(os.path.dirname(__file__), 'runhistory.json'),