From cdcf7664fd7ddf866829947180adcac17e1a29c6 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 30 Nov 2021 15:29:54 +0100 Subject: [PATCH 01/32] Add fit pipeline with tests --- autoPyTorch/api/base_task.py | 279 ++++++++++++++++--- autoPyTorch/api/tabular_classification.py | 97 ++++--- autoPyTorch/api/tabular_regression.py | 102 ++++--- autoPyTorch/evaluation/abstract_evaluator.py | 46 +-- autoPyTorch/evaluation/tae.py | 2 +- autoPyTorch/evaluation/train_evaluator.py | 25 +- test/test_api/test_api.py | 111 +++++++- 7 files changed, 512 insertions(+), 150 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index b4d20165e..b33c6d3ae 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -27,7 +27,7 @@ import pandas as pd -from smac.runhistory.runhistory import DataOrigin, RunHistory +from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue from smac.stats.stats import Stats from smac.tae import StatusType @@ -233,7 +233,11 @@ def __init__( " HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates))) @abstractmethod - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline: + def build_pipeline(self, dataset_properties: Dict[str, Any], + include_components: Optional[Dict] = None, + exclude_components: Optional[Dict] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> BasePipeline: """ Build pipeline according to current task and for the passed dataset properties @@ -244,7 +248,21 @@ def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline: Returns: """ - raise NotImplementedError + raise NotImplementedError("Function called on BaseTask, this can only be called by " + "specific task which is a child of the BaseTask") + + @abstractmethod + def get_dataset(self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Union[List, pd.DataFrame, np.ndarray], + y_test: Union[List, pd.DataFrame, np.ndarray], + resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + ) -> BaseDataset: + raise NotImplementedError("Function called on BaseTask, this can only be called by " + "specific task which is a child of the BaseTask") @property def run_history(self) -> RunHistory: @@ -563,7 +581,7 @@ def _do_dummy_prediction(self) -> None: initial_num_run=num_run, stats=stats, memory_limit=memory_limit, - disable_file_output=True if len(self._disable_file_output) > 0 else False, + disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics ) @@ -647,7 +665,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: initial_num_run=self._backend.get_next_num_run(), stats=stats, memory_limit=memory_limit, - disable_file_output=True if len(self._disable_file_output) > 0 else False, + disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics ) dask_futures.append([ @@ -743,7 +761,7 @@ def _search( tae_func: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[str]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None @@ -844,9 +862,8 @@ def _search( precision (int: default=32): Numeric 
precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained + disable_file_output (Optional[List]): + Used as a list to pass more fine-grained information on what to save. Allowed elements in the list are: + `y_optimization`: @@ -860,6 +877,8 @@ def _search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): @@ -901,7 +920,7 @@ def _search( self._backend.setup_logger(port=self._logger_port) self._all_supported_metrics = all_supported_metrics - self._disable_file_output = disable_file_output + self._disable_file_output = disable_file_output if disable_file_output is not None else [] self._memory_limit = memory_limit self._time_for_task = total_walltime_limit # Save start time to backend @@ -1223,10 +1242,29 @@ def refit( return self - def fit(self, - dataset: BaseDataset, - pipeline_config: Optional[Configuration] = None, - split_id: int = 0) -> BasePipeline: + def fit_pipeline( + self, + configuration: Configuration, + dataset: Optional[BaseDataset] = None, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + run_time_limit_secs: int = 60, + memory_limit: Optional[int] = None, + eval_metric: Optional[str] = None, + all_supported_metrics: bool = False, + budget_type: Optional[str] = None, + include_components: Optional[Dict] = None, + exclude_components: Optional[Dict] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + budget: Optional[float] = None, + pipeline_options: Optional[Dict] = None, + disable_file_output: Optional[List[str]] = None, + ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue, BaseDataset]: """ Fit a pipeline on the given task for the budget. A pipeline configuration can be specified if None, @@ -1237,24 +1275,110 @@ def fit(self, methods. Args: - dataset (Dataset): - The argument that will provide the dataset splits. It can either - be a dictionary with the splits, or the dataset object which can - generate the splits based on different restrictions. - split_id (int: default=0): - split id to fit on. - pipeline_config (Optional[Configuration]): - configuration to fit the pipeline with. If None, - uses default + X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] + A pair of features (X_train) and targets (y_train) used to fit a + pipeline. Additionally, a holdout of this pairs (X_test, y_test) can + be provided to track the generalization performance of each stage. + dataset_name (Optional[str]): + Name of the dataset, if None, random value is used. + resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), + (default=HoldoutValTypes.holdout_validation): + strategy to split the training data. + resampling_strategy_args (Optional[Dict[str, Any]]): + Arguments required for the chosen resampling strategy. 
If None, uses
+                the default values provided in DEFAULT_RESAMPLING_PARAMETERS
+                in ```datasets/resampling_strategy.py```.
+            run_time_limit_secs (int: default=60):
+                Time limit for a single call to the machine learning model.
+                Model fitting will be terminated if the machine learning algorithm
+                runs over the time limit. Set this value high enough so that
+                typical machine learning algorithms can be fit on the training
+                data.
+            memory_limit (Optional[int]):
+                Memory limit in MB for the machine learning algorithm. autopytorch
+                will stop fitting the machine learning algorithm if it tries
+                to allocate more than memory_limit MB. If None is provided,
+                no memory limit is set. In case of multi-processing, memory_limit
+                will be per job. This memory limit also applies to the ensemble
+                creation process.
+            eval_metric (str):
+                Name of the metric that is used to evaluate a pipeline.
+            all_supported_metrics (bool: default=False):
+                if True, all metrics supporting current task will be calculated
+                for each pipeline and results will be available via cv_results
+            budget_type (str):
+                Type of budget to be used when fitting the pipeline.
+                It can be one of:
+
+                + `epochs`: The training of each pipeline will be terminated after
+                    a number of epochs have passed. This number of epochs is determined by the
+                    budget argument of this method.
+                + `runtime`: The training of each pipeline will be terminated after
+                    a number of seconds have passed. This number of seconds is determined by the
+                    budget argument of this method. The overall fitting time of a pipeline is
+                    controlled by func_eval_time_limit_secs. 'runtime' only controls the allocated
+                    time to train a pipeline, but it does not consider the overall time it takes
+                    to create a pipeline (data loading and preprocessing, other i/o operations, etc.).
+            include_components (Optional[Dict]):
+                If None, all possible components are used.
+                Otherwise specifies set of components to use.
+            exclude_components (Optional[Dict]):
+                If None, all possible components are used.
+                Otherwise specifies set of components not to use.
+                Incompatible with include_components.
+            search_space_updates (Optional[HyperparameterSearchSpaceUpdates]):
+                Updates to be made to the hyperparameter search space of the pipeline
+            budget (Optional[float]):
+                Budget to fit a single run of the pipeline. If not
+                provided, uses the default in the pipeline config
+            pipeline_options (Optional[Dict]):
+                Valid config options include "device",
+                "torch_num_threads", "early_stopping", "use_tensorboard_logger",
+                "metrics_during_training"
+            disable_file_output (Optional[List]):
+                Used as a list to pass more fine-grained
+                information on what to save. Allowed elements in the list are:
+
+                + `y_optimization`:
+                    do not save the predictions for the optimization set,
+                    which would later on be used to build an ensemble. Note that SMAC
+                    optimizes a metric evaluated on the optimization set.
+                + `pipeline`:
+                    do not save any individual pipeline files
+                + `pipelines`:
+                    In case of cross validation, disables saving the joint model of the
+                    pipelines fit on each fold.
+                + `y_test`:
+                    do not save the predictions for the test set.
+                + `all`:
+                    do not save any of the above.
+            configuration: (Configuration)
+                configuration to fit the pipeline with. 
Returns:
-            BasePipeline:
-                fitted pipeline
+            (BasePipeline): fitted pipeline
+            (RunInfo): Run information
+            (RunValue): Result of fitting the pipeline
+            (BaseDataset): Dataset created from the given tensors
         """
-        self.dataset_name = dataset.dataset_name
 
-        if self._logger is None:
-            self._logger = self._get_logger(str(self.dataset_name))
+        if dataset is None:
+            assert X_train is not None and \
+                y_train is not None, "No dataset provided, must provide X_train, y_train tensors"
+            dataset = self.get_dataset(X_train=X_train,
+                                       y_train=y_train,
+                                       X_test=X_test,
+                                       y_test=y_test,
+                                       resampling_strategy=resampling_strategy,
+                                       resampling_strategy_args=resampling_strategy_args,
+                                       dataset_name=dataset_name
+                                       )
+
+        # TAE expects each configuration to have a config_id.
+        # As fitting a single pipeline is not part of the
+        # search process, it makes sense to set it to 0
+        if not hasattr(configuration, 'config_id') or configuration.config_id is None:
+            configuration.__setattr__('config_id', 0)
 
         # get dataset properties
         dataset_requirements = get_dataset_requirements(
@@ -1265,21 +1389,98 @@ def fit(self,
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
         self._backend.save_datamanager(dataset)
 
-        # build pipeline
-        pipeline = self.build_pipeline(dataset_properties)
-        if pipeline_config is not None:
-            pipeline.set_hyperparameters(pipeline_config)
+        if self._logger is None:
+            # dataset_name is created inside the constructor of BaseDataset
+            # we expect it to be not None. This is for mypy
+            assert dataset.dataset_name is not None
+            self._logger = self._get_logger(dataset.dataset_name)
+
+        if include_components is None:
+            include_components = self.include_components
+        if exclude_components is None:
+            exclude_components = self.exclude_components
+        if search_space_updates is None:
+            search_space_updates = self.search_space_updates
+
+        scenario_mock = unittest.mock.Mock()
+        scenario_mock.wallclock_limit = run_time_limit_secs
+        # This stats object is a hack - maybe the SMAC stats object should
+        # already be generated here!
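+        # (Only `wallclock_limit` is set explicitly on the mock; any other
+        # attribute SMAC reads from it resolves to an auto-created Mock.)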
+        stats = Stats(scenario_mock)
+
+        if memory_limit is None:
+            if hasattr(self, '_memory_limit') and self._memory_limit is not None:
+                memory_limit = self._memory_limit
+
+        metric = get_metrics(dataset_properties=dataset_properties,
+                             names=[eval_metric] if eval_metric is not None else None,
+                             all_supported_metrics=False).pop()
+
+        pipeline_options = {**self.pipeline_options, **pipeline_options} if pipeline_options is not None \
+            else self.pipeline_options.copy()
+
+        assert pipeline_options is not None
+
+        if budget_type is not None:
+            pipeline_options.update({'budget_type': budget_type})
+        else:
+            budget_type = pipeline_options['budget_type']
 
-        # initialise fit dictionary
-        X = self._get_fit_dictionary(
-            dataset_properties=dataset_properties,
-            dataset=dataset,
-            split_id=split_id)
+        budget = budget if budget is not None else pipeline_options[budget_type]
 
-        fit_and_suppress_warnings(self._logger, pipeline, X, y=None)
+        if disable_file_output is None:
+            disable_file_output = self._disable_file_output if hasattr(self, '_disable_file_output') \
+                and self._disable_file_output is not None else []
+
+        stats.start_timing()
+
+        tae = ExecuteTaFuncWithQueue(
+            backend=self._backend,
+            seed=self.seed,
+            metric=metric,
+            logger_port=self._logger_port,
+            cost_for_crash=get_cost_of_crash(metric),
+            abort_on_first_run_crash=False,
+            initial_num_run=self._backend.get_next_num_run(),
+            stats=stats,
+            memory_limit=memory_limit,
+            disable_file_output=disable_file_output,
+            all_supported_metrics=all_supported_metrics,
+            budget_type=budget_type,
+            include=include_components,
+            exclude=exclude_components,
+            search_space_updates=search_space_updates,
+            pipeline_config=pipeline_options,
+            pynisher_context=self._multiprocessing_context
+        )
+
+        run_info, run_value = tae.run_wrapper(
+            RunInfo(config=configuration,
+                    budget=budget,
+                    seed=self.seed,
+                    cutoff=run_time_limit_secs,
+                    capped=False,
+                    instance_specific=None,
+                    instance=None)
+        )
+
+        fitted_pipeline: Optional[BasePipeline] = None
+        if 'all' in disable_file_output or 'pipeline' in disable_file_output:
+            self._logger.warning("File output is disabled. 
No pipeline can be returned")
+        elif run_value.status == StatusType.SUCCESS:
+            if self.resampling_strategy in CrossValTypes:
+                load_function = self._backend.load_cv_model_by_seed_and_id_and_budget
+            else:
+                load_function = self._backend.load_model_by_seed_and_id_and_budget
+            fitted_pipeline = load_function(
+                seed=self.seed,
+                idx=run_info.config.config_id + tae.initial_num_run,
+                budget=float(run_info.budget),
+            )
 
         self._clean_logger()
-        return pipeline
+
+        return fitted_pipeline, run_info, run_value, dataset
 
     def predict(
         self,
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
index d83f1dc01..3f8019e58 100644
--- a/autoPyTorch/api/tabular_classification.py
+++ b/autoPyTorch/api/tabular_classification.py
@@ -1,5 +1,3 @@
-import os
-import uuid
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import numpy as np
@@ -106,18 +104,55 @@ def __init__(
             task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
         )
 
-    def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline:
-        """
-        Build pipeline according to current task and for the passed dataset properties
+    def build_pipeline(
+        self,
+        dataset_properties: Dict[str, Any],
+        include_components: Optional[Dict] = None,
+        exclude_components: Optional[Dict] = None,
+        search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
+    ) -> TabularClassificationPipeline:
+        return TabularClassificationPipeline(dataset_properties=dataset_properties,
+                                             include=include_components,
+                                             exclude=exclude_components,
+                                             search_space_updates=search_space_updates)
 
-        Args:
-            dataset_properties (Dict[str,Any])
+    def get_dataset(
+        self,
+        X_train: Union[List, pd.DataFrame, np.ndarray],
+        y_train: Union[List, pd.DataFrame, np.ndarray],
+        X_test: Union[List, pd.DataFrame, np.ndarray],
+        y_test: Union[List, pd.DataFrame, np.ndarray],
+        resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
+        resampling_strategy_args: Optional[Dict[str, Any]] = None,
+        dataset_name: Optional[str] = None,
+    ) -> TabularDataset:
 
-        Returns:
-            TabularClassificationPipeline:
-                Pipeline compatible with the given dataset properties. 
- """ - return TabularClassificationPipeline(dataset_properties=dataset_properties) + resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy + resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ + self.resampling_strategy_args + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + InputValidator = TabularInputValidator( + is_classification=True, + logger_port=self._logger_port, + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + dataset = TabularDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=InputValidator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + return dataset def search( self, @@ -138,7 +173,7 @@ def search( get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[str]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, ) -> 'BaseTask': @@ -237,9 +272,8 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained + disable_file_output (List): + Used as a list to pass more fine-grained information on what to save. Allowed elements in the list are: + `y_optimization`: @@ -253,6 +287,8 @@ def search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. 
portfolio_selection (Optional[str]): @@ -269,32 +305,15 @@ def search( self """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TabularInputValidator( - is_classification=True, - logger_port=self._logger_port, - ) - - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - - self.dataset = TabularDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=self.InputValidator, - dataset_name=dataset_name, + self.dataset = self.get_dataset( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - ) + dataset_name=dataset_name) return self._search( dataset=self.dataset, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index a68990732..4d35a71b3 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -1,5 +1,3 @@ -import os -import uuid from typing import Any, Callable, Dict, List, Optional, Union import numpy as np @@ -107,18 +105,55 @@ def __init__( task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION], ) - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline: - """ - Build pipeline according to current task and for the passed dataset properties + def build_pipeline( + self, + dataset_properties: Dict[str, Any], + include_components: Optional[Dict] = None, + exclude_components: Optional[Dict] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> TabularRegressionPipeline: + return TabularRegressionPipeline(dataset_properties=dataset_properties, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) - Args: - dataset_properties (Dict[str,Any]) + def get_dataset( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Union[List, pd.DataFrame, np.ndarray], + y_test: Union[List, pd.DataFrame, np.ndarray], + resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + ) -> TabularDataset: - Returns: - TabularRegressionPipeline: - Pipeline compatible with the given dataset properties. 
- """ - return TabularRegressionPipeline(dataset_properties=dataset_properties) + resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy + resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ + self.resampling_strategy_args + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + InputValidator = TabularInputValidator( + is_classification=False, + logger_port=self._logger_port, + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + dataset = TabularDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=InputValidator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + return dataset def search( self, @@ -139,7 +174,7 @@ def search( get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[str]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, ) -> 'BaseTask': @@ -155,8 +190,8 @@ def search( A pair of features (X_train) and targets (y_train) used to fit a pipeline. Additionally, a holdout of this pairs (X_test, y_test) can be provided to track the generalization performance of each stage. - optimize_metric (str): name of the metric that is used to - evaluate a pipeline. + optimize_metric (str): + Name of the metric that is used to evaluate a pipeline. budget_type (str): Type of budget to be used when fitting the pipeline. It can be one of: @@ -238,9 +273,8 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained + disable_file_output (Optional[List]): + Used as a list to pass more fine-grained information on what to save. Allowed elements in the list are: + `y_optimization`: @@ -254,6 +288,8 @@ def search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. 
portfolio_selection (Optional[str]): @@ -270,32 +306,14 @@ def search( self """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) - - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TabularInputValidator( - is_classification=False, - logger_port=self._logger_port, - ) - - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - - self.dataset = TabularDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=self.InputValidator, - dataset_name=dataset_name, + self.dataset = self.get_dataset( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - ) + dataset_name=dataset_name) return self._search( dataset=self.dataset, diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 027c7211a..93c0d0f9b 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -375,10 +375,23 @@ class AbstractEvaluator(object): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (Union[bool, List[str]]): - By default, the model, it's predictions and other metadata is stored on disk - for each finished configuration. This argument allows the user to skip - saving certain file type, for example the model, from being written to disk. + disable_file_output (Optional[List]): + Used as a list to pass more fine-grained + information on what to save. Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. 
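[Aside, not part of the diff: a minimal sketch of how the fine-grained
`disable_file_output` contract documented above is exercised from the
top-level API. The dataset id, seed and time limits are illustrative
assumptions, not values taken from this series.]

import sklearn.datasets
import sklearn.model_selection

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X, y = sklearn.datasets.fetch_openml(data_id=40984, return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1)

api = TabularClassificationTask(seed=42)
api.search(
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    optimize_metric='accuracy',
    total_walltime_limit=300,
    func_eval_time_limit_secs=50,
    # Skip writing the individual and cross-validated pipeline files, but keep
    # the 'y_optimization' predictions that ensemble building relies on.
    disable_file_output=['pipeline', 'pipelines'],
)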
@@ -404,7 +417,7 @@ def __init__(self, backend: Backend, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Union[bool, List[str]] = False, + disable_file_output: Optional[List[str]] = None, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, all_supported_metrics: bool = True, @@ -448,12 +461,7 @@ def __init__(self, backend: Backend, # Flag to save target for ensemble self.output_y_hat_optimization = output_y_hat_optimization - if isinstance(disable_file_output, bool): - self.disable_file_output: bool = disable_file_output - elif isinstance(disable_file_output, List): - self.disabled_file_outputs: List[str] = disable_file_output - else: - raise ValueError('disable_file_output should be either a bool or a list') + self.disable_file_output = disable_file_output if disable_file_output is not None else [] self.pipeline_class: Optional[Union[BaseEstimator, BasePipeline]] = None if self.task_type in REGRESSION_TASKS: @@ -835,19 +843,17 @@ def file_output( # Abort if we don't want to output anything. if hasattr(self, 'disable_file_output'): - if self.disable_file_output: + if 'all' in self.disable_file_output: return None, {} - else: - self.disabled_file_outputs = [] # This file can be written independently of the others down bellow - if 'y_optimization' not in self.disabled_file_outputs: + if 'y_optimization' not in self.disable_file_output: if self.output_y_hat_optimization: self.backend.save_targets_ensemble(self.Y_optimization) if hasattr(self, 'pipelines') and self.pipelines is not None: if self.pipelines[0] is not None and len(self.pipelines) > 0: - if 'pipelines' not in self.disabled_file_outputs: + if 'pipelines' not in self.disable_file_output: if self.task_type in CLASSIFICATION_TASKS: pipelines = VotingClassifier(estimators=None, voting='soft', ) else: @@ -861,7 +867,7 @@ def file_output( pipelines = None if hasattr(self, 'pipeline') and self.pipeline is not None: - if 'pipeline' not in self.disabled_file_outputs: + if 'pipeline' not in self.disable_file_output: pipeline = self.pipeline else: pipeline = None @@ -877,15 +883,15 @@ def file_output( cv_model=pipelines, ensemble_predictions=( Y_optimization_pred if 'y_optimization' not in - self.disabled_file_outputs else None + self.disable_file_output else None ), valid_predictions=( Y_valid_pred if 'y_valid' not in - self.disabled_file_outputs else None + self.disable_file_output else None ), test_predictions=( Y_test_pred if 'y_test' not in - self.disabled_file_outputs else None + self.disable_file_output else None ), ) diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index d99251d3d..89a9838c9 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -109,7 +109,7 @@ def __init__( include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, memory_limit: Optional[int] = None, - disable_file_output: bool = False, + disable_file_output: Optional[List] = None, init_params: Dict[str, Any] = None, budget_type: str = None, ta: Optional[Callable] = None, diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 37926a8c0..bdff3549f 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -79,10 +79,23 @@ class TrainEvaluator(AbstractEvaluator): An optional dictionary to include components of the pipeline steps. 
exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (Union[bool, List[str]]): - By default, the model, it's predictions and other metadata is stored on disk - for each finished configuration. This argument allows the user to skip - saving certain file type, for example the model, from being written to disk. + disable_file_output (Optional[List]): + Used as a list to pass more fine-grained + information on what to save. Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. @@ -107,7 +120,7 @@ def __init__(self, backend: Backend, queue: Queue, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Union[bool, List] = False, + disable_file_output: Optional[List] = [], init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, keep_models: Optional[bool] = None, @@ -397,7 +410,7 @@ def eval_function( num_run: int, include: Optional[Dict[str, Any]], exclude: Optional[Dict[str, Any]], - disable_file_output: Union[bool, List], + disable_file_output: List, pipeline_config: Optional[Dict[str, Any]] = None, budget_type: str = None, init_params: Optional[Dict[str, Any]] = None, diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 5cb271eb0..1e6009081 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -2,6 +2,7 @@ import os import pathlib import pickle +import tempfile import unittest from test.test_api.utils import dummy_do_dummy_prediction, dummy_eval_function @@ -17,14 +18,14 @@ import sklearn import sklearn.datasets -from sklearn.base import BaseEstimator -from sklearn.base import clone +from sklearn.base import BaseEstimator, clone from sklearn.ensemble import VotingClassifier, VotingRegressor -from smac.runhistory.runhistory import RunHistory +from smac.runhistory.runhistory import RunHistory, RunInfo, RunValue from autoPyTorch.api.tabular_classification import TabularClassificationTask from autoPyTorch.api.tabular_regression import TabularRegressionTask +from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, @@ -645,3 +646,107 @@ def test_build_pipeline(api_type, fit_dictionary_tabular): pipeline = api.build_pipeline(fit_dictionary_tabular['dataset_properties']) assert isinstance(pipeline, BaseEstimator) assert len(pipeline.steps) > 0 + + +@pytest.mark.parametrize("disable_file_output", [['all'], None]) +@pytest.mark.parametrize('openml_id', (40984,)) +@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', + ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}), + (CrossValTypes.k_fold_cross_validation, {'num_splits': 2}) + ) + ) +@pytest.mark.parametrize("budget", [15, 20]) +def test_pipeline_fit(openml_id, + resampling_strategy, + resampling_strategy_args, + 
backend, + disable_file_output, + budget, + n_samples): + # Get the data and check that contents of data-manager make sense + X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True + ) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X[:n_samples], y[:n_samples], random_state=1) + + # Search for a good configuration + estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=resampling_strategy, + ) + + dataset = estimator.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args) + + configuration = estimator.get_search_space(dataset).get_default_configuration() + pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, + configuration=configuration, + run_time_limit_secs=50, + disable_file_output=disable_file_output, + budget_type='epochs', + budget=budget + ) + assert isinstance(dataset, BaseDataset) + assert isinstance(run_info, RunInfo) + assert isinstance(run_info.config, Configuration) + + assert isinstance(run_value, RunValue) + assert 'SUCCESS' in str(run_value.status) + + if disable_file_output is None: + if resampling_strategy in CrossValTypes: + assert isinstance(pipeline, BaseEstimator) + X_test = dataset.test_tensors[0] + preds = pipeline.predict_proba(X_test) + assert isinstance(preds, np.ndarray) + + score = accuracy(dataset.test_tensors[1], preds) + assert isinstance(score, float) + assert score > 0.7 + else: + assert isinstance(pipeline, BasePipeline) + # To make sure we fitted the model, there should be a + # run summary object with accuracy + run_summary = pipeline.named_steps['trainer'].run_summary + assert run_summary is not None + X_test = dataset.test_tensors[0] + preds = pipeline.predict(X_test) + assert isinstance(preds, np.ndarray) + + score = accuracy(dataset.test_tensors[1], preds) + assert isinstance(score, float) + assert score > 0.7 + else: + assert pipeline is None + assert run_value.cost < 0.3 + + # Make sure that the pipeline can be pickled + dump_file = os.path.join(tempfile.gettempdir(), 'automl.dump.pkl') + with open(dump_file, 'wb') as f: + pickle.dump(pipeline, f) + + num_run_dir = estimator._backend.get_numrun_directory( + run_info.seed, run_value.additional_info['num_run'], budget=float(budget)) + + cv_model_path = os.path.join(num_run_dir, estimator._backend.get_cv_model_filename( + run_info.seed, run_value.additional_info['num_run'], budget=float(budget))) + model_path = os.path.join(num_run_dir, estimator._backend.get_model_filename( + run_info.seed, run_value.additional_info['num_run'], budget=float(budget))) + + if disable_file_output: + # No file output is expected + assert not os.path.exists(num_run_dir) + else: + # We expect the model path always + # And the cv model only on 'cv' + assert os.path.exists(model_path) + if resampling_strategy in CrossValTypes: + assert os.path.exists(cv_model_path) + elif resampling_strategy in HoldoutValTypes: + assert not os.path.exists(cv_model_path) From bc5b469ededc88599afb216f724cd964fc8a1ffb Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 30 Nov 2021 15:46:55 +0100 Subject: [PATCH 02/32] Add documentation for get dataset --- autoPyTorch/api/base_task.py | 56 +++++++++++++++++------ autoPyTorch/api/tabular_classification.py | 29 +++++++++++- autoPyTorch/api/tabular_regression.py | 29 +++++++++++- autoPyTorch/datasets/tabular_dataset.py | 4 +- 4 files changed, 99 
insertions(+), 19 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index b33c6d3ae..53c785d66 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -252,15 +252,41 @@ def build_pipeline(self, dataset_properties: Dict[str, Any], "specific task which is a child of the BaseTask") @abstractmethod - def get_dataset(self, - X_train: Union[List, pd.DataFrame, np.ndarray], - y_train: Union[List, pd.DataFrame, np.ndarray], - X_test: Union[List, pd.DataFrame, np.ndarray], - y_test: Union[List, pd.DataFrame, np.ndarray], - resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, - resampling_strategy_args: Optional[Dict[str, Any]] = None, - dataset_name: Optional[str] = None, - ) -> BaseDataset: + def get_dataset( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + ) -> BaseDataset: + """ + Returns an object of a child class of `BaseDataset` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): + Strategy to split the training data. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str], optional): + name of the dataset, used as experiment name. + + Returns: + BaseDataset: + the dataset object + """ raise NotImplementedError("Function called on BaseTask, this can only be called by " "specific task which is a child of the BaseTask") @@ -1356,10 +1382,14 @@ def fit_pipeline( configuration to fit the pipeline with. 
Returns: - (BasePipeline): fitted pipeline - (RunInfo): Run information - (RunValue): Result of fitting the pipeline - (BaseDataset): Dataset created from the given tensors + (BasePipeline): + fitted pipeline + (RunInfo): + Run information + (RunValue): + Result of fitting the pipeline + (BaseDataset): + Dataset created from the given tensors """ if dataset is None: diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 3f8019e58..289b76c7d 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -120,12 +120,37 @@ def get_dataset( self, X_train: Union[List, pd.DataFrame, np.ndarray], y_train: Union[List, pd.DataFrame, np.ndarray], - X_test: Union[List, pd.DataFrame, np.ndarray], - y_test: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, ) -> TabularDataset: + """ + Returns an object of `TabularDataset` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): + Strategy to split the training data. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str], optional): + name of the dataset, used as experiment name. + + Returns: + TabularDataset: + the dataset object + """ resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 4d35a71b3..bd5b2e937 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -121,12 +121,37 @@ def get_dataset( self, X_train: Union[List, pd.DataFrame, np.ndarray], y_train: Union[List, pd.DataFrame, np.ndarray], - X_test: Union[List, pd.DataFrame, np.ndarray], - y_test: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, ) -> TabularDataset: + """ + Returns an object of `TabularDataset` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): + Strategy to split the training data. 
+ resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str], optional): + name of the dataset, used as experiment name. + + Returns: + TabularDataset: + the dataset object + """ resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index c2e229868..16335dfbb 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -35,8 +35,8 @@ class TabularDataset(BaseDataset): resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), (default=HoldoutValTypes.holdout_validation): strategy to split the training data. - resampling_strategy_args (Optional[Dict[str, Any]]): arguments - required for the chosen resampling strategy. If None, uses + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS in ```datasets/resampling_strategy.py```. shuffle: Whether to shuffle the data before performing splits From 0359c8ccf6999626510cbcfa3d4f503f3b3c86e9 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 30 Nov 2021 16:52:54 +0100 Subject: [PATCH 03/32] update documentation --- autoPyTorch/api/base_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 53c785d66..f44062db6 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1327,7 +1327,7 @@ def fit_pipeline( no memory limit is set. In case of multi-processing, memory_limit will be per job. This memory limit also applies to the ensemble creation process. - eval_metric (str): + eval_metric (Optional[str]): Name of the metric that is used to evaluate a pipeline. 
all_supported_metrics (bool: default=False):
             if True, all metrics supporting current task will be calculated
             for each pipeline and results will be available via cv_results
 
From 75eb604f3c71d9030c3e017b783b006408f57734 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Tue, 30 Nov 2021 17:39:26 +0100
Subject: [PATCH 04/32] fix tests

---
 autoPyTorch/api/base_task.py              | 54 ++++++++++++++++++++++-
 autoPyTorch/api/tabular_classification.py | 24 +++++-----
 autoPyTorch/api/tabular_regression.py     | 22 ++++-----
 test/test_api/test_api.py                 |  3 ---
 4 files changed, 77 insertions(+), 26 deletions(-)

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index f44062db6..6fd728dd9 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -252,6 +252,48 @@ def build_pipeline(self, dataset_properties: Dict[str, Any],
                    "specific task which is a child of the BaseTask")
 
     @abstractmethod
+    def _get_dataset_input_validator(
+        self,
+        X_train: Union[List, pd.DataFrame, np.ndarray],
+        y_train: Union[List, pd.DataFrame, np.ndarray],
+        X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
+        y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
+        resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
+        resampling_strategy_args: Optional[Dict[str, Any]] = None,
+        dataset_name: Optional[str] = None,
+    ) -> Tuple[BaseDataset, BaseInputValidator]:
+        """
+        Returns an object of a child class of `BaseDataset` and
+        an object of a child class of `BaseInputValidator` according
+        to the current task.
+
+        Args:
+            X_train (Union[List, pd.DataFrame, np.ndarray]):
+                Training feature set.
+            y_train (Union[List, pd.DataFrame, np.ndarray]):
+                Training target set.
+            X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]):
+                Testing feature set
+            y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]):
+                Testing target set
+            resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]):
+                Strategy to split the training data.
+            resampling_strategy_args (Optional[Dict[str, Any]]):
+                arguments required for the chosen resampling strategy. If None, uses
+                the default values provided in DEFAULT_RESAMPLING_PARAMETERS
+                in ```datasets/resampling_strategy.py```.
+            dataset_name (Optional[str], optional):
+                name of the dataset, used as experiment name. 
+ + Returns: + BaseDataset: + the dataset object + BaseInputValidator: + fitted input validator + """ + raise NotImplementedError("Function called on BaseTask, this can only be called by " + "specific task which is a child of the BaseTask") + def get_dataset( self, X_train: Union[List, pd.DataFrame, np.ndarray], @@ -287,8 +329,16 @@ def get_dataset( BaseDataset: the dataset object """ - raise NotImplementedError("Function called on BaseTask, this can only be called by " - "specific task which is a child of the BaseTask") + dataset, _ = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name) + + return dataset @property def run_history(self) -> RunHistory: diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 289b76c7d..069121d6f 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -116,7 +116,7 @@ def build_pipeline( exclude=exclude_components, search_space_updates=search_space_updates) - def get_dataset( + def _get_dataset_input_validator( self, X_train: Union[List, pd.DataFrame, np.ndarray], y_train: Union[List, pd.DataFrame, np.ndarray], @@ -125,9 +125,10 @@ def get_dataset( resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, - ) -> TabularDataset: + ) -> Tuple[TabularDataset, TabularInputValidator]: """ - Returns an object of `TabularDataset` according to the current task. + Returns an object of `TabularDataset` and an object of + `TabularInputValidator` according to the current task. Args: X_train (Union[List, pd.DataFrame, np.ndarray]): @@ -144,12 +145,13 @@ def get_dataset( arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS in ```datasets/resampling_strategy.py```. - dataset_name (Optional[str], optional): + dataset_name (Optional[str]): name of the dataset, used as experiment name. - Returns: TabularDataset: - the dataset object + the dataset object. + TabularInputValidator: + the input validator fitted on the data. """ resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy @@ -177,7 +179,7 @@ def get_dataset( dataset_name=dataset_name ) - return dataset + return dataset, InputValidator def search( self, @@ -331,7 +333,7 @@ def search( """ - self.dataset = self.get_dataset( + self.dataset, self.InputValidator = self._get_dataset_input_validator( X_train=X_train, y_train=y_train, X_test=X_test, @@ -377,7 +379,7 @@ def predict( """ if self.InputValidator is None or not self.InputValidator._is_fitted: raise ValueError("predict() is only supported after calling search. 
Kindly call first " - "the estimator fit() method.") + "the estimator search() method.") X_test = self.InputValidator.feature_validator.transform(X_test) predicted_probabilities = super().predict(X_test, batch_size=batch_size, @@ -397,6 +399,6 @@ def predict_proba(self, batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray: if self.InputValidator is None or not self.InputValidator._is_fitted: raise ValueError("predict() is only supported after calling search. Kindly call first " - "the estimator fit() method.") + "the estimator search() method.") X_test = self.InputValidator.feature_validator.transform(X_test) return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index bd5b2e937..6828ef8ad 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -117,7 +117,7 @@ def build_pipeline( exclude=exclude_components, search_space_updates=search_space_updates) - def get_dataset( + def _get_dataset_input_validator( self, X_train: Union[List, pd.DataFrame, np.ndarray], y_train: Union[List, pd.DataFrame, np.ndarray], @@ -126,9 +126,10 @@ def get_dataset( resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, - ) -> TabularDataset: + ) -> Tuple[TabularDataset, TabularInputValidator]: """ - Returns an object of `TabularDataset` according to the current task. + Returns an object of `TabularDataset` and an object of + `TabularInputValidator` according to the current task. Args: X_train (Union[List, pd.DataFrame, np.ndarray]): @@ -145,12 +146,13 @@ def get_dataset( arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS in ```datasets/resampling_strategy.py```. - dataset_name (Optional[str], optional): + dataset_name (Optional[str]): name of the dataset, used as experiment name. - Returns: TabularDataset: - the dataset object + the dataset object. + TabularInputValidator: + the input validator fitted on the data. """ resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy @@ -178,7 +180,7 @@ def get_dataset( dataset_name=dataset_name ) - return dataset + return dataset, InputValidator def search( self, @@ -331,7 +333,7 @@ def search( self """ - self.dataset = self.get_dataset( + self.dataset, self.InputValidator = self._get_dataset_input_validator( X_train=X_train, y_train=y_train, X_test=X_test, @@ -367,7 +369,7 @@ def predict( ) -> np.ndarray: if self.InputValidator is None or not self.InputValidator._is_fitted: raise ValueError("predict() is only supported after calling search. 
Kindly call first "
-                             "the estimator fit() method.")
+                             "the estimator search() method.")
         X_test = self.InputValidator.feature_validator.transform(X_test)
         predicted_values = super().predict(X_test, batch_size=batch_size,
diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py
index 1e6009081..286dc1307 100644
--- a/test/test_api/test_api.py
+++ b/test/test_api/test_api.py
@@ -217,9 +217,6 @@ def test_tabular_classification(openml_id, resampling_strategy, backend, resampl
     # Make sure that a configuration space is stored in the estimator
     assert isinstance(estimator.get_search_space(), CS.ConfigurationSpace)
 
-    # test fit on dummy data
-    assert isinstance(estimator.fit(dataset=backend.load_datamanager()), BasePipeline)
-
 
 @pytest.mark.parametrize('openml_name', ("boston", ))
 @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function',

From 136f619be252b18c3b51044184fdb3dbf9f9012f Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Tue, 30 Nov 2021 19:43:54 +0100
Subject: [PATCH 05/32] remove permutation importance from visualisation
 example

---
 .../example_single_configuration.py           | 86 +++++++++++++++++++
 examples/40_advanced/example_visualization.py | 15 ----
 2 files changed, 86 insertions(+), 15 deletions(-)
 create mode 100644 examples/40_advanced/example_single_configuration.py

diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py
new file mode 100644
index 000000000..846118b12
--- /dev/null
+++ b/examples/40_advanced/example_single_configuration.py
@@ -0,0 +1,86 @@
+# -*- encoding: utf-8 -*-
+"""
+==========================
+Fit a single configuration
+==========================
+*Auto-PyTorch* searches for the best combination of machine learning algorithms
+and their hyper-parameter configuration for a given task.
+This example shows how one can fit one of these pipelines, both with a user-defined
+configuration and with a randomly sampled one from the configuration space.
+The pipelines that Auto-PyTorch fits are compatible with Scikit-Learn API. 
You can +get further documentation about Scikit-Learn models here: _ +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import sklearn.datasets +import sklearn.metrics + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes + + +############################################################################ +# Data Loading +# ============ + +X, y = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, test_size=0.5, random_state=3 +) + +############################################################################ +# Define an estimator +# ============================ + +estimator = TabularClassificationTask( + resampling_strategy=HoldoutValTypes.holdout_validation, + resampling_strategy_args={'val_share': 0.33}, + temporary_directory='./tmp/temp', + output_directory='./tmp/out', + delete_output_folder_after_terminate=False, + delete_tmp_folder_after_terminate=False +) + +############################################################################ +# Get a random configuration of the pipeline for current dataset +# =============================================================== + +dataset = estimator.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + dataset_name='kr-vs-kp') +configuration = estimator.get_search_space(dataset).sample_configuration() + +print("Passed Configuration:", configuration) +########################################################################### +# Fit the configuration +# ================================== + +pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, + disable_file_output=False, + configuration=configuration, + budget_type='epochs', + budget=20, + run_time_limit_secs=200 + ) + +# This object complies with Scikit-Learn Pipeline API. +# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html +print(pipeline.named_steps) + +# The fit_pipeline command also returns a named tuple with the pipeline constraints +print(run_info) + +# The fit_pipeline command also returns a named tuple with train/test performance +print(run_value) diff --git a/examples/40_advanced/example_visualization.py b/examples/40_advanced/example_visualization.py index 37c1c6dc3..a88899e81 100644 --- a/examples/40_advanced/example_visualization.py +++ b/examples/40_advanced/example_visualization.py @@ -149,18 +149,3 @@ grid=True, ) plt.show() - -# We then can understand the importance of each input feature using -# a permutation importance analysis. This is done as a proof of concept, to -# showcase that we can leverage of scikit-learn API. 
-result = permutation_importance(estimator, X_train, y_train, n_repeats=5, - scoring='accuracy', - random_state=seed) -sorted_idx = result.importances_mean.argsort() - -fig, ax = plt.subplots() -ax.boxplot(result.importances[sorted_idx].T, - vert=False, labels=X_test.columns[sorted_idx]) -ax.set_title("Permutation Importances (Train set)") -fig.tight_layout() -plt.show() From 47313634e58a7ef523a3e769a88ac307aa901ac5 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 30 Nov 2021 21:40:10 +0100 Subject: [PATCH 06/32] change disable_file_output --- examples/40_advanced/example_single_configuration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py index 846118b12..6c78559ec 100644 --- a/examples/40_advanced/example_single_configuration.py +++ b/examples/40_advanced/example_single_configuration.py @@ -68,7 +68,6 @@ # ================================== pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, - disable_file_output=False, configuration=configuration, budget_type='epochs', budget=20, From af48ebf53ae5f9d0ad8a8f792bc53612ab479b17 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 6 Dec 2021 12:57:59 +0100 Subject: [PATCH 07/32] add --- autoPyTorch/api/base_task.py | 12 ++++--- autoPyTorch/api/tabular_classification.py | 6 ++-- autoPyTorch/api/tabular_regression.py | 6 ++-- autoPyTorch/evaluation/abstract_evaluator.py | 15 +++++--- autoPyTorch/evaluation/train_evaluator.py | 8 +++-- autoPyTorch/evaluation/utils.py | 36 +++++++++++++++++++ .../example_single_configuration.py | 6 ++-- .../test_abstract_evaluator.py | 32 +++++++++++++++++ test/test_evaluation/test_utils.py | 17 +++++++++ 9 files changed, 120 insertions(+), 18 deletions(-) create mode 100644 test/test_evaluation/test_utils.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 6fd728dd9..9d1847c00 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -938,9 +938,10 @@ def _search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Optional[List]): + disable_file_output (List[Union[str, DisableFileOutputParameters]]): Used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -955,6 +956,7 @@ def _search( do not save the predictions for the test set. + `all`: do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): @@ -1411,9 +1413,10 @@ def fit_pipeline( Valid config options include "device", "torch_num_threads", "early_stopping", "use_tensorboard_logger", "metrics_during_training" - disable_file_output (Optional[List]): + disable_file_output (List[Union[str, DisableFileOutputParameters]]): Used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + information on what to save. Must be a member of `DisableFileOutputParameters`. 
+ Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -1428,6 +1431,7 @@ def fit_pipeline( do not save the predictions for the test set. + `all`: do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. configuration: (Configuration) configuration to fit the pipeline with. diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 069121d6f..255b70579 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -299,9 +299,10 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (List): + disable_file_output (List[Union[str, DisableFileOutputParameters]]): Used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -316,6 +317,7 @@ def search( do not save the predictions for the test set. + `all`: do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 6828ef8ad..d9ddfb674 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -300,9 +300,10 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Optional[List]): + disable_file_output (List[Union[str, DisableFileOutputParameters]]): Used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -317,6 +318,7 @@ def search( do not save the predictions for the test set. + `all`: do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 93c0d0f9b..d70f0b756 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -33,8 +33,9 @@ ) from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType from autoPyTorch.evaluation.utils import ( + DisableFileOutputParameters, VotingRegressorWrapper, - convert_multioutput_multiclass_to_multilabel + convert_multioutput_multiclass_to_multilabel, ) from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric @@ -375,9 +376,10 @@ class AbstractEvaluator(object): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. 
- disable_file_output (Optional[List]): + disable_file_output (List[Union[str, DisableFileOutputParameters]]): Used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -392,6 +394,7 @@ class AbstractEvaluator(object): do not save the predictions for the test set. + `all`: do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. @@ -461,7 +464,11 @@ def __init__(self, backend: Backend, # Flag to save target for ensemble self.output_y_hat_optimization = output_y_hat_optimization - self.disable_file_output = disable_file_output if disable_file_output is not None else [] + disable_file_output = disable_file_output if disable_file_output is not None else [] + # check compatibility of disable file output + DisableFileOutputParameters.check_compatibility(disable_file_output) + + self.disable_file_output = disable_file_output self.pipeline_class: Optional[Union[BaseEstimator, BasePipeline]] = None if self.task_type in REGRESSION_TASKS: diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index bdff3549f..8b07421a3 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -79,9 +79,10 @@ class TrainEvaluator(AbstractEvaluator): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (Optional[List]): + disable_file_output (List[Union[str, DisableFileOutputParameters]]): Used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -96,6 +97,7 @@ class TrainEvaluator(AbstractEvaluator): do not save the predictions for the test set. + `all`: do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. 
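To put the `disable_file_output` documentation above in context, a caller can steer these options directly from the user-facing API. The sketch below is illustrative only: the dataset id, metric and time budgets are placeholder values, and it assumes the `search()` keyword arguments shown elsewhere in this series.

import sklearn.datasets
import sklearn.model_selection

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X, y = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1
)

estimator = TabularClassificationTask()

# Keep the optimization-set predictions (needed to build the ensemble), but
# skip writing test-set predictions and the joint cross-validation models.
estimator.search(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    optimize_metric='accuracy',
    total_walltime_limit=300,
    func_eval_time_limit_secs=50,
    disable_file_output=['y_test', 'pipelines'],
)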
@@ -120,7 +122,7 @@ def __init__(self, backend: Backend, queue: Queue, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Optional[List] = [], + disable_file_output: Optional[List[str]] = None, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, keep_models: Optional[bool] = None, diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py index 1bf93fa84..f729c21f8 100644 --- a/autoPyTorch/evaluation/utils.py +++ b/autoPyTorch/evaluation/utils.py @@ -1,4 +1,5 @@ import queue +from enum import Enum from multiprocessing.queues import Queue from typing import List, Optional, Union @@ -102,3 +103,38 @@ def _predict(self, X: np.ndarray) -> np.ndarray: predictions.append(pred.ravel()) return np.asarray(predictions).T + + +class DisableFileOutputParameters(Enum): + """ + Contains literals that can be passed in to `disable_file_output` list. + These include: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + """ + pipeline = 'pipeline' + pipelines = 'pipelines' + y_optimization = 'y_optimization' + y_test = 'y_test' + all = 'all' + + @classmethod + def check_compatibility(cls, disable_file_output: List) -> None: + for item in disable_file_output: + if item not in cls.__members__: + if not isinstance(item, cls): + raise ValueError(f"Expected {item} to be in the members (" + f"{list(cls.__members__.keys())}) of {cls.__name__}" + f" or an instance.") diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py index 6c78559ec..c491c16e8 100644 --- a/examples/40_advanced/example_single_configuration.py +++ b/examples/40_advanced/example_single_configuration.py @@ -40,7 +40,7 @@ ############################################################################ # Define an estimator -# ============================ +# =================== estimator = TabularClassificationTask( resampling_strategy=HoldoutValTypes.holdout_validation, @@ -65,13 +65,13 @@ print("Passed Configuration:", configuration) ########################################################################### # Fit the configuration -# ================================== +# ===================== pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, configuration=configuration, budget_type='epochs', budget=20, - run_time_limit_secs=200 + run_time_limit_secs=70 ) # This object complies with Scikit-Learn Pipeline API. 
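The `DisableFileOutputParameters` enum added in this patch accepts both plain strings that match its member names and the enum members themselves; anything else is rejected with a `ValueError`. A short sketch of the intended behaviour, using only names defined in the hunks above:

from autoPyTorch.evaluation.utils import DisableFileOutputParameters

# Member names given as plain strings pass the check...
DisableFileOutputParameters.check_compatibility(['pipeline', 'y_test'])

# ...and so do the enum members themselves.
DisableFileOutputParameters.check_compatibility([DisableFileOutputParameters.all])

# Unknown values raise, mirroring the tests added below.
try:
    DisableFileOutputParameters.check_compatibility(['model'])
except ValueError as err:
    print(err)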
diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py index 6cec57fb4..cb16e9a35 100644 --- a/test/test_evaluation/test_abstract_evaluator.py +++ b/test/test_evaluation/test_abstract_evaluator.py @@ -314,3 +314,35 @@ def test_error_unsupported_budget_type(self): self.assertIsInstance(e, ValueError) shutil.rmtree(self.working_directory, ignore_errors=True) + + def test_error_unsupported_disable_file_output_parameters(self): + shutil.rmtree(self.working_directory, ignore_errors=True) + os.mkdir(self.working_directory) + + queue_mock = unittest.mock.Mock() + + context = BackendContext( + prefix='autoPyTorch', + temporary_directory=os.path.join(self.working_directory, 'tmp'), + output_directory=os.path.join(self.working_directory, 'out'), + delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True, + ) + with unittest.mock.patch.object(Backend, 'load_datamanager') as load_datamanager_mock: + load_datamanager_mock.return_value = get_multiclass_classification_datamanager() + + backend = Backend(context, prefix='autoPyTorch') + + try: + AbstractEvaluator( + backend=backend, + output_y_hat_optimization=False, + queue=queue_mock, + metric=accuracy, + budget=0, + configuration=1, + disable_file_output=['model']) + except Exception as e: + self.assertIsInstance(e, ValueError) + + shutil.rmtree(self.working_directory, ignore_errors=True) diff --git a/test/test_evaluation/test_utils.py b/test/test_evaluation/test_utils.py new file mode 100644 index 000000000..93d2e5195 --- /dev/null +++ b/test/test_evaluation/test_utils.py @@ -0,0 +1,17 @@ +import pytest + +from autoPyTorch.evaluation.utils import DisableFileOutputParameters + +def test_disable_file_output_string_no_error(): + disable_file_output = ['pipeline', 'pipelines'] + DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) + +def test_disable_file_output_string_error(): + disable_file_output = ['model'] + with pytest.raises(ValueError, match=r"Expected .*? to be in the members (.*?) of" + r" DisableFileOutputParameters or an instance."): + DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) + +def test_disable_file_output_enum_no_error(): + disable_file_output = [DisableFileOutputParameters.pipeline, DisableFileOutputParameters.pipelines] + DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) \ No newline at end of file From 3df4e06eca267f0fbaab885181e397164c55a5da Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 6 Dec 2021 13:00:16 +0100 Subject: [PATCH 08/32] fix flake --- test/test_evaluation/test_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/test_evaluation/test_utils.py b/test/test_evaluation/test_utils.py index 93d2e5195..a67b29e1a 100644 --- a/test/test_evaluation/test_utils.py +++ b/test/test_evaluation/test_utils.py @@ -2,16 +2,19 @@ from autoPyTorch.evaluation.utils import DisableFileOutputParameters + def test_disable_file_output_string_no_error(): disable_file_output = ['pipeline', 'pipelines'] DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) + def test_disable_file_output_string_error(): disable_file_output = ['model'] with pytest.raises(ValueError, match=r"Expected .*? to be in the members (.*?) 
of" r" DisableFileOutputParameters or an instance."): DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) + def test_disable_file_output_enum_no_error(): disable_file_output = [DisableFileOutputParameters.pipeline, DisableFileOutputParameters.pipelines] - DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) \ No newline at end of file + DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) From e8289e43490259307d5f046c49598d75ae5e6bf0 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 6 Dec 2021 15:22:25 +0100 Subject: [PATCH 09/32] fix test and examples --- examples/40_advanced/example_single_configuration.py | 6 +----- test/test_evaluation/test_abstract_evaluator.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py index c491c16e8..f8e3a6910 100644 --- a/examples/40_advanced/example_single_configuration.py +++ b/examples/40_advanced/example_single_configuration.py @@ -45,10 +45,6 @@ estimator = TabularClassificationTask( resampling_strategy=HoldoutValTypes.holdout_validation, resampling_strategy_args={'val_share': 0.33}, - temporary_directory='./tmp/temp', - output_directory='./tmp/out', - delete_output_folder_after_terminate=False, - delete_tmp_folder_after_terminate=False ) ############################################################################ @@ -71,7 +67,7 @@ configuration=configuration, budget_type='epochs', budget=20, - run_time_limit_secs=70 + run_time_limit_secs=100 ) # This object complies with Scikit-Learn Pipeline API. diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py index cb16e9a35..b08b26db4 100644 --- a/test/test_evaluation/test_abstract_evaluator.py +++ b/test/test_evaluation/test_abstract_evaluator.py @@ -129,7 +129,7 @@ def test_disable_file_output(self): ae = AbstractEvaluator( backend=self.backend_mock, queue=queue_mock, - disable_file_output=True, + disable_file_output=['all'], metric=accuracy, logger_port=unittest.mock.Mock(), budget=0, From 4018d026ae4c21f05b5fb43221f478ffd54788ee Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 6 Dec 2021 15:54:59 +0100 Subject: [PATCH 10/32] change type of disable_file_output --- autoPyTorch/api/base_task.py | 9 ++-- autoPyTorch/api/tabular_classification.py | 5 ++- autoPyTorch/api/tabular_regression.py | 5 ++- autoPyTorch/evaluation/abstract_evaluator.py | 40 ++++++++--------- autoPyTorch/evaluation/tae.py | 9 +++- autoPyTorch/evaluation/train_evaluator.py | 43 ++++++++++--------- .../test_abstract_evaluator.py | 3 +- 7 files changed, 62 insertions(+), 52 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 9d1847c00..061844059 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -45,6 +45,7 @@ from autoPyTorch.ensemble.singlebest_ensemble import SingleBest from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners @@ -837,7 +838,7 @@ def _search( tae_func: 
Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: Optional[List[str]] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None @@ -938,7 +939,7 @@ def _search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (List[Union[str, DisableFileOutputParameters]]): + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): Used as a list to pass more fine-grained information on what to save. Must be a member of `DisableFileOutputParameters`. Allowed elements in the list are: @@ -1341,7 +1342,7 @@ def fit_pipeline( search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, budget: Optional[float] = None, pipeline_options: Optional[Dict] = None, - disable_file_output: Optional[List[str]] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue, BaseDataset]: """ Fit a pipeline on the given task for the budget. @@ -1413,7 +1414,7 @@ def fit_pipeline( Valid config options include "device", "torch_num_threads", "early_stopping", "use_tensorboard_logger", "metrics_during_training" - disable_file_output (List[Union[str, DisableFileOutputParameters]]): + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): Used as a list to pass more fine-grained information on what to save. Must be a member of `DisableFileOutputParameters`. Allowed elements in the list are: diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 255b70579..7d71b84a4 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -16,6 +16,7 @@ HoldoutValTypes, ) from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -200,7 +201,7 @@ def search( get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: Optional[List[str]] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, ) -> 'BaseTask': @@ -299,7 +300,7 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (List[Union[str, DisableFileOutputParameters]]): + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): Used as a list to pass more fine-grained information on what to save. Must be a member of `DisableFileOutputParameters`. 
Allowed elements in the list are: diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index d9ddfb674..7a59651ef 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -16,6 +16,7 @@ HoldoutValTypes, ) from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -201,7 +202,7 @@ def search( get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: Optional[List[str]] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, ) -> 'BaseTask': @@ -300,7 +301,7 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (List[Union[str, DisableFileOutputParameters]]): + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): Used as a list to pass more fine-grained information on what to save. Must be a member of `DisableFileOutputParameters`. Allowed elements in the list are: diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index d70f0b756..0b7cc105f 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -376,25 +376,25 @@ class AbstractEvaluator(object): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (List[Union[str, DisableFileOutputParameters]]): - Used as a list to pass more fine-grained - information on what to save. Must be a member of `DisableFileOutputParameters`. - Allowed elements in the list are: - - + `y_optimization`: - do not save the predictions for the optimization set, - which would later on be used to build an ensemble. Note that SMAC - optimizes a metric evaluated on the optimization set. - + `pipeline`: - do not save any individual pipeline files - + `pipelines`: - In case of cross validation, disables saving the joint model of the - pipelines fit on each fold. - + `y_test`: - do not save the predictions for the test set. - + `all`: - do not save any of the above. - For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. 
init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. @@ -420,7 +420,7 @@ def __init__(self, backend: Backend, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Optional[List[str]] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, all_supported_metrics: bool = True, diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index 89a9838c9..683870304 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -24,7 +24,12 @@ import autoPyTorch.evaluation.train_evaluator from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.evaluation.utils import empty_queue, extract_learning_curve, read_queue +from autoPyTorch.evaluation.utils import ( + DisableFileOutputParameters, + empty_queue, + extract_learning_curve, + read_queue +) from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, replace_string_bool_to_bool from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -109,7 +114,7 @@ def __init__( include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, memory_limit: Optional[int] = None, - disable_file_output: Optional[List] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Dict[str, Any] = None, budget_type: str = None, ta: Optional[Callable] = None, diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 8b07421a3..1bf1bce4c 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -18,6 +18,7 @@ AbstractEvaluator, fit_and_suppress_warnings ) +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -79,25 +80,25 @@ class TrainEvaluator(AbstractEvaluator): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (List[Union[str, DisableFileOutputParameters]]): - Used as a list to pass more fine-grained - information on what to save. Must be a member of `DisableFileOutputParameters`. - Allowed elements in the list are: - - + `y_optimization`: - do not save the predictions for the optimization set, - which would later on be used to build an ensemble. Note that SMAC - optimizes a metric evaluated on the optimization set. - + `pipeline`: - do not save any individual pipeline files - + `pipelines`: - In case of cross validation, disables saving the joint model of the - pipelines fit on each fold. - + `y_test`: - do not save the predictions for the test set. - + `all`: - do not save any of the above. - For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. 
Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. @@ -122,7 +123,7 @@ def __init__(self, backend: Backend, queue: Queue, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Optional[List[str]] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, keep_models: Optional[bool] = None, @@ -412,7 +413,7 @@ def eval_function( num_run: int, include: Optional[Dict[str, Any]], exclude: Optional[Dict[str, Any]], - disable_file_output: List, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, pipeline_config: Optional[Dict[str, Any]] = None, budget_type: str = None, init_params: Optional[Dict[str, Any]] = None, diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py index b08b26db4..a0be2c3f3 100644 --- a/test/test_evaluation/test_abstract_evaluator.py +++ b/test/test_evaluation/test_abstract_evaluator.py @@ -13,6 +13,7 @@ from autoPyTorch.automl_common.common.utils.backend import Backend, BackendContext from autoPyTorch.evaluation.abstract_evaluator import AbstractEvaluator +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy this_directory = os.path.dirname(__file__) @@ -129,7 +130,7 @@ def test_disable_file_output(self): ae = AbstractEvaluator( backend=self.backend_mock, queue=queue_mock, - disable_file_output=['all'], + disable_file_output=[DisableFileOutputParameters.all], metric=accuracy, logger_port=unittest.mock.Mock(), budget=0, From add889066911b9abf153fbaec2320b01adafa5d4 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 6 Dec 2021 16:29:14 +0100 Subject: [PATCH 11/32] Address comments from eddie --- autoPyTorch/api/base_task.py | 33 ++++++++++++----------- autoPyTorch/api/tabular_classification.py | 8 +++--- autoPyTorch/api/tabular_regression.py | 11 ++++---- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 061844059..847b255af 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -11,7 +11,7 @@ import typing import unittest.mock import warnings -from abc import abstractmethod +from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace @@ -105,7 +105,7 @@ def send_warnings_to_log( return prediction -class BaseTask: +class BaseTask(ABC): """ Base class for the tasks that serve as API to the pipelines. 
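Turning `BaseTask` into an ABC forces the subclasses to implement `build_pipeline` and `get_dataset`, and it is also the reason the test fixes later in this series set `BaseTask.__abstractmethods__ = set()` before instantiating the class directly. A generic sketch of that mechanism, with placeholder names unrelated to autoPyTorch:

from abc import ABC, abstractmethod


class Base(ABC):
    @abstractmethod
    def build(self) -> None:
        raise NotImplementedError


# Base() at this point raises TypeError: the abstract method is unimplemented.

# The test-suite workaround: clearing the recorded set of abstract methods
# lets Python instantiate the class without concrete overrides.
Base.__abstractmethods__ = set()
instance = Base()  # succeeds; build() is still unimplemented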
@@ -135,10 +135,10 @@ class BaseTask: delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): + include_components (Optional[Dict[str, Any]]): If None, all possible components are used. Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): + exclude_components (Optional[Dict[str, Any]]): If None, all possible components are used. Otherwise specifies set of components not to use. Incompatible with include components @@ -160,8 +160,8 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, @@ -318,12 +318,13 @@ def get_dataset( y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): Testing target set resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): - Strategy to split the training data. + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS in ```datasets/resampling_strategy.py```. - dataset_name (Optional[str], optional): + dataset_name (Optional[str]): name of the dataset, used as experiment name. Returns: @@ -1337,8 +1338,8 @@ def fit_pipeline( eval_metric: Optional[str] = None, all_supported_metrics: bool = False, budget_type: Optional[str] = None, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, budget: Optional[float] = None, pipeline_options: Optional[Dict] = None, @@ -1360,14 +1361,14 @@ def fit_pipeline( be provided to track the generalization performance of each stage. dataset_name (Optional[str]): Name of the dataset, if None, random value is used. - resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), - (default=HoldoutValTypes.holdout_validation): - strategy to split the training data. + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation resampling_strategy_args (Optional[Dict[str, Any]]): Arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS in ```datasets/resampling_strategy.py```. - run_time_limit_secs (int: default=120): + run_time_limit_secs (int: default=60): Time limit for a single call to the machine learning model. Model fitting will be terminated if the machine learning algorithm runs over the time limit. Set this value high enough so that @@ -1398,10 +1399,10 @@ def fit_pipeline( controlled by func_eval_time_limit_secs. 'runtime' only controls the allocated time to train a pipeline, but it does not consider the overall time it takes to create a pipeline (data loading and preprocessing, other i/o operations, etc.). 
- include_components (Optional[Dict]): + include_components (Optional[Dict[str, Any]]): If None, all possible components are used. Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): + exclude_components (Optional[Dict[str, Any]]): If None, all possible components are used. Otherwise specifies set of components not to use. Incompatible with include components diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 7d71b84a4..d77c35f6a 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -53,10 +53,10 @@ class TabularClassificationTask(BaseTask): delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): + include_components (Optional[Dict[str, Any]]): If None, all possible components are used. Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): + exclude_components (Optional[Dict[str, Any]]): If None, all possible components are used. Otherwise specifies set of components not to use. Incompatible with include components. @@ -77,8 +77,8 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 7a59651ef..56d49beba 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -53,10 +53,10 @@ class TabularRegressionTask(BaseTask): delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): + include_components (Optional[Dict[str, Any]]): If None, all possible components are used. Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): + exclude_components (Optional[Dict[str, Any]]): If None, all possible components are used. Otherwise specifies set of components not to use. Incompatible with include components. @@ -78,8 +78,8 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, @@ -142,7 +142,8 @@ def _get_dataset_input_validator( y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): Testing target set resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): - Strategy to split the training data. + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. 
If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS From d8739cd91a109bf9ec92cfa4317e4540d69b6092 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 6 Dec 2021 17:01:04 +0100 Subject: [PATCH 12/32] fix docstring in api --- autoPyTorch/api/base_task.py | 63 ++++++++++++++++------- autoPyTorch/api/tabular_classification.py | 49 ++++++++++++++---- autoPyTorch/api/tabular_regression.py | 48 +++++++++++++---- 3 files changed, 123 insertions(+), 37 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 847b255af..9002d8114 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -136,12 +136,15 @@ class BaseTask(ABC): Determines whether to delete the temporary directory, when finished include_components (Optional[Dict[str, Any]]): - If None, all possible components are used. - Otherwise specifies set of components to use. + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. exclude_components (Optional[Dict[str, Any]]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): Search space updates that can be used to modify the search space of particular components or choice modules of the pipeline @@ -234,19 +237,37 @@ def __init__( " HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates))) @abstractmethod - def build_pipeline(self, dataset_properties: Dict[str, Any], - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None - ) -> BasePipeline: + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> BasePipeline: """ Build pipeline according to current task and for the passed dataset properties Args: - dataset_properties (Dict[str,Any]) + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline Returns: + BasePipeline """ raise NotImplementedError("Function called on BaseTask, this can only be called by " @@ -278,7 +299,8 @@ def _get_dataset_input_validator( y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): Testing target set resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): - Strategy to split the training data. + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS @@ -319,7 +341,7 @@ def get_dataset( Testing target set resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): Strategy to split the training data. if None, uses - HoldoutValTypes.holdout_validation + HoldoutValTypes.holdout_validation. resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS @@ -1363,7 +1385,7 @@ def fit_pipeline( Name of the dataset, if None, random value is used. resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): Strategy to split the training data. if None, uses - HoldoutValTypes.holdout_validation + HoldoutValTypes.holdout_validation. resampling_strategy_args (Optional[Dict[str, Any]]): Arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS @@ -1400,12 +1422,15 @@ def fit_pipeline( time to train a pipeline, but it does not consider the overall time it takes to create a pipeline (data loading and preprocessing, other i/o operations, etc.). include_components (Optional[Dict[str, Any]]): - If None, all possible components are used. - Otherwise specifies set of components to use. + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. exclude_components (Optional[Dict[str, Any]]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. search_space_updates(Optional[HyperparameterSearchSpaceUpdates]): Updates to be made to the hyperparameter search space of the pipeline budget (Optional[float]): diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index d77c35f6a..aeb69277c 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -11,6 +11,7 @@ TASK_TYPES_TO_STRING, ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, @@ -54,12 +55,15 @@ class TabularClassificationTask(BaseTask): Determines whether to delete the temporary directory, when finished include_components (Optional[Dict[str, Any]]): - If None, all possible components are used. 
- Otherwise specifies set of components to use. + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. exclude_components (Optional[Dict[str, Any]]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components. + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): search space updates that can be used to modify the search space of particular components or choice modules of the pipeline @@ -107,11 +111,37 @@ def __init__( def build_pipeline( self, - dataset_properties: Dict[str, Any], - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> TabularClassificationPipeline: + """ + Build pipeline according to current task + and for the passed dataset properties + + Args: + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline + + Returns: + TabularClassificationPipeline + + """ return TabularClassificationPipeline(dataset_properties=dataset_properties, include=include_components, exclude=exclude_components, @@ -141,7 +171,8 @@ def _get_dataset_input_validator( y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): Testing target set resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): - Strategy to split the training data. + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. 
If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 56d49beba..f429b210c 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -11,6 +11,7 @@ TASK_TYPES_TO_STRING ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, @@ -54,12 +55,15 @@ class TabularRegressionTask(BaseTask): Determines whether to delete the temporary directory, when finished include_components (Optional[Dict[str, Any]]): - If None, all possible components are used. - Otherwise specifies set of components to use. + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. exclude_components (Optional[Dict[str, Any]]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components. + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): search space updates that can be used to modify the search space of particular components or choice modules of the pipeline @@ -108,11 +112,37 @@ def __init__( def build_pipeline( self, - dataset_properties: Dict[str, Any], - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ) -> TabularRegressionPipeline: + """ + Build pipeline according to current task + and for the passed dataset properties + + Args: + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline + + Returns: + TabularRegressionPipeline: + + """ return TabularRegressionPipeline(dataset_properties=dataset_properties, include=include_components, exclude=exclude_components, @@ -143,7 +173,7 @@ def _get_dataset_input_validator( Testing target set resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): Strategy to split the training data. if None, uses - HoldoutValTypes.holdout_validation + HoldoutValTypes.holdout_validation. resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. 
If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS From f1ea974e4acbfc19ad63d07159743195eb5ed112 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 6 Dec 2021 20:10:12 +0100 Subject: [PATCH 13/32] fix tests for base api --- test/test_api/test_base_api.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py index 126b702e6..44319b809 100644 --- a/test/test_api/test_base_api.py +++ b/test/test_api/test_base_api.py @@ -1,7 +1,7 @@ import logging import re import unittest -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import numpy as np @@ -20,6 +20,7 @@ # ==== @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True) def test_nonsupported_arguments(fit_dictionary_tabular): + BaseTask.__abstractmethods__ = set() with pytest.raises(ValueError, match=r".*Expected search space updates to be of instance.*"): api = BaseTask(search_space_updates='None') @@ -82,6 +83,7 @@ def test_pipeline_predict_function(): @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True) def test_show_models(fit_dictionary_tabular): + BaseTask.__abstractmethods__ = set() api = BaseTask() api.ensemble_ = MagicMock() api.models_ = [TabularClassificationPipeline(dataset_properties=fit_dictionary_tabular['dataset_properties'])] @@ -94,6 +96,7 @@ def test_show_models(fit_dictionary_tabular): def test_set_pipeline_config(): # checks if we can correctly change the pipeline options + BaseTask.__abstractmethods__ = set() estimator = BaseTask() pipeline_options = {"device": "cuda", "budget_type": "epochs", @@ -110,6 +113,7 @@ def test_set_pipeline_config(): (3, 50, 'runtime', {'budget_type': 'runtime', 'runtime': 50}), ]) def test_pipeline_get_budget(fit_dictionary_tabular, min_budget, max_budget, budget_type, expected): + BaseTask.__abstractmethods__ = set() estimator = BaseTask(task_type='tabular_classification', ensemble_size=0) # Fixture pipeline config From 38471f1aec556a276c4560e4e011b2171126f1ce Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 6 Dec 2021 20:12:53 +0100 Subject: [PATCH 14/32] fix tests for base api --- test/test_api/test_base_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py index 44319b809..3b379dbd6 100644 --- a/test/test_api/test_base_api.py +++ b/test/test_api/test_base_api.py @@ -1,7 +1,7 @@ import logging import re import unittest -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import numpy as np From 02ac9de77215e3bc2562be1d164daed44dd8238d Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 6 Dec 2021 21:13:18 +0100 Subject: [PATCH 15/32] fix tests after rebase --- test/test_utils/test_results_manager.py | 1 + test/test_utils/test_results_visualizer.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/test/test_utils/test_results_manager.py b/test/test_utils/test_results_manager.py index 8998009a4..496aec7fa 100644 --- a/test/test_utils/test_results_manager.py +++ b/test/test_utils/test_results_manager.py @@ -352,6 +352,7 @@ def test_metric_results(metric, scores, ensemble_ends_later): def test_search_results_sprint_statistics(): + BaseTask.__abstractmethods__ = set() api = BaseTask() for method in ['get_search_results', 'sprint_statistics', 'get_incumbent_results']: with pytest.raises(RuntimeError): diff --git 
a/test/test_utils/test_results_visualizer.py b/test/test_utils/test_results_visualizer.py index c463fa063..e31571ef0 100644 --- a/test/test_utils/test_results_visualizer.py +++ b/test/test_utils/test_results_visualizer.py @@ -146,6 +146,7 @@ def test_set_plot_args(params): # TODO @pytest.mark.parametrize('metric_name', ('unknown', 'accuracy')) def test_raise_error_in_plot_perf_over_time_in_base_task(metric_name): + BaseTask.__abstractmethods__ = set() api = BaseTask() if metric_name == 'unknown': @@ -159,6 +160,7 @@ def test_raise_error_in_plot_perf_over_time_in_base_task(metric_name): @pytest.mark.parametrize('metric_name', ('balanced_accuracy', 'accuracy')) def test_plot_perf_over_time(metric_name): # TODO dummy_history = [{'Timestamp': datetime(2022, 1, 1), 'train_accuracy': 1, 'test_accuracy': 1}] + BaseTask.__abstractmethods__ = set() api = BaseTask() run_history_data = json.load(open(os.path.join(os.path.dirname(__file__), 'runhistory.json'), From fd32939a59aa3236fd228cadb91afec202eed709 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 7 Dec 2021 10:57:34 +0100 Subject: [PATCH 16/32] reduce dataset size in example --- examples/40_advanced/example_single_configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py index f8e3a6910..a0fe454b6 100644 --- a/examples/40_advanced/example_single_configuration.py +++ b/examples/40_advanced/example_single_configuration.py @@ -44,7 +44,7 @@ estimator = TabularClassificationTask( resampling_strategy=HoldoutValTypes.holdout_validation, - resampling_strategy_args={'val_share': 0.33}, + resampling_strategy_args={'val_share': 0.5}, ) ############################################################################ From 39587505659b9e9e586930d51de9a9d47b81582f Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 7 Dec 2021 10:59:05 +0100 Subject: [PATCH 17/32] remove optional from doc string --- autoPyTorch/api/base_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 9002d8114..2839f86e8 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -305,7 +305,7 @@ def _get_dataset_input_validator( arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS in ```datasets/resampling_strategy.py```. - dataset_name (Optional[str], optional): + dataset_name (Optional[str]): name of the dataset, used as experiment name. Returns: From c33381ad6f60da9d41dd8b78bd0eb302ea3798f4 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 7 Dec 2021 11:42:10 +0100 Subject: [PATCH 18/32] Handle unsuccessful fitting of pipeline better --- autoPyTorch/api/base_task.py | 28 +++++++------ .../example_single_configuration.py | 8 ++-- test/test_api/test_api.py | 42 +++++++++++++++++++ 3 files changed, 62 insertions(+), 16 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 2839f86e8..6dde20560 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1576,19 +1576,23 @@ def fit_pipeline( ) fitted_pipeline: Optional[BasePipeline] = None - if 'all' in disable_file_output or 'pipeline' in disable_file_output: - self._logger.warning("File output is disabled. 
No pipeline can returned") - elif run_value.status == StatusType.SUCCESS: - if self.resampling_strategy in CrossValTypes: - load_function = self._backend.load_cv_model_by_seed_and_id_and_budget - else: - load_function = self._backend.load_model_by_seed_and_id_and_budget - fitted_pipeline = load_function( - seed=self.seed, - idx=run_info.config.config_id + tae.initial_num_run, - budget=float(run_info.budget), - ) + if run_value.status == StatusType.SUCCESS: + if 'all' in disable_file_output or 'pipeline' in disable_file_output: + self._logger.warning("File output is disabled. No pipeline can returned") + elif run_value.status == StatusType.SUCCESS: + if self.resampling_strategy in CrossValTypes: + load_function = self._backend.load_cv_model_by_seed_and_id_and_budget + else: + load_function = self._backend.load_model_by_seed_and_id_and_budget + fitted_pipeline = load_function( + seed=self.seed, + idx=run_info.config.config_id + tae.initial_num_run, + budget=float(run_info.budget), + ) + else: + warnings.warn(f"Fitting pipeline failed with status: {run_value.status}" + f", aditional_info: {run_value.additional_info}") self._clean_logger() return fitted_pipeline, run_info, run_value, dataset diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py index a0fe454b6..564fb71e3 100644 --- a/examples/40_advanced/example_single_configuration.py +++ b/examples/40_advanced/example_single_configuration.py @@ -70,12 +70,12 @@ run_time_limit_secs=100 ) -# This object complies with Scikit-Learn Pipeline API. -# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html -print(pipeline.named_steps) - # The fit_pipeline command also returns a named tuple with the pipeline constraints print(run_info) # The fit_pipeline command also returns a named tuple with train/test performance print(run_value) + +# This object complies with Scikit-Learn Pipeline API. 
+# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html +print(pipeline.named_steps) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 286dc1307..25180a775 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -747,3 +747,45 @@ def test_pipeline_fit(openml_id, assert os.path.exists(cv_model_path) elif resampling_strategy in HoldoutValTypes: assert not os.path.exists(cv_model_path) + +@pytest.mark.parametrize('openml_id', (40984,)) +@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', + ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}), + ) + ) +def test_pipeline_fit_error( + openml_id, + resampling_strategy, + resampling_strategy_args, + backend, + n_samples +): + # Get the data and check that contents of data-manager make sense + X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True + ) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X[:n_samples], y[:n_samples], random_state=1) + + # Search for a good configuration + estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=resampling_strategy, + ) + + dataset = estimator.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args) + + configuration = estimator.get_search_space(dataset).get_default_configuration() + pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, + configuration=configuration, + run_time_limit_secs=7, + ) + + assert 'TIMEOUT' in str(run_value.status) + assert pipeline is None From dff0e5c71b9c8b2e53359421d4334cc7e2f81c32 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 7 Dec 2021 11:45:45 +0100 Subject: [PATCH 19/32] fix flake in tests --- test/test_api/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 25180a775..fda013612 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -748,6 +748,7 @@ def test_pipeline_fit(openml_id, elif resampling_strategy in HoldoutValTypes: assert not os.path.exists(cv_model_path) + @pytest.mark.parametrize('openml_id', (40984,)) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}), From eb648e5b768196ceb08485e433be1c0b99fbeae3 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 7 Dec 2021 14:01:24 +0100 Subject: [PATCH 20/32] change to default configuration for documentation --- examples/40_advanced/example_single_configuration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py index 564fb71e3..23581f3f8 100644 --- a/examples/40_advanced/example_single_configuration.py +++ b/examples/40_advanced/example_single_configuration.py @@ -48,7 +48,7 @@ ) ############################################################################ -# Get a random configuration of the pipeline for current dataset +# Get a configuration of the pipeline for current dataset # =============================================================== dataset = estimator.get_dataset(X_train=X_train, @@ -56,7 +56,7 @@ X_test=X_test, y_test=y_test, dataset_name='kr-vs-kp') -configuration = estimator.get_search_space(dataset).sample_configuration() +configuration = 
estimator.get_search_space(dataset).get_default_configuration() print("Passed Configuration:", configuration) ########################################################################### From 974ea1c6b3b1600f054e9c3d3953837364c088f1 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 7 Dec 2021 14:35:33 +0100 Subject: [PATCH 21/32] add warning for no ensemble created when y_optimization in disable_file_output --- autoPyTorch/api/base_task.py | 8 ++++++++ autoPyTorch/evaluation/utils.py | 19 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 6dde20560..c6b77dbc4 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1023,6 +1023,14 @@ def _search( self._all_supported_metrics = all_supported_metrics self._disable_file_output = disable_file_output if disable_file_output is not None else [] + if ( + DisableFileOutputParameters.check_value_in_iterable(self._disable_file_output, + DisableFileOutputParameters.y_optimization) + and self.ensemble_size > 1 + ): + self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}" + f" is in disable_file_output") + self._memory_limit = memory_limit self._time_for_task = total_walltime_limit # Save start time to backend diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py index f729c21f8..3b6ed0669 100644 --- a/autoPyTorch/evaluation/utils.py +++ b/autoPyTorch/evaluation/utils.py @@ -1,7 +1,7 @@ import queue from enum import Enum from multiprocessing.queues import Queue -from typing import List, Optional, Union +from typing import Iterable, List, Optional, Union import numpy as np @@ -138,3 +138,20 @@ def check_compatibility(cls, disable_file_output: List) -> None: raise ValueError(f"Expected {item} to be in the members (" f"{list(cls.__members__.keys())}) of {cls.__name__}" f" or an instance.") + + @staticmethod + def check_value_in_iterable(container: Iterable, parameter: "DisableFileOutputParameters") -> bool: + """ + checks if parameter is in the container either as + the parameter itself or as its value. + + Args: + container (Iterable): + Iterable to check in. + parameter (DisableFileOutputParameters): + parameter to check. + Returns: + bool: + whether parameter is in `container` + """ + return parameter in container or parameter.value in container From cc19e4c6f28917d3c7671672daac7ebaabe2d678 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 7 Dec 2021 17:04:38 +0100 Subject: [PATCH 22/32] reduce budget for single configuration --- examples/40_advanced/example_single_configuration.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py index 23581f3f8..e4a3078f9 100644 --- a/examples/40_advanced/example_single_configuration.py +++ b/examples/40_advanced/example_single_configuration.py @@ -66,7 +66,7 @@ pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, configuration=configuration, budget_type='epochs', - budget=20, + budget=10, run_time_limit_secs=100 ) @@ -78,4 +78,5 @@ # This object complies with Scikit-Learn Pipeline API. 
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
+if pipeline is not None:
     print(pipeline.named_steps)

From ab93ee6df0997fcdbb3494b634b44d093a47bda5 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Tue, 7 Dec 2021 17:44:54 +0100
Subject: [PATCH 23/32] address comments from eddie

---
 autoPyTorch/api/base_task.py        |  4 +--
 autoPyTorch/evaluation/utils.py     |  9 +++---
 .../example_single_configuration.py |  1 -
 test/test_evaluation/test_utils.py  | 30 ++++++++++++++-----
 4 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index c6b77dbc4..904fc8778 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -1588,7 +1588,7 @@ def fit_pipeline(
         if run_value.status == StatusType.SUCCESS:
             if 'all' in disable_file_output or 'pipeline' in disable_file_output:
                 self._logger.warning("File output is disabled. No pipeline can be returned")
-            elif run_value.status == StatusType.SUCCESS:
+            else:
                 if self.resampling_strategy in CrossValTypes:
                     load_function = self._backend.load_cv_model_by_seed_and_id_and_budget
                 else:
@@ -1600,7 +1600,7 @@ def fit_pipeline(
                 )
             else:
                 warnings.warn(f"Fitting pipeline failed with status: {run_value.status}"
-                              f", aditional_info: {run_value.additional_info}")
+                              f", additional_info: {run_value.additional_info}")
diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py
index 3b6ed0669..0dacb23b7 100644
--- a/autoPyTorch/evaluation/utils.py
+++ b/autoPyTorch/evaluation/utils.py
@@ -133,11 +133,10 @@ class DisableFileOutputParameters(Enum):
     @classmethod
     def check_compatibility(cls, disable_file_output: List) -> None:
         for item in disable_file_output:
-            if item not in cls.__members__:
-                if not isinstance(item, cls):
-                    raise ValueError(f"Expected {item} to be in the members ("
-                                     f"{list(cls.__members__.keys())}) of {cls.__name__}"
-                                     f" or an instance.")
+            if item not in cls.__members__ and not isinstance(item, cls):
+                raise ValueError(f"Expected {item} to be in the members ("
+                                 f"{list(cls.__members__.keys())}) of {cls.__name__}"
+                                 f" or an instance.")
diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py
index e4a3078f9..453ac4636 100644
--- a/examples/40_advanced/example_single_configuration.py
+++ b/examples/40_advanced/example_single_configuration.py
@@ -78,5 +78,4 @@
 # This object complies with Scikit-Learn Pipeline API.
 # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
-if pipeline is not None:
 print(pipeline.named_steps)
diff --git a/test/test_evaluation/test_utils.py b/test/test_evaluation/test_utils.py
index a67b29e1a..c6743ac47 100644
--- a/test/test_evaluation/test_utils.py
+++ b/test/test_evaluation/test_utils.py
@@ -1,20 +1,34 @@
+"""
+Tests the functionality in autoPyTorch.evaluation.utils
+"""
 import pytest

 from autoPyTorch.evaluation.utils import DisableFileOutputParameters


-def test_disable_file_output_string_no_error():
-    disable_file_output = ['pipeline', 'pipelines']
+@pytest.mark.parametrize('disable_file_output',
+                         [['pipeline', 'pipelines'],
+                          [DisableFileOutputParameters.pipelines, DisableFileOutputParameters.pipeline]])
+def test_disable_file_output_no_error(disable_file_output):
+    """
+    Checks that `DisableFileOutputParameters.check_compatibility`
+    does not raise an error for the parameterized values of `disable_file_output`.
+
+    Args:
+        disable_file_output ([List[Union[str, DisableFileOutputParameters]]]):
+            Options that should be compatible with the `DisableFileOutputParameters`
+            defined in `autoPyTorch`.
+    """
     DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output)


-def test_disable_file_output_string_error():
+def test_disable_file_output_error():
+    """
+    Checks that `DisableFileOutputParameters.check_compatibility` raises an error
+    for a value not present in `DisableFileOutputParameters` and ensures that the
+    expected error is raised.
+    """
     disable_file_output = ['model']
     with pytest.raises(ValueError, match=r"Expected .*? to be in the members (.*?) of"
                                          r" DisableFileOutputParameters or an instance."):
         DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output)
-
-
-def test_disable_file_output_enum_no_error():
-    disable_file_output = [DisableFileOutputParameters.pipeline, DisableFileOutputParameters.pipelines]
-    DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output)
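A usage sketch of the contract that PATCH 23 settles on for `disable_file_output`. This mirrors the tests rewritten above and assumes only the `DisableFileOutputParameters` API shown in these diffs:

    from autoPyTorch.evaluation.utils import DisableFileOutputParameters

    # Plain strings and enum members are interchangeable:
    DisableFileOutputParameters.check_compatibility(
        disable_file_output=['pipeline', DisableFileOutputParameters.pipelines])

    # An unknown option such as 'model' raises the ValueError asserted in
    # test_disable_file_output_error above:
    DisableFileOutputParameters.check_compatibility(disable_file_output=['model'])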
From c246b20aafd740ae8d311762906d48dd92470117 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Thu, 9 Dec 2021 10:42:29 +0100
Subject: [PATCH 24/32] address comments from shuhei

---
 autoPyTorch/api/base_task.py                 | 4 ++--
 autoPyTorch/evaluation/abstract_evaluator.py | 5 ++---
 autoPyTorch/evaluation/utils.py              | 9 ++++++---
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index 904fc8778..db374b1b8 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -1024,8 +1024,8 @@ def _search(
         self._all_supported_metrics = all_supported_metrics
         self._disable_file_output = disable_file_output if disable_file_output is not None else []
         if (
-            DisableFileOutputParameters.check_value_in_iterable(self._disable_file_output,
-                                                                DisableFileOutputParameters.y_optimization)
+            DisableFileOutputParameters.is_in_iterable(self._disable_file_output,
+                                                       DisableFileOutputParameters.y_optimization)
             and self.ensemble_size > 1
         ):
             self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}"
diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
index 0b7cc105f..b21183dda 100644
--- a/autoPyTorch/evaluation/abstract_evaluator.py
+++ b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -849,9 +849,8 @@
         )

         # Abort if we don't want to output anything. 
- if hasattr(self, 'disable_file_output'): - if 'all' in self.disable_file_output: - return None, {} + if 'all' in self.disable_file_output: + return None, {} # This file can be written independently of the others down bellow if 'y_optimization' not in self.disable_file_output: diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py index 0dacb23b7..d75b70a60 100644 --- a/autoPyTorch/evaluation/utils.py +++ b/autoPyTorch/evaluation/utils.py @@ -131,15 +131,18 @@ class DisableFileOutputParameters(Enum): all = 'all' @classmethod - def check_compatibility(cls, disable_file_output: List) -> None: + def check_compatibility( + cls, + disable_file_output: List[Union[str, 'DisableFileOutputParameters']] + ) -> None: for item in disable_file_output: if item not in cls.__members__ and not isinstance(item, cls): raise ValueError(f"Expected {item} to be in the members (" f"{list(cls.__members__.keys())}) of {cls.__name__}" - f" or an instance.") + f" either as an instance or the string value of the member.") @staticmethod - def check_value_in_iterable(container: Iterable, parameter: "DisableFileOutputParameters") -> bool: + def is_in_iterable(container: Iterable, parameter: "DisableFileOutputParameters") -> bool: """ checks if parameter is in the container either as the parameter itself or as its value. From a0a4e757cefc095480523d447778bc7f2c775c53 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 9 Dec 2021 12:56:24 +0100 Subject: [PATCH 25/32] Add autoPyTorchEnum --- autoPyTorch/api/base_task.py | 3 +-- autoPyTorch/evaluation/utils.py | 27 ++++++-------------------- autoPyTorch/utils/common.py | 20 +++++++++++++++++++ test/test_evaluation/test_utils.py | 3 ++- test/test_utils/test_common.py | 31 ++++++++++++++++++++++++++++++ 5 files changed, 60 insertions(+), 24 deletions(-) create mode 100644 test/test_utils/test_common.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index db374b1b8..ac1df9950 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1024,8 +1024,7 @@ def _search( self._all_supported_metrics = all_supported_metrics self._disable_file_output = disable_file_output if disable_file_output is not None else [] if ( - DisableFileOutputParameters.is_in_iterable(self._disable_file_output, - DisableFileOutputParameters.y_optimization) + DisableFileOutputParameters.y_optimization in self._disable_file_output and self.ensemble_size > 1 ): self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}" diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py index d75b70a60..37e5fa36d 100644 --- a/autoPyTorch/evaluation/utils.py +++ b/autoPyTorch/evaluation/utils.py @@ -1,7 +1,6 @@ import queue -from enum import Enum from multiprocessing.queues import Queue -from typing import Iterable, List, Optional, Union +from typing import List, Optional, Union import numpy as np @@ -9,6 +8,9 @@ from smac.runhistory.runhistory import RunValue +from autoPyTorch.utils.common import autoPyTorchEnum + + __all__ = [ 'read_queue', 'convert_multioutput_multiclass_to_multilabel', @@ -105,7 +107,7 @@ def _predict(self, X: np.ndarray) -> np.ndarray: return np.asarray(predictions).T -class DisableFileOutputParameters(Enum): +class DisableFileOutputParameters(autoPyTorchEnum): """ Contains literals that can be passed in to `disable_file_output` list. 
These include: @@ -139,21 +141,4 @@ def check_compatibility( if item not in cls.__members__ and not isinstance(item, cls): raise ValueError(f"Expected {item} to be in the members (" f"{list(cls.__members__.keys())}) of {cls.__name__}" - f" either as an instance or the string value of the member.") - - @staticmethod - def is_in_iterable(container: Iterable, parameter: "DisableFileOutputParameters") -> bool: - """ - checks if parameter is in the container either as - the parameter itself or as its value. - - Args: - container (Iterable): - Iterable to check in. - parameter (DisableFileOutputParameters): - parameter to check. - Returns: - bool: - whether parameter is in `container` - """ - return parameter in container or parameter.value in container + f" or as string value of a member.") diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index 7be8a233c..8da9ad6c7 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -1,3 +1,4 @@ +from enum import Enum from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union from ConfigSpace.configuration_space import ConfigurationSpace @@ -75,6 +76,25 @@ def __str__(self) -> str: self.hyperparameter, self.value_range, self.default_value, self.log) +class autoPyTorchEnum(str, Enum): + """ + Utility class for enums in autoPyTorch. + Allows users to use strings, while we internally use + this enum + """ + def __eq__(self, other: Any) -> bool: + if isinstance(other, autoPyTorchEnum): + return type(self) == type(other) and self.value == other.value + elif isinstance(other, str): + return bool(self.value == other) + else: + raise RuntimeError(f"Unsupported type {type(other)}." + f"{self} only supports `str` and `{self}`") + + def __hash__(self) -> int: + return hash(self.value) + + def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: """ In the case of not providing a y tensor, in a diff --git a/test/test_evaluation/test_utils.py b/test/test_evaluation/test_utils.py index c6743ac47..e81eea38b 100644 --- a/test/test_evaluation/test_utils.py +++ b/test/test_evaluation/test_utils.py @@ -30,5 +30,6 @@ def test_disable_file_output_error(): """ disable_file_output = ['model'] with pytest.raises(ValueError, match=r"Expected .*? to be in the members (.*?) of" - r" DisableFileOutputParameters or an instance."): + r" DisableFileOutputParameters or as string value" + r" of a member."): DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) diff --git a/test/test_utils/test_common.py b/test/test_utils/test_common.py new file mode 100644 index 000000000..45044680b --- /dev/null +++ b/test/test_utils/test_common.py @@ -0,0 +1,31 @@ +""" +This tests the functionality in autoPyTorch/utils/common. +""" +import pytest + +from autoPyTorch.utils.common import autoPyTorchEnum + + +class SubEnum(autoPyTorchEnum): + x = "x" + y = "y" + + +@pytest.mark.parametrize('iter', + [[SubEnum.x], + ["x"], + {SubEnum.x: "hello"}, + {'x': 'hello'}]) +def test_autopytorch_enum(iter): + """ + This test ensures that a subclass of `autoPyTorchEnum` + can be used with strings. 
+
+    Args:
+        iter (Iterable):
+            iterable to check for compatibility
+    """
+
+    e = SubEnum.x
+
+    assert e in iter
\ No newline at end of file
From a0fef7700fd0be8696db1c8a477d4ddedf4f6754 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Fri, 10 Dec 2021 12:39:42 +0100
Subject: [PATCH 26/32] fix flake in tests

---
 test/test_utils/test_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_utils/test_common.py b/test/test_utils/test_common.py
index 45044680b..8a042bed6 100644
--- a/test/test_utils/test_common.py
+++ b/test/test_utils/test_common.py
@@ -27,5 +27,5 @@ def test_autopytorch_enum(iter):
     """
     e = SubEnum.x
-
-    assert e in iter
\ No newline at end of file
+
+    assert e in iter
From 8094ff11ebeeda8d823ae92df505477f6e7f53de Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Sun, 19 Dec 2021 15:46:50 +0100
Subject: [PATCH 27/32] address comments from shuhei

---
 autoPyTorch/api/base_task.py | 55 ++++++++++++++++++++++--------------
 autoPyTorch/utils/common.py  |  6 ++--
 2 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index ac1df9950..c656be8d6 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -314,8 +314,7 @@ def _get_dataset_input_validator(
             BaseInputValidator:
                 fitted input validator
         """
-        raise NotImplementedError("Function called on BaseTask, this can only be called by "
-                                  "specific task which is a child of the BaseTask")
+        raise NotImplementedError

     def get_dataset(
         self,
@@ -1495,8 +1494,7 @@ def fit_pipeline(
         # TAE expects each configuration to have a config_id.
        # For fitting a pipeline as it is not part of the
        # search process, it makes sense to set it to 0
-        if hasattr(configuration, 'config_id') or configuration.config_id is None:
-            configuration.__setattr__('config_id', 0)
+        configuration.__setattr__('config_id', 0)

         # get dataset properties
         dataset_requirements = get_dataset_requirements(
@@ -1582,28 +1580,43 @@ def fit_pipeline(
             instance=None)
         )

-        fitted_pipeline: Optional[BasePipeline] = None
+        fitted_pipeline = self._get_fitted_pipeline(
+            pipeline_idx=run_info.config.config_id + tae.initial_num_run,
+            run_info=run_info,
+            run_value=run_value,
+            disable_file_output=disable_file_output
+        )

-        if run_value.status == StatusType.SUCCESS:
-            if 'all' in disable_file_output or 'pipeline' in disable_file_output:
-                self._logger.warning("File output is disabled. No pipeline can be returned")
-            else:
-                if self.resampling_strategy in CrossValTypes:
-                    load_function = self._backend.load_cv_model_by_seed_and_id_and_budget
-                else:
-                    load_function = self._backend.load_model_by_seed_and_id_and_budget
-                fitted_pipeline = load_function(
-                    seed=self.seed,
-                    idx=run_info.config.config_id + tae.initial_num_run,
-                    budget=float(run_info.budget),
-                )
-        else:
-            warnings.warn(f"Fitting pipeline failed with status: {run_value.status}"
-                          f", additional_info: {run_value.additional_info}")
         self._clean_logger()

         return fitted_pipeline, run_info, run_value, dataset

+    def _get_fitted_pipeline(
+        self,
+        pipeline_idx: int,
+        run_info: RunInfo,
+        run_value: RunValue,
+        disable_file_output: List[Union[str, DisableFileOutputParameters]]
+    ) -> Optional[BasePipeline]:
+        if run_value.status != StatusType.SUCCESS:
+            warnings.warn(f"Fitting pipeline failed with status: {run_value.status}"
+                          f", additional_info: {run_value.additional_info}")
+            return None
+        elif any(c in disable_file_output for c in ['all', 'pipeline']):
+            self._logger.warning("File output is disabled. 
No pipeline can be returned")
+            return None
+
+        if self.resampling_strategy in CrossValTypes:
+            load_function = self._backend.load_cv_model_by_seed_and_id_and_budget
+        else:
+            load_function = self._backend.load_model_by_seed_and_id_and_budget
+
+        return load_function(
+            seed=self.seed,
+            idx=pipeline_idx,
+            budget=float(run_info.budget),
+        )
+
     def predict(
         self,
         X_test: np.ndarray,
diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py
index 8da9ad6c7..1488d5fcd 100644
--- a/autoPyTorch/utils/common.py
+++ b/autoPyTorch/utils/common.py
@@ -88,8 +88,10 @@ def __eq__(self, other: Any) -> bool:
         elif isinstance(other, str):
             return bool(self.value == other)
         else:
-            raise RuntimeError(f"Unsupported type {type(other)}."
-                               f"{self} only supports `str` and `{self}`")
+            enum_name = self.__class__.__name__
+            raise RuntimeError(f"Unsupported type {type(other)}. "
+                               f"{enum_name} only supports `str` and "
+                               f"`{enum_name}`")

     def __hash__(self) -> int:
         return hash(self.value)
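A minimal sketch of the `autoPyTorchEnum` comparison semantics that PATCH 27 refines. `SubEnum` mirrors the helper class defined in test_common.py, and the error text assumes the message introduced just above:

    from autoPyTorch.utils.common import autoPyTorchEnum

    class SubEnum(autoPyTorchEnum):
        x = "x"

    assert SubEnum.x == "x"              # members compare equal to their string value
    assert SubEnum.x in ["x", "y"]       # so membership tests work with plain strings
    assert SubEnum.x in {"x": "hello"}   # hashing also falls back to the value

    SubEnum.x == 1  # raises RuntimeError: Unsupported type <class 'int'>. SubEnum only supports `str` and `SubEnum`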
From 4d90706d295d736e8aa52341dc887df58b648140 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com>
Date: Sun, 19 Dec 2021 15:52:56 +0100
Subject: [PATCH 28/32] Apply suggestions from code review

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
---
 autoPyTorch/api/base_task.py                 | 17 +++-----
 autoPyTorch/evaluation/abstract_evaluator.py |  4 +-
 test/test_utils/test_common.py               | 44 +++++++++++++++++++-
 3 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index c656be8d6..5f9fb0f05 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -1511,12 +1511,9 @@ def fit_pipeline(
         assert dataset.dataset_name is not None
         self._logger = self._get_logger(dataset.dataset_name)

-        if include_components is None:
-            include_components = self.include_components
-        if exclude_components is None:
-            exclude_components = self.exclude_components
-        if search_space_updates is None:
-            search_space_updates = self.search_space_updates
+        include_components = self.include_components if include_components is None else include_components
+        exclude_components = self.exclude_components if exclude_components is None else exclude_components
+        search_space_updates = self.search_space_updates if search_space_updates is None else search_space_updates

         scenario_mock = unittest.mock.Mock()
         scenario_mock.wallclock_limit = run_time_limit_secs
         # already be generated here!
         stats = Stats(scenario_mock)

-        if memory_limit is None:
-            if hasattr(self, '_memory_limit') and self._memory_limit is not None:
-                memory_limit = self._memory_limit
+        if memory_limit is None and getattr(self, '_memory_limit', None) is not None:
+            memory_limit = self._memory_limit

         metric = get_metrics(dataset_properties=dataset_properties,
                              names=[eval_metric] if eval_metric is not None else None,
@@ -1545,8 +1541,7 @@ def fit_pipeline(
         budget = budget if budget is not None else pipeline_options[budget_type]

         if disable_file_output is None:
-            disable_file_output = self._disable_file_output if hasattr(self, '_disable_file_output') \
-                and self._disable_file_output is not None else []
+            disable_file_output = getattr(self, '_disable_file_output', [])

         stats.start_timing()
diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
index b21183dda..f98c69dd4 100644
--- a/autoPyTorch/evaluation/abstract_evaluator.py
+++ b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -857,7 +857,7 @@ def file_output(
         if self.output_y_hat_optimization:
             self.backend.save_targets_ensemble(self.Y_optimization)

-        if hasattr(self, 'pipelines') and self.pipelines is not None:
+        if getattr(self, 'pipelines', None) is not None:
             if self.pipelines[0] is not None and len(self.pipelines) > 0:
                 if 'pipelines' not in self.disable_file_output:
                     if self.task_type in CLASSIFICATION_TASKS:
@@ -872,7 +872,7 @@ def file_output(
             else:
                 pipelines = None

-        if hasattr(self, 'pipeline') and self.pipeline is not None:
+        if getattr(self, 'pipeline', None) is not None:
             if 'pipeline' not in self.disable_file_output:
                 pipeline = self.pipeline
             else:
diff --git a/test/test_utils/test_common.py b/test/test_utils/test_common.py
index 8a042bed6..023c7aea0 100644
--- a/test/test_utils/test_common.py
+++ b/test/test_utils/test_common.py
@@ -12,10 +12,12 @@

 @pytest.mark.parametrize('iter',
-                         [[SubEnum.x],
+                         ([SubEnum.x],
                           ["x"],
                           {SubEnum.x: "hello"},
-                          {'x': 'hello'}])
+                          {'x': 'hello'},
+                          SubEnum,
+                          ["x", "y"]))
 def test_autopytorch_enum(iter):
     """
     This test ensures that a subclass of `autoPyTorchEnum`
     can be used with strings.
@@ -29,3 +31,41 @@ def test_autopytorch_enum(iter):
     e = SubEnum.x

     assert e in iter
+
+class DummyEnum(Enum):  # You need to move it on top
+    x = "x"
+
+
+@pytest.mark.parametrize('iter',
+                         [[SubEnum.y],
+                          ["y"],
+                          {SubEnum.y: "hello"},
+                          {'y': 'hello'}])
+def test_autopytorch_enum_false(iter):
+    """
+    This test ensures that a subclass of `autoPyTorchEnum`
+    can be used with strings.
+    Args:
+        iter (Iterable):
+            iterable to check for compatibility
+    """
+
+    e = SubEnum.x
+
+    assert e not in iter
+
+
+@pytest.mark.parametrize('others', (1, 2.0, SubEnum, DummyEnum.x))
+def test_raise_errors_autopytorch_enum(others):
+    """
+    This test ensures that a subclass of `autoPyTorchEnum`
+    raises error properly.
+    Args:
+        others (Any):
+            Variable to compare with SubEnum. 
+ """ + + with pytest.raises(RuntimeError): + SubEnum.x == others + + From c7cc712ea41292f3659920be0baa569a4544c38f Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Sun, 19 Dec 2021 16:03:46 +0100 Subject: [PATCH 29/32] fix flake --- autoPyTorch/api/base_task.py | 15 +++++++++++---- autoPyTorch/evaluation/abstract_evaluator.py | 2 +- test/test_utils/test_common.py | 11 ++++++----- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 5f9fb0f05..f2ecf3441 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1491,6 +1491,10 @@ def fit_pipeline( dataset_name=dataset_name ) + # dataset_name is created inside the constructor of BaseDataset + # we expect it to be not None. This is for mypy + assert dataset.dataset_name is not None + # TAE expects each configuration to have a config_id. # For fitting a pipeline as it is not part of the # search process, it makes sense to set it to 0 @@ -1506,9 +1510,6 @@ def fit_pipeline( self._backend.save_datamanager(dataset) if self._logger is None: - # dataset_name is created inside the constructor of BaseDataset - # we expect it to be not None. This is for mypy - assert dataset.dataset_name is not None self._logger = self._get_logger(dataset.dataset_name) include_components = self.include_components if include_components is None else include_components @@ -1576,6 +1577,7 @@ def fit_pipeline( ) fitted_pipeline = self._get_fitted_pipeline( + dataset_name=dataset.dataset_name, pipeline_idx=run_info.config.config_id + tae.initial_num_run, run_info=run_info, run_value=run_value, @@ -1588,11 +1590,16 @@ def fit_pipeline( def _get_fitted_pipeline( self, + dataset_name: str, pipeline_idx: int, run_info: RunInfo, run_value: RunValue, disable_file_output: List[Union[str, DisableFileOutputParameters]] ) -> Optional[BasePipeline]: + + if self._logger is None: + self._logger = self._get_logger(str(dataset_name)) + if run_value.status != StatusType.SUCCESS: warnings.warn(f"Fitting pipeline failed with status: {run_value.status}" f", additional_info: {run_value.additional_info}") @@ -1606,7 +1613,7 @@ def _get_fitted_pipeline( else: load_function = self._backend.load_model_by_seed_and_id_and_budget - return load_function( + return load_function( # type: ignore[no-any-return] seed=self.seed, idx=pipeline_idx, budget=float(run_info.budget), diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index f98c69dd4..2f792b7a8 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -858,7 +858,7 @@ def file_output( self.backend.save_targets_ensemble(self.Y_optimization) if getattr(self, 'pipelines', None) is not None: - if self.pipelines[0] is not None and len(self.pipelines) > 0: + if self.pipelines[0] is not None and len(self.pipelines) > 0: # type: ignore[index, arg-type] if 'pipelines' not in self.disable_file_output: if self.task_type in CLASSIFICATION_TASKS: pipelines = VotingClassifier(estimators=None, voting='soft', ) diff --git a/test/test_utils/test_common.py b/test/test_utils/test_common.py index 023c7aea0..ea3dec563 100644 --- a/test/test_utils/test_common.py +++ b/test/test_utils/test_common.py @@ -1,6 +1,8 @@ """ This tests the functionality in autoPyTorch/utils/common. 
""" +from enum import Enum + import pytest from autoPyTorch.utils.common import autoPyTorchEnum @@ -11,6 +13,10 @@ class SubEnum(autoPyTorchEnum): y = "y" +class DummyEnum(Enum): # You need to move it on top + x = "x" + + @pytest.mark.parametrize('iter', ([SubEnum.x], ["x"], @@ -32,9 +38,6 @@ def test_autopytorch_enum(iter): assert e in iter -class DummyEnum(Enum): # You need to move it on top - x = "x" - @pytest.mark.parametrize('iter', [[SubEnum.y], @@ -67,5 +70,3 @@ def test_raise_errors_autopytorch_enum(others): with pytest.raises(RuntimeError): SubEnum.x == others - - From 14113f97f4145157933adf14959b333cd42a6e86 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 20 Dec 2021 11:50:45 +0100 Subject: [PATCH 30/32] use **dataset_kwargs --- autoPyTorch/api/base_task.py | 54 +++++++++++++----------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index f2ecf3441..2400a55fd 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1353,14 +1353,6 @@ def refit( def fit_pipeline( self, configuration: Configuration, - dataset: Optional[BaseDataset] = None, - X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, - dataset_name: Optional[str] = None, - resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes]] = None, - resampling_strategy_args: Optional[Dict[str, Any]] = None, run_time_limit_secs: int = 60, memory_limit: Optional[int] = None, eval_metric: Optional[str] = None, @@ -1372,6 +1364,7 @@ def fit_pipeline( budget: Optional[float] = None, pipeline_options: Optional[Dict] = None, disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + **dataset_kwargs: Any ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue, BaseDataset]: """ Fit a pipeline on the given task for the budget. @@ -1383,19 +1376,6 @@ def fit_pipeline( methods. Args: - X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] - A pair of features (X_train) and targets (y_train) used to fit a - pipeline. Additionally, a holdout of this pairs (X_test, y_test) can - be provided to track the generalization performance of each stage. - dataset_name (Optional[str]): - Name of the dataset, if None, random value is used. - resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): - Strategy to split the training data. if None, uses - HoldoutValTypes.holdout_validation. - resampling_strategy_args (Optional[Dict[str, Any]]): - Arguments required for the chosen resampling strategy. If None, uses - the default values provided in DEFAULT_RESAMPLING_PARAMETERS - in ```datasets/resampling_strategy.py```. run_time_limit_secs (int: default=60): Time limit for a single call to the machine learning model. Model fitting will be terminated if the machine learning algorithm @@ -1465,8 +1445,15 @@ def fit_pipeline( + `all`: do not save any of the above. For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. - configuration: (Configuration) + configuration (Configuration): configuration to fit the pipeline with. 
+            **dataset_kwargs (Any):
+                Can contain either `dataset (BaseDataset)` object or
+                keyword arguments specifying the dataset like X_train, y_train,
+                X_test, y_test (Optional[Union[List, pd.DataFrame, np.ndarray]] = None)
+                and other parameters like dataset_name (str),
+                resampling_strategy (Union[HoldoutValTypes, CrossValTypes]), 
+                resampling_strategy_args (Dict[str, Any]).

         Returns:
             (BasePipeline):
                 fitted pipeline
             (smac.runhistory.runhistory.RunInfo):
                 Run information
             (smac.runhistory.runhistory.RunValue):
                 Result of fitting the pipeline
             (BaseDataset):
                 Dataset created from the given tensors
-        """
+        """ 

-        if dataset is None:
-            assert X_train is not None and \
-                y_train is not None, "No dataset provided, must provide X_train, y_train tensors"
-            dataset = self.get_dataset(X_train=X_train,
-                                       y_train=y_train,
-                                       X_test=X_test,
-                                       y_test=y_test,
-                                       resampling_strategy=resampling_strategy,
-                                       resampling_strategy_args=resampling_strategy_args,
-                                       dataset_name=dataset_name
-                                       )
+        if 'dataset' not in dataset_kwargs:
+            if (
+                dataset_kwargs.get('X_train', None) is None
+                or dataset_kwargs.get('y_train', None) is None
+            ):
+                raise ValueError("No dataset provided, must provide X_train, y_train tensors")
+
+            dataset = self.get_dataset(**dataset_kwargs)
+        else:
+            dataset = dataset_kwargs['dataset']

         # dataset_name is created inside the constructor of BaseDataset
         # we expect it to be not None. This is for mypy
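A sketch of a call against the `**dataset_kwargs` interface introduced in PATCH 30, assuming an estimator, configuration, and data splits prepared as in examples/40_advanced/example_single_configuration.py (note that PATCH 32 below replaces this calling convention with explicit keyword-only arguments):

    # Either hand over a prepared dataset ...
    pipeline, run_info, run_value, dataset = estimator.fit_pipeline(
        configuration, dataset=dataset)

    # ... or let fit_pipeline build one from raw tensors:
    pipeline, run_info, run_value, dataset = estimator.fit_pipeline(
        configuration, X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test, dataset_name='kr-vs-kp')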
Returns: @@ -1464,7 +1464,7 @@ def fit_pipeline( Result of fitting the pipeline (BaseDataset): Dataset created from the given tensors - """ + """ if 'dataset' not in dataset_kwargs: if ( From 24aac05da7b522d9e1214b4dbff8dc4e99871b66 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 20 Dec 2021 16:27:38 +0100 Subject: [PATCH 32/32] change to enforce keyword args --- autoPyTorch/api/base_task.py | 57 +++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index d2f39822f..531125bff 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1353,6 +1353,15 @@ def refit( def fit_pipeline( self, configuration: Configuration, + *, + dataset: Optional[BaseDataset] = None, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, run_time_limit_secs: int = 60, memory_limit: Optional[int] = None, eval_metric: Optional[str] = None, @@ -1364,7 +1373,6 @@ def fit_pipeline( budget: Optional[float] = None, pipeline_options: Optional[Dict] = None, disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - **dataset_kwargs: Any ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue, BaseDataset]: """ Fit a pipeline on the given task for the budget. @@ -1376,6 +1384,26 @@ def fit_pipeline( methods. Args: + configuration (Configuration): + configuration to fit the pipeline with. + dataset (BaseDataset): + An object of the appropriate child class of `BaseDataset`, + that will be used to fit the pipeline + X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] + A pair of features (X_train) and targets (y_train) used to fit a + pipeline. Additionally, a holdout of this pairs (X_test, y_test) can + be provided to track the generalization performance of each stage. + dataset_name (Optional[str]): + Name of the dataset, if None, random value is used. + resampling_strategy (Optional[Union[CrossValTypes, HoldoutValTypes]]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + Arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. run_time_limit_secs (int: default=60): Time limit for a single call to the machine learning model. Model fitting will be terminated if the machine learning algorithm @@ -1445,15 +1473,6 @@ def fit_pipeline( + `all`: do not save any of the above. For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. - configuration (Configuration): - configuration to fit the pipeline with. 
- **dataset_kwargs (Any): - Can contain either `dataset (BaseDataset)` object or - keyword arguments specifying the dataset like X_train, y_train, - X_test, y_test (Optional[Union[List, pd.DataFrame, np.ndarray]] = None) - and other parameters like dataset_name (str), - resampling_strategy (Union[HoldoutValTypes, CrossValTypes]), - resampling_strategy_args (Dict[str, Any]). Returns: (BasePipeline): @@ -1466,16 +1485,20 @@ def fit_pipeline( Dataset created from the given tensors """ - if 'dataset' not in dataset_kwargs: + if dataset is None: if ( - dataset_kwargs.get('X_train', None) is not None - and dataset_kwargs.get('y_train', None) is not None + X_train is not None + and y_train is not None ): raise ValueError("No dataset provided, must provide X_train, y_train tensors") - - dataset = self.get_dataset(**dataset_kwargs) - else: - dataset = dataset_kwargs['dataset'] + dataset = self.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) # dataset_name is created inside the constructor of BaseDataset # we expect it to be not None. This is for mypy