diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 28d01027d..0d737d7c0 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -14,7 +14,6 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( HoldoutValTypes, - CrossValTypes, ResamplingStrategies, ) from autoPyTorch.datasets.tabular_dataset import TabularDataset @@ -384,13 +383,6 @@ def search( dataset_name=dataset_name ) - if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)): - raise ValueError( - 'Hyperparameter optimization requires a validation split. ' - 'Expected `self.resampling_strategy` to be either ' - '(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy) - ) - return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 9cc74227d..c18a90c42 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -14,7 +14,6 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( HoldoutValTypes, - CrossValTypes, ResamplingStrategies, ) from autoPyTorch.datasets.tabular_dataset import TabularDataset @@ -384,13 +383,6 @@ def search( dataset_name=dataset_name ) - if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)): - raise ValueError( - 'Hyperparameter optimization requires a validation split. ' - 'Expected `self.resampling_strategy` to be either ' - '(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy) - ) - return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 6895e8478..3f853a653 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -14,14 +14,13 @@ from sklearn.exceptions import NotFittedError from sklearn.impute import SimpleImputer from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.preprocessing import OrdinalEncoder from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES def _create_column_transformer( preprocessors: Dict[str, List[BaseEstimator]], - numerical_columns: List[str], categorical_columns: List[str], ) -> ColumnTransformer: """ @@ -32,8 +31,6 @@ def _create_column_transformer( Args: preprocessors (Dict[str, List[BaseEstimator]]): Dictionary containing list of numerical and categorical preprocessors. - numerical_columns (List[str]): - List of names of numerical columns categorical_columns (List[str]): List of names of categorical columns @@ -41,17 +38,11 @@ def _create_column_transformer( ColumnTransformer """ - numerical_pipeline = 'drop' - categorical_pipeline = 'drop' - if len(numerical_columns) > 0: - numerical_pipeline = make_pipeline(*preprocessors['numerical']) - if len(categorical_columns) > 0: - categorical_pipeline = make_pipeline(*preprocessors['categorical']) + categorical_pipeline = make_pipeline(*preprocessors['categorical']) return ColumnTransformer([ - ('categorical_pipeline', categorical_pipeline, categorical_columns), - ('numerical_pipeline', numerical_pipeline, numerical_columns)], - remainder='drop' + ('categorical_pipeline', categorical_pipeline, categorical_columns)], + remainder='passthrough' ) @@ -59,22 +50,17 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: """ This function creates a Dictionary containing a list of numerical and categorical preprocessors - Returns: Dict[str, List[BaseEstimator]] """ preprocessors: Dict[str, List[BaseEstimator]] = dict() # Categorical Preprocessors - onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore') + ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', + unknown_value=-1) categorical_imputer = SimpleImputer(strategy='constant', copy=False) - # Numerical Preprocessors - numerical_imputer = SimpleImputer(strategy='median', copy=False) - standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False) - - preprocessors['categorical'] = [categorical_imputer, onehot_encoder] - preprocessors['numerical'] = [numerical_imputer, standard_scaler] + preprocessors['categorical'] = [categorical_imputer, ordinal_encoder] return preprocessors @@ -161,31 +147,47 @@ def _fit( X = cast(pd.DataFrame, X) - self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()]) + all_nan_columns = X.columns[X.isna().all()] + for col in all_nan_columns: + X[col] = pd.to_numeric(X[col]) + + # Handle objects if possible + exist_object_columns = has_object_columns(X.dtypes.values) + if exist_object_columns: + X = self.infer_objects(X) - categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) + self.dtypes = [dt.name for dt in X.dtypes] # Also note this change in self.dtypes + self.all_nan_columns = set(all_nan_columns) - self.enc_columns = categorical_columns + self.enc_columns, self.feat_type = self._get_columns_info(X) - preprocessors = get_tabular_preprocessors() - self.column_transformer = _create_column_transformer( - preprocessors=preprocessors, - numerical_columns=numerical_columns, - categorical_columns=categorical_columns, - ) + if len(self.enc_columns) > 0: - # Mypy redefinition - assert self.column_transformer is not None - self.column_transformer.fit(X) + preprocessors = get_tabular_preprocessors() + self.column_transformer = _create_column_transformer( + preprocessors=preprocessors, + categorical_columns=self.enc_columns, + ) - # The column transformer reorders the feature types - # therefore, we need to change the order of columns as well - # This means categorical columns are shifted to the left + # Mypy redefinition + assert self.column_transformer is not None + self.column_transformer.fit(X) - self.feat_type = sorted( - feat_type, - key=functools.cmp_to_key(self._comparator) - ) + # The column transformer moves categorical columns before all numerical columns + # therefore, we need to sort categorical columns so that it complies this change + + self.feat_type = sorted( + self.feat_type, + key=functools.cmp_to_key(self._comparator) + ) + + encoded_categories = self.column_transformer.\ + named_transformers_['categorical_pipeline'].\ + named_steps['ordinalencoder'].categories_ + self.categories = [ + list(range(len(cat))) + for cat in encoded_categories + ] # differently to categorical_columns and numerical_columns, # this saves the index of the column. @@ -265,6 +267,23 @@ def transform( if hasattr(X, "iloc") and not scipy.sparse.issparse(X): X = cast(Type[pd.DataFrame], X) + if self.all_nan_columns is None: + raise ValueError('_fit must be called before calling transform') + + for col in list(self.all_nan_columns): + X[col] = np.nan + X[col] = pd.to_numeric(X[col]) + + if len(self.categorical_columns) > 0: + # when some categorical columns are not all nan in the training set + # but they are all nan in the testing or validation set + # we change those columns to `object` dtype + # to ensure that these columns are changed to appropriate dtype + # in self.infer_objects + all_nan_cat_cols = set(X[self.enc_columns].columns[X[self.enc_columns].isna().all()]) + dtype_dict = {col: 'object' for col in self.enc_columns if col in all_nan_cat_cols} + X = X.astype(dtype_dict) + # Check the data here so we catch problems on new test data self._check_data(X) @@ -273,11 +292,6 @@ def transform( # We need to convert the column in test data to # object otherwise the test column is interpreted as float if self.column_transformer is not None: - if len(self.categorical_columns) > 0: - categorical_columns = self.column_transformer.transformers_[0][-1] - for column in categorical_columns: - if X[column].isna().all(): - X[column] = X[column].astype('object') X = self.column_transformer.transform(X) # Sparse related transformations @@ -361,7 +375,6 @@ def _check_data( self.column_order = column_order dtypes = [dtype.name for dtype in X.dtypes] - diff_cols = X.columns[[s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]] if len(self.dtypes) == 0: self.dtypes = dtypes @@ -373,7 +386,7 @@ def _check_data( def _get_columns_info( self, X: pd.DataFrame, - ) -> Tuple[List[str], List[str], List[str]]: + ) -> Tuple[List[str], List[str]]: """ Return the columns to be encoded from a pandas dataframe @@ -392,15 +405,12 @@ def _get_columns_info( """ # Register if a column needs encoding - numerical_columns = [] categorical_columns = [] # Also, register the feature types for the estimator feat_type = [] # Make sure each column is a valid type for i, column in enumerate(X.columns): - if self.all_nan_columns is not None and column in self.all_nan_columns: - continue column_dtype = self.dtypes[i] err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \ "but input column {} has an invalid type `{}`.".format(column, column_dtype) @@ -411,7 +421,6 @@ def _get_columns_info( # TypeError: data type not understood in certain pandas types elif is_numeric_dtype(column_dtype): feat_type.append('numerical') - numerical_columns.append(column) elif column_dtype == 'object': # TODO verify how would this happen when we always convert the object dtypes to category raise TypeError( @@ -437,7 +446,7 @@ def _get_columns_info( "before feeding it to AutoPyTorch.".format(err_msg) ) - return categorical_columns, numerical_columns, feat_type + return categorical_columns, feat_type def list_to_pandas( self, @@ -507,22 +516,26 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: pd.DataFrame """ if hasattr(self, 'object_dtype_mapping'): - # Mypy does not process the has attr. This dict is defined below - for key, dtype in self.object_dtype_mapping.items(): # type: ignore[has-type] - # honor the training data types - try: - X[key] = X[key].astype(dtype.name) - except Exception as e: - # Try inference if possible - self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}') - pass + # honor the training data types + try: + # Mypy does not process the has attr. + X = X.astype(self.object_dtype_mapping) # type: ignore[has-type] + except Exception as e: + # Try inference if possible + self.logger.warning(f'Casting the columns to training dtypes ' # type: ignore[has-type] + f'{self.object_dtype_mapping} caused the exception {e}') + pass else: - # Calling for the first time to infer the categories - X = X.infer_objects() - for column, data_type in zip(X.columns, X.dtypes): - if not is_numeric_dtype(data_type): - X[column] = X[column].astype('category') - + if len(self.dtypes) != 0: + # when train data has no object dtype, but test does + # we prioritise the datatype given in training data + dtype_dict = {col: dtype for col, dtype in zip(X.columns, self.dtypes)} + X = X.astype(dtype_dict) + else: + # Calling for the first time to infer the categories + X = X.infer_objects() + dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes) if not is_numeric_dtype(dtype)} + X = X.astype(dtype_dict) # only numerical attributes and categories self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)} diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 803ee7cd8..ddf7c8ddf 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -125,7 +125,6 @@ def __init__( self.holdout_validators: Dict[str, HoldOutFunc] = {} self.no_resampling_validators: Dict[str, NoResamplingFunc] = {} self.random_state = np.random.RandomState(seed=seed) - self.no_resampling_validators: Dict[str, NoResamplingFunc] = {} self.shuffle = shuffle self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args @@ -143,10 +142,6 @@ def __init__( else: self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 - # TODO: Look for a criteria to define small enough to preprocess - # False for the regularization cocktails initially - self.is_small_preprocess = False - # Make sure cross validation splits are created once self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes) self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes) diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index c246b4427..78447a04e 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -39,13 +39,6 @@ def __call__(self, random_state: np.random.RandomState, val_share: float, ... -class NoResamplingFunc(Protocol): - def __call__(self, - random_state: np.random.RandomState, - indices: np.ndarray) -> np.ndarray: - ... - - class CrossValTypes(IntEnum): """The type of cross validation diff --git a/autoPyTorch/evaluation/fit_evaluator.py b/autoPyTorch/evaluation/fit_evaluator.py deleted file mode 100644 index 52c47b4fa..000000000 --- a/autoPyTorch/evaluation/fit_evaluator.py +++ /dev/null @@ -1,378 +0,0 @@ -import time -from multiprocessing.queues import Queue -from typing import Any, Dict, List, Optional, Tuple, Union - -from ConfigSpace.configuration_space import Configuration - -import numpy as np - -from sklearn.base import BaseEstimator - -from smac.tae import StatusType - -from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes -from autoPyTorch.evaluation.abstract_evaluator import ( - AbstractEvaluator, - fit_and_suppress_warnings -) -from autoPyTorch.evaluation.utils import DisableFileOutputParameters -from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric -from autoPyTorch.utils.common import subsampler -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - - -class FitEvaluator(AbstractEvaluator): - def __init__(self, backend: Backend, queue: Queue, - metric: autoPyTorchMetric, - budget: float, - budget_type: str = None, - pipeline_config: Optional[Dict[str, Any]] = None, - configuration: Optional[Configuration] = None, - seed: int = 1, - output_y_hat_optimization: bool = False, - num_run: Optional[int] = None, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - init_params: Optional[Dict[str, Any]] = None, - logger_port: Optional[int] = None, - keep_models: Optional[bool] = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None: - super().__init__( - backend=backend, - queue=queue, - configuration=configuration, - metric=metric, - seed=seed, - output_y_hat_optimization=output_y_hat_optimization, - num_run=num_run, - include=include, - exclude=exclude, - disable_file_output=disable_file_output, - init_params=init_params, - budget=budget, - budget_type=budget_type, - logger_port=logger_port, - all_supported_metrics=all_supported_metrics, - pipeline_config=pipeline_config, - search_space_updates=search_space_updates - ) - if not isinstance(self.datamanager.resampling_strategy, NoResamplingStrategyTypes): - raise ValueError( - "FitEvaluator needs to be fitted on the whole dataset and resampling_strategy " - "must be `NoResamplingStrategyTypes`, but got {}".format( - self.datamanager.resampling_strategy - )) - - self.splits = self.datamanager.splits - self.Y_target: Optional[np.ndarray] = None - self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN - self.pipeline: Optional[BaseEstimator] = None - - self.logger.debug("Search space updates :{}".format(self.search_space_updates)) - self.keep_models = keep_models - - def fit_predict_and_loss(self) -> None: - """Fit, predict and compute the loss for no resampling strategy""" - assert self.splits is not None, "Can't fit pipeline in {} is datamanager.splits is None" \ - .format(self.__class__.__name__) - additional_run_info: Optional[Dict] = None - split_id = 0 - self.logger.info("Starting fit {}".format(split_id)) - - pipeline = self._get_pipeline() - - train_split, test_split = self.splits[split_id] - assert test_split is None - self.Y_actual_train = self.y_train[train_split] - y_train_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id, - train_indices=train_split, - test_indices=test_split, - add_pipeline_to_self=True) - train_loss = self._loss(self.y_train[train_split], y_train_pred) - if y_valid_pred is not None: - loss = self._loss(self.y_valid, y_valid_pred) - elif y_test_pred is not None: - loss = self._loss(self.y_test, y_test_pred) - else: - loss = train_loss - - additional_run_info = pipeline.get_additional_run_info() if hasattr( - pipeline, 'get_additional_run_info') else {} - - status = StatusType.SUCCESS - - self.logger.debug("In train evaluator fit_predict_and_loss, num_run: {} loss:{}".format( - self.num_run, - loss - )) - self.finish_up( - loss=loss, - train_loss=train_loss, - valid_pred=y_valid_pred, - test_pred=y_test_pred, - additional_run_info=additional_run_info, - file_output=True, - status=status, - opt_pred=None - ) - - def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Union[np.ndarray, List], - test_indices: None, - add_pipeline_to_self: bool - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: - - X = {'train_indices': train_indices, - 'val_indices': test_indices, - 'split_id': fold, - 'num_run': self.num_run, - **self.fit_dictionary} # fit dictionary - y = None - fit_and_suppress_warnings(self.logger, pipeline, X, y) - self.logger.info("Model fitted, now predicting") - ( - Y_train_pred, - Y_valid_pred, - Y_test_pred - ) = self._predict( - pipeline, - train_indices=train_indices, - ) - - if add_pipeline_to_self: - self.pipeline = pipeline - - return Y_train_pred, Y_valid_pred, Y_test_pred - - def _predict(self, pipeline: BaseEstimator, - train_indices: Union[np.ndarray, List] - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]: - - train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline, - self.y_train[train_indices]) - - if self.X_valid is not None: - valid_pred = self.predict_function(self.X_valid, pipeline, - self.y_valid) - else: - valid_pred = None - - if self.X_test is not None: - test_pred = self.predict_function(self.X_test, pipeline, - self.y_train[train_indices]) - else: - test_pred = None - - return train_pred, valid_pred, test_pred - - def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float], - valid_pred: Optional[np.ndarray], - test_pred: Optional[np.ndarray], additional_run_info: Optional[Dict], - file_output: bool, status: StatusType, - opt_pred: Optional[np.ndarray] - ) -> Optional[Tuple[float, float, int, Dict]]: - """This function does everything necessary after the fitting is done: - - * predicting - * saving the necessary files - We use it as the signal handler so we can recycle the code for the - normal usecase and when the runsolver kills us here :)""" - - self.duration = time.time() - self.starttime - - if file_output: - loss_, additional_run_info_ = self.file_output( - None, valid_pred, test_pred, - ) - else: - loss_ = None - additional_run_info_ = {} - - validation_loss, test_loss = self.calculate_auxiliary_losses( - valid_pred, test_pred - ) - - if loss_ is not None: - return self.duration, loss_, self.seed, additional_run_info_ - - cost = loss[self.metric.name] - - additional_run_info = ( - {} if additional_run_info is None else additional_run_info - ) - for metric_name, value in loss.items(): - additional_run_info[metric_name] = value - additional_run_info['duration'] = self.duration - additional_run_info['num_run'] = self.num_run - if train_loss is not None: - additional_run_info['train_loss'] = train_loss - if validation_loss is not None: - additional_run_info['validation_loss'] = validation_loss - if test_loss is not None: - additional_run_info['test_loss'] = test_loss - - rval_dict = {'loss': cost, - 'additional_run_info': additional_run_info, - 'status': status} - - self.queue.put(rval_dict) - return None - - def file_output( - self, - Y_optimization_pred: np.ndarray, - Y_valid_pred: np.ndarray, - Y_test_pred: np.ndarray, - ) -> Tuple[Optional[float], Dict]: - - # Abort if predictions contain NaNs - for y, s in [ - [Y_valid_pred, 'validation'], - [Y_test_pred, 'test'] - ]: - if y is not None and not np.all(np.isfinite(y)): - return ( - 1.0, - { - 'error': - 'Model predictions for %s set contains NaNs.' % s - }, - ) - - # Abort if we don't want to output anything. - if 'all' in self.disable_file_output: - return None, {} - - if getattr(self, 'pipeline', None) is not None: - if 'pipeline' not in self.disable_file_output: - pipeline = self.pipeline - else: - pipeline = None - else: - pipeline = None - - self.logger.debug("Saving model {}_{}_{} to disk".format(self.seed, self.num_run, self.budget)) - self.backend.save_numrun_to_dir( - seed=int(self.seed), - idx=int(self.num_run), - budget=float(self.budget), - model=pipeline, - cv_model=None, - ensemble_predictions=None, - valid_predictions=( - Y_valid_pred if 'y_valid' not in - self.disable_file_output else None - ), - test_predictions=( - Y_test_pred if 'y_test' not in - self.disable_file_output else None - ), - ) - - return None, {} - - -# create closure for evaluating an algorithm -def eval_function( - backend: Backend, - queue: Queue, - metric: autoPyTorchMetric, - budget: float, - config: Optional[Configuration], - seed: int, - num_run: int, - include: Optional[Dict[str, Any]], - exclude: Optional[Dict[str, Any]], - output_y_hat_optimization: bool = False, - disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, - pipeline_config: Optional[Dict[str, Any]] = None, - budget_type: str = None, - init_params: Optional[Dict[str, Any]] = None, - logger_port: Optional[int] = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - instance: str = None, -) -> None: - """ - This closure allows the communication between the ExecuteTaFuncWithQueue and the - pipeline trainer (TrainEvaluator). - - Fundamentally, smac calls the ExecuteTaFuncWithQueue.run() method, which internally - builds a TrainEvaluator. The TrainEvaluator builds a pipeline, stores the output files - to disc via the backend, and puts the performance result of the run in the queue. - - - Attributes: - backend (Backend): - An object to interface with the disk storage. In particular, allows to - access the train and test datasets - queue (Queue): - Each worker available will instantiate an evaluator, and after completion, - it will return the evaluation result via a multiprocessing queue - metric (autoPyTorchMetric): - A scorer object that is able to evaluate how good a pipeline was fit. It - is a wrapper on top of the actual score method (a wrapper on top of scikit - lean accuracy for example) that formats the predictions accordingly. - budget: (float): - The amount of epochs/time a configuration is allowed to run. - budget_type (str): - The budget type, which can be epochs or time - pipeline_config (Optional[Dict[str, Any]]): - Defines the content of the pipeline being evaluated. For example, it - contains pipeline specific settings like logging name, or whether or not - to use tensorboard. - config (Union[int, str, Configuration]): - Determines the pipeline to be constructed. - seed (int): - A integer that allows for reproducibility of results - output_y_hat_optimization (bool): - Whether this worker should output the target predictions, so that they are - stored on disk. Fundamentally, the resampling strategy might shuffle the - Y_train targets, so we store the split in order to re-use them for ensemble - selection. - num_run (Optional[int]): - An identifier of the current configuration being fit. This number is unique per - configuration. - include (Optional[Dict[str, Any]]): - An optional dictionary to include components of the pipeline steps. - exclude (Optional[Dict[str, Any]]): - An optional dictionary to exclude components of the pipeline steps. - disable_file_output (Union[bool, List[str]]): - By default, the model, it's predictions and other metadata is stored on disk - for each finished configuration. This argument allows the user to skip - saving certain file type, for example the model, from being written to disk. - init_params (Optional[Dict[str, Any]]): - Optional argument that is passed to each pipeline step. It is the equivalent of - kwargs for the pipeline steps. - logger_port (Optional[int]): - Logging is performed using a socket-server scheme to be robust against many - parallel entities that want to write to the same file. This integer states the - socket port for the communication channel. If None is provided, a traditional - logger is used. - instance (str): - An instance on which to evaluate the current pipeline. By default we work - with a single instance, being the provided X_train, y_train of a single dataset. - This instance is a compatibility argument for SMAC, that is capable of working - with multiple datasets at the same time. - """ - evaluator = FitEvaluator( - backend=backend, - queue=queue, - metric=metric, - configuration=config, - seed=seed, - num_run=num_run, - include=include, - exclude=exclude, - disable_file_output=disable_file_output, - init_params=init_params, - budget=budget, - budget_type=budget_type, - logger_port=logger_port, - all_supported_metrics=all_supported_metrics, - pipeline_config=pipeline_config, - search_space_updates=search_space_updates - ) - evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 5c044b9eb..95614c22c 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -152,13 +152,6 @@ def __init__(self, backend: Backend, queue: Queue, search_space_updates=search_space_updates ) - if not isinstance(self.datamanager.resampling_strategy, (CrossValTypes, HoldoutValTypes)): - raise ValueError( - 'TrainEvaluator expect to have (CrossValTypes, HoldoutValTypes) as ' - 'resampling_strategy, but got {}'.format(self.datamanager.resampling_strategy) - ) - - if not isinstance(self.datamanager.resampling_strategy, (CrossValTypes, HoldoutValTypes)): resampling_strategy = self.datamanager.resampling_strategy raise ValueError( @@ -428,10 +421,10 @@ def eval_train_function( budget: float, config: Optional[Configuration], seed: int, + output_y_hat_optimization: bool, num_run: int, include: Optional[Dict[str, Any]], exclude: Optional[Dict[str, Any]], - output_y_hat_optimization: bool, disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, pipeline_config: Optional[Dict[str, Any]] = None, budget_type: str = None, diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index e8f95ab57..05bede68a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -3,14 +3,14 @@ import numpy as np from sklearn.compose import ColumnTransformer -# from sklearn.pipeline import make_pipeline +from sklearn.pipeline import make_pipeline import torch from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import ( autoPyTorchTabularPreprocessingComponent ) -# from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers from autoPyTorch.utils.common import FitRequirement, subsampler @@ -52,11 +52,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": numerical_pipeline = 'passthrough' categorical_pipeline = 'passthrough' - # preprocessors = get_tabular_preprocessers(X) - # if len(X['dataset_properties']['numerical_columns']): - # numerical_pipeline = make_pipeline(*preprocessors['numerical']) - # if len(X['dataset_properties']['categorical_columns']): - # categorical_pipeline = make_pipeline(*preprocessors['categorical']) + preprocessors = get_tabular_preprocessers(X) + if len(X['dataset_properties']['numerical_columns']): + numerical_pipeline = make_pipeline(*preprocessors['numerical']) + if len(X['dataset_properties']['categorical_columns']): + categorical_pipeline = make_pipeline(*preprocessors['categorical']) self.preprocessor = ColumnTransformer([ ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']), diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py index d62ee26d2..929e99048 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py @@ -40,7 +40,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - # X.update({'encoder': self.preprocessor}) + X.update({'encoder': self.preprocessor}) return X @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py index 9829cadcd..eadc0a188 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py @@ -28,5 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: raise ValueError("cant call transform on {} without fitting first." .format(self.__class__.__name__)) - # X.update({'encoder': self.preprocessor}) + X.update({'encoder': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py index ac0648481..b65f3c229 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py @@ -29,5 +29,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: raise ValueError("cant call transform on {} without fitting first." .format(self.__class__.__name__)) - # X.update({'imputer': self.preprocessor}) + X.update({'imputer': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py index 9775d17dd..9d50aa8f5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py @@ -43,7 +43,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: np.ndarray: Transformed features """ - # X.update({'scaler': self.preprocessor}) + X.update({'scaler': self.preprocessor}) return X @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py index 270fac246..39834dd2b 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py @@ -28,5 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: raise ValueError("cant call transform on {} without fitting first." .format(self.__class__.__name__)) - # X.update({'scaler': self.preprocessor}) + X.update({'scaler': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 7fbf33f99..c25ea6bb0 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -20,7 +20,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None super().__init__() self.random_state = random_state self.add_fit_requirements([ - FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), + # FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, dataset_property=False)]) @@ -32,14 +32,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "EarlyPreprocessing": def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: transforms = get_preprocess_transforms(X) - if X['dataset_properties']['is_small_preprocess']: - if 'X_train' in X: - X_train = X['X_train'] - else: - # Incorporate the transform to the dataset - X_train = X['backend'].load_datamanager().train_tensors[0] - - X['X_train'] = preprocess(dataset=X_train, transforms=transforms) + if 'X_train' in X: + X_train = X['X_train'] + else: + # Incorporate the transform to the dataset + X_train = X['backend'].load_datamanager().train_tensors[0] + + X['X_train'] = preprocess(dataset=X_train, transforms=transforms) # We need to also save the preprocess transforms for inference X.update({'preprocess_transforms': transforms}) diff --git a/autoPyTorch/pipeline/components/setup/lr_scheduler/base_scheduler.py b/autoPyTorch/pipeline/components/setup/lr_scheduler/base_scheduler.py index 671a70f6a..bc53e2e1f 100644 --- a/autoPyTorch/pipeline/components/setup/lr_scheduler/base_scheduler.py +++ b/autoPyTorch/pipeline/components/setup/lr_scheduler/base_scheduler.py @@ -46,7 +46,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update( lr_scheduler=self.scheduler, step_interval=self.step_interval, - is_cyclic_scheduler= self.get_properties()['cyclic'] + is_cyclic_scheduler=self.get_properties()['cyclic'] ) return X diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index 02782e7a2..7ec872b96 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -1,5 +1,4 @@ -from typing import Any, Dict, Optional, Union -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 1a04d6645..e82f72abb 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -28,7 +28,6 @@ def __init__(self, **kwargs: Any): super().__init__() self.add_fit_requirements([ - FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, dataset_property=False), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), @@ -52,12 +51,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.check_requirements(X, y) X_train = X['X_train'] - if X["dataset_properties"]["is_small_preprocess"]: - input_shape = X_train.shape[1:] - else: - # get input shape by transforming first two elements of the training set - column_transformer = X['tabular_transformer'].preprocessor - input_shape = column_transformer.transform(X_train[:1]).shape[1:] + input_shape = X_train.shape[1:] input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 844a4616b..998055d2b 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,4 +1,4 @@ -# import copy +import copy from typing import Any, Dict, Optional, Tuple import numpy as np @@ -17,11 +17,11 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None): def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - num_numerical_columns, num_input_features = self._get_args(X) + num_numerical_columns, num_input_features = self._get_required_info_from_data(X) self.embedding = self.build_embedding( num_input_features=num_input_features, - num_numerical_features=num_numerical_columns) # type: ignore[arg-type] + num_numerical_features=num_numerical_columns) return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @@ -31,22 +31,39 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: raise NotImplementedError - def _get_args(self, X: Dict[str, Any]) -> Tuple[None, None]: # Tuple[int, np.ndarray]: + def _get_required_info_from_data(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: + """ + Returns the number of numerical columns after preprocessing and + an array of size equal to the number of input features + containing zeros for numerical data and number of categories + for categorical data. This is required to build the embedding. + + Args: + X (Dict[str, Any]): + Fit dictionary + + Returns: + Tuple[int, np.ndarray]: + number of numerical columns and array indicating + number of categories for categorical columns and + 0 for numerical columns + """ # Feature preprocessors can alter numerical columns - # if len(X['dataset_properties']['numerical_columns']) == 0: - # num_numerical_columns = 0 - # else: - # X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - # - # numerical_column_transformer = X['tabular_transformer'].preprocessor. \ - # named_transformers_['numerical_pipeline'] - # num_numerical_columns = numerical_column_transformer.transform( - # X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] - # num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), - # dtype=int) - # categories = X['dataset_properties']['categories'] - # - # for i, category in enumerate(categories): - # num_input_features[num_numerical_columns + i, ] = len(category) - # return num_numerical_columns, num_input_features - return None, None + if len(X['dataset_properties']['numerical_columns']) == 0: + num_numerical_columns = 0 + else: + X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) + + numerical_column_transformer = X['tabular_transformer'].preprocessor. \ + named_transformers_['numerical_pipeline'] + num_numerical_columns = numerical_column_transformer.transform( + X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + + num_cols = num_numerical_columns + len(X['dataset_properties']['categorical_columns']) + num_input_feats = np.zeros(num_cols, dtype=np.int32) + + categories = X['dataset_properties']['categories'] + for idx, cats in enumerate(categories, start=num_numerical_columns): + num_input_feats[idx] = len(cats) + + return num_numerical_columns, num_input_feats diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index 113726870..0cea0b2c7 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -56,8 +56,8 @@ def __init__(self, batch_size: int = 64, # Define fit requirements self.add_fit_requirements([ FitRequirement("split_id", (int,), user_defined=True, dataset_property=False), - FitRequirement("Backend", (Backend,), user_defined=True, dataset_property=False), - FitRequirement("is_small_preprocess", (bool,), user_defined=True, dataset_property=True)]) + FitRequirement("Backend", (Backend,), user_defined=True, dataset_property=False) + ]) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """The transform function calls the transform function of the @@ -102,10 +102,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.val_transform, train=False, ) - if X['dataset_properties']["is_small_preprocess"]: - # This parameter indicates that the data has been pre-processed for speed - # Overwrite the datamanager with the pre-processes data - datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) + # This parameter indicates that the data has been pre-processed for speed + # Overwrite the datamanager with the pre-processes data + datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) train_dataset = datamanager.get_dataset(split_id=X['split_id'], train=True) @@ -221,10 +220,6 @@ def check_requirements(self, X: Dict[str, Any], y: Any = None) -> None: if 'backend' not in X: raise ValueError("backend is needed to load the data from disk") - if 'is_small_preprocess' not in X['dataset_properties']: - raise ValueError("is_small_pre-process is required to know if the data was preprocessed" - " or if the data-loader should transform it while loading a batch") - # We expect this class to be a base for image/tabular/time # And the difference among this data types should be mainly # in the transform, so we delegate for special transformation checking diff --git a/autoPyTorch/pipeline/components/training/data_loader/feature_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/feature_data_loader.py index 4e41ec838..d6f3081a0 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/feature_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/feature_data_loader.py @@ -72,7 +72,7 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform # distinction is performed candidate_transformations: List[Callable] = [] - if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: + if 'test' in mode: candidate_transformations.append((ExpandTransform())) candidate_transformations.extend(X['preprocess_transforms']) candidate_transformations.append((ContractTransform())) @@ -93,5 +93,5 @@ def _check_transform_requirements(self, X: Dict[str, Any], y: Any = None) -> Non mechanism, in which during a transform, a components adds relevant information so that further stages can be properly fitted """ - if not X['dataset_properties']['is_small_preprocess'] and 'preprocess_transforms' not in X: + if 'preprocess_transforms' not in X: raise ValueError("Cannot find the preprocess_transforms in the fit dictionary") diff --git a/autoPyTorch/pipeline/components/training/data_loader/image_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/image_data_loader.py index 21cc05447..38cdd48b0 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/image_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/image_data_loader.py @@ -41,7 +41,7 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform # check if data set is small enough to be preprocessed. # If it is, then no need to add preprocess_transforms to # the data loader as the data is already preprocessed - if 'test' in mode or not X['dataset_properties']['is_small_preprocess']: + if 'test' in mode: transformations.append(X['preprocess_transforms']) # Transform to tensor @@ -63,5 +63,5 @@ def _check_transform_requirements(self, X: Dict[str, Any], y: Any = None) -> Non if not X['image_augmenter'] and 'image_augmenter' not in X: raise ValueError("Cannot find the image_augmenter in the fit dictionary") - if not X['dataset_properties']['is_small_preprocess'] and 'preprocess_transforms' not in X: + if 'preprocess_transforms' not in X: raise ValueError("Cannot find the preprocess_transforms in the fit dictionary") diff --git a/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py b/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py index 67ae71188..fc78e4655 100644 --- a/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py @@ -76,7 +76,7 @@ def criterion_preparation(self, y_a: np.ndarray, y_b: np.ndarray = None, lam: fl # Initial implementation, consider the adversarial loss and the normal network loss # equally. return lambda criterion, pred, adversarial_pred: 0.5 * criterion(pred, y_a) + \ - 0.5 * criterion(adversarial_pred, y_a) + 0.5 * criterion(adversarial_pred, y_a) def train_step(self, data: np.ndarray, targets: np.ndarray) -> Tuple[float, torch.Tensor]: """ diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index 28220fdcd..50869d000 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -86,7 +86,7 @@ def get_fit_requirements(self) -> Optional[List[FitRequirement]]: def get_available_components( self, - dataset_properties: Optional[Dict[str, str]] = None, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, include: Optional[List[str]] = None, exclude: Optional[List[str]] = None, ) -> Dict[str, autoPyTorchComponent]: diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 574b2002f..517ae08bb 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -26,7 +26,6 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler.constants import StepIntervalUnit from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, REGRESSION_METRICS -from autoPyTorch.pipeline.components.training.trainer.utils import Lookahead from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score from autoPyTorch.pipeline.components.training.trainer.utils import Lookahead, swa_update from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index c8e05182c..048514559 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -237,7 +237,7 @@ def test_featurevalidator_categorical_nan(input_data_featuretest): transformed_X = validator.transform(input_data_featuretest) assert any(pd.isna(input_data_featuretest)) categories_ = validator.column_transformer.\ - named_transformers_['categorical_pipeline'].named_steps['onehotencoder'].categories_ + named_transformers_['categorical_pipeline'].named_steps['ordinalencoder'].categories_ assert any(('0' in categories) or (0 in categories) or ('missing_value' in categories) for categories in categories_) assert np.issubdtype(transformed_X.dtype, np.number) @@ -313,9 +313,8 @@ def test_featurevalidator_get_columns_to_encode(): validator.fit(df) - categorical_columns, numerical_columns, feat_type = validator._get_columns_info(df) + categorical_columns, feat_type = validator._get_columns_info(df) - assert numerical_columns == ['int', 'float'] assert categorical_columns == ['category', 'bool'] assert feat_type == ['numerical', 'numerical', 'categorical', 'categorical'] @@ -327,8 +326,8 @@ def feature_validator_remove_nan_catcolumns(df_train: pd.DataFrame, df_test: pd. transformed_df_train = validator.transform(df_train) transformed_df_test = validator.transform(df_test) - assert np.array_equal(transformed_df_train, ans_train) - assert np.array_equal(transformed_df_test, ans_test) + np.testing.assert_array_equal(transformed_df_train, ans_train) + np.testing.assert_array_equal(transformed_df_test, ans_test) def test_feature_validator_remove_nan_catcolumns(): @@ -373,7 +372,7 @@ def test_feature_validator_remove_nan_catcolumns(): ], dtype='category', ) - ans_train = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64) + ans_train = np.array([[1, np.nan, np.nan], [0, np.nan, np.nan], [1, np.nan, np.nan]], dtype=np.float64) df_test = pd.DataFrame( [ {'A': np.nan, 'B': np.nan, 'C': 5}, @@ -382,7 +381,7 @@ def test_feature_validator_remove_nan_catcolumns(): ], dtype='category', ) - ans_test = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float64) + ans_test = np.array([[0, np.nan, np.nan], [0, np.nan, np.nan], [1, np.nan, np.nan]], dtype=np.float64) feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test) # Second case, there exist null columns (B and C) in the training set and @@ -395,7 +394,7 @@ def test_feature_validator_remove_nan_catcolumns(): ], dtype='category', ) - ans_train = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64) + ans_train = np.array([[1, np.nan, np.nan], [0, np.nan, np.nan], [1, np.nan, np.nan]], dtype=np.float64) df_test = pd.DataFrame( [ {'A': np.nan, 'B': np.nan, 'C': np.nan}, @@ -404,7 +403,7 @@ def test_feature_validator_remove_nan_catcolumns(): ], dtype='category', ) - ans_test = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float64) + ans_test = np.array([[0, np.nan, np.nan], [0, np.nan, np.nan], [1, np.nan, np.nan]], dtype=np.float64) feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test) # Third case, there exist no null columns in the training set and @@ -416,7 +415,7 @@ def test_feature_validator_remove_nan_catcolumns(): ], dtype='category', ) - ans_train = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=np.float64) + ans_train = np.array([[0, 0], [1, 1]], dtype=np.float64) df_test = pd.DataFrame( [ {'A': np.nan, 'B': np.nan}, @@ -424,7 +423,7 @@ def test_feature_validator_remove_nan_catcolumns(): ], dtype='category', ) - ans_test = np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=np.float64) + ans_test = np.array([[-1, -1], [-1, -1]], dtype=np.float64) feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test) @@ -504,7 +503,7 @@ def test_column_transformer_created(input_data_featuretest): # Make sure that the encoded features are actually encoded. Categorical columns are at # the start after transformation. In our fixtures, this is also honored prior encode - cat_columns, _, feature_types = validator._get_columns_info(input_data_featuretest) + cat_columns, feature_types = validator._get_columns_info(input_data_featuretest) # At least one categorical assert 'categorical' in validator.feat_type @@ -513,13 +512,20 @@ def test_column_transformer_created(input_data_featuretest): if np.any([pd.api.types.is_numeric_dtype(input_data_featuretest[col] ) for col in input_data_featuretest.columns]): assert 'numerical' in validator.feat_type - # we expect this input to be the fixture 'pandas_mixed_nan' - np.testing.assert_array_equal(transformed_X, np.array([[1., 0., -1.], [0., 1., 1.]])) - else: - np.testing.assert_array_equal(transformed_X, np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]])) - - if not all([feat_type in ['numerical', 'categorical'] for feat_type in feature_types]): - raise ValueError("Expected only numerical and categorical feature types") + for i, feat_type in enumerate(feature_types): + if 'numerical' in feat_type: + np.testing.assert_array_equal( + transformed_X[:, i], + input_data_featuretest[input_data_featuretest.columns[i]].to_numpy() + ) + elif 'categorical' in feat_type: + np.testing.assert_array_equal( + transformed_X[:, i], + # Expect always 0, 1... because we use a ordinal encoder + np.array([0, 1]) + ) + else: + raise ValueError(feat_type) def test_no_new_category_after_fit(): @@ -554,7 +560,7 @@ def test_unknown_encode_value(): # The first row should have a 0, 0 as we added a # new categorical there and one hot encoder marks # it as all zeros for the transformed column - expected_row = [0.0, 0.0, -0.5584294383572701, 0.5000000000000004, -1.5136598016833485] + expected_row = [-1, -41, -3, -987.2] assert expected_row == x_t[0].tolist() @@ -678,16 +684,11 @@ def test_feature_validator_imbalanced_data(): validator.fit(X_train) train_feature_types = copy.deepcopy(validator.feat_type) - assert train_feature_types == ['numerical'] + assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical'] # validator will throw an error if the column types are not the same transformed_X_test = validator.transform(X_test) transformed_X_test = pd.DataFrame(transformed_X_test) assert sorted(validator.all_nan_columns) == sorted(['A', 'C', 'D']) - # as there are no categorical columns, we can make such an - # assertion. We only expect to drop the all nan columns - total_all_nan_columns = len(validator.all_nan_columns) - total_columns = len(validator.column_order) - assert total_columns - total_all_nan_columns == len(transformed_X_test.columns) # Columns with not all null values in the train split and # completely null on the test split. diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py index 08d848e0e..97ef8cdae 100644 --- a/test/test_data/test_validation.py +++ b/test/test_data/test_validation.py @@ -84,7 +84,7 @@ def test_sparse_data_validation_for_regression(): validator.fit(X_train=X_sp, y_train=y) - X_t, y_t = validator.transform(X, y) + X_t, y_t = validator.transform(X_sp, y) # make sure everything was encoded to number assert np.issubdtype(X_t.dtype, np.number) assert np.issubdtype(y_t.dtype, np.number) diff --git a/test/test_datasets/test_tabular_dataset.py b/test/test_datasets/test_tabular_dataset.py index 2ee8b608e..710111f9c 100644 --- a/test/test_datasets/test_tabular_dataset.py +++ b/test/test_datasets/test_tabular_dataset.py @@ -28,7 +28,6 @@ def test_get_dataset_properties(backend, fit_dictionary_tabular): 'categorical_columns', 'numerical_columns', 'issparse', - 'is_small_preprocess', 'task_type', 'output_type', 'input_shape', diff --git a/test/test_evaluation/test_fit_evaluator.py b/test/test_evaluation/test_fit_evaluator.py deleted file mode 100644 index 1515ba74f..000000000 --- a/test/test_evaluation/test_fit_evaluator.py +++ /dev/null @@ -1,206 +0,0 @@ -import multiprocessing -import os -import queue -import shutil -import sys -import unittest -import unittest.mock - -from ConfigSpace import Configuration - -import numpy as np - -from sklearn.base import BaseEstimator - -from smac.tae import StatusType - -from autoPyTorch.automl_common.common.utils.backend import create -from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes -from autoPyTorch.evaluation.fit_evaluator import FitEvaluator -from autoPyTorch.evaluation.utils import read_queue -from autoPyTorch.pipeline.base_pipeline import BasePipeline -from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy - -this_directory = os.path.dirname(__file__) -sys.path.append(this_directory) -from evaluation_util import ( # noqa (E402: module level import not at top of file) - BaseEvaluatorTest, - get_binary_classification_datamanager, - get_multiclass_classification_datamanager, - get_regression_datamanager, -) # noqa (E402: module level import not at top of file) - - -class BackendMock(object): - def load_datamanager(self): - return get_multiclass_classification_datamanager() - - -class Dummy(object): - def __init__(self): - self.name = 'dummy' - - -class DummyPipeline(BasePipeline): - def __init__(self): - mocked_estimator = unittest.mock.Mock(spec=BaseEstimator) - self.steps = [('MockStep', mocked_estimator)] - pass - - def predict_proba(self, X, batch_size=None): - return np.tile([0.6, 0.4], (len(X), 1)) - - def get_additional_run_info(self): - return {} - - -class TestFitEvaluator(BaseEvaluatorTest, unittest.TestCase): - _multiprocess_can_split_ = True - - def setUp(self): - """ - Creates a backend mock - """ - tmp_dir_name = self.id() - self.ev_path = os.path.join(this_directory, '.tmp_evaluations', tmp_dir_name) - if os.path.exists(self.ev_path): - shutil.rmtree(self.ev_path) - os.makedirs(self.ev_path, exist_ok=False) - dummy_model_files = [os.path.join(self.ev_path, str(n)) for n in range(100)] - dummy_pred_files = [os.path.join(self.ev_path, str(n)) for n in range(100, 200)] - dummy_cv_model_files = [os.path.join(self.ev_path, str(n)) for n in range(200, 300)] - backend_mock = unittest.mock.Mock() - backend_mock.get_model_dir.return_value = self.ev_path - backend_mock.get_cv_model_dir.return_value = self.ev_path - backend_mock.get_model_path.side_effect = dummy_model_files - backend_mock.get_cv_model_path.side_effect = dummy_cv_model_files - backend_mock.get_prediction_output_path.side_effect = dummy_pred_files - backend_mock.temporary_directory = self.ev_path - self.backend_mock = backend_mock - - self.tmp_dir = os.path.join(self.ev_path, 'tmp_dir') - self.output_dir = os.path.join(self.ev_path, 'out_dir') - - def tearDown(self): - if os.path.exists(self.ev_path): - shutil.rmtree(self.ev_path) - - @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline') - def test_no_resampling(self, pipeline_mock): - # Binary iris, contains 69 train samples, 31 test samples - D = get_binary_classification_datamanager(NoResamplingStrategyTypes.no_resampling) - pipeline_mock.predict_proba.side_effect = \ - lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) - pipeline_mock.side_effect = lambda **kwargs: pipeline_mock - pipeline_mock.get_additional_run_info.return_value = None - pipeline_mock.get_default_pipeline_options.return_value = {'budget_type': 'epochs', 'epochs': 10} - - configuration = unittest.mock.Mock(spec=Configuration) - backend_api = create(self.tmp_dir, self.output_dir, 'autoPyTorch') - backend_api.load_datamanager = lambda: D - queue_ = multiprocessing.Queue() - - evaluator = FitEvaluator(backend_api, queue_, configuration=configuration, metric=accuracy, budget=0) - evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) - evaluator.file_output.return_value = (None, {}) - - evaluator.fit_predict_and_loss() - - rval = read_queue(evaluator.queue) - self.assertEqual(len(rval), 1) - result = rval[0]['loss'] - self.assertEqual(len(rval[0]), 3) - self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) - - self.assertEqual(evaluator.file_output.call_count, 1) - self.assertEqual(result, 0.5806451612903225) - self.assertEqual(pipeline_mock.fit.call_count, 1) - # 2 calls because of train and test set - self.assertEqual(pipeline_mock.predict_proba.call_count, 2) - self.assertEqual(evaluator.file_output.call_count, 1) - # Should be none as no val preds are mentioned - self.assertIsNone(evaluator.file_output.call_args[0][0]) - # Number of y_test_preds and Y_test should be the same - self.assertEqual(evaluator.file_output.call_args[0][2].shape[0], - D.test_tensors[1].shape[0]) - self.assertEqual(evaluator.pipeline.fit.call_count, 1) - - @unittest.mock.patch.object(FitEvaluator, '_loss') - def test_file_output(self, loss_mock): - - D = get_regression_datamanager(NoResamplingStrategyTypes.no_resampling) - D.name = 'test' - self.backend_mock.load_datamanager.return_value = D - configuration = unittest.mock.Mock(spec=Configuration) - queue_ = multiprocessing.Queue() - loss_mock.return_value = None - - evaluator = FitEvaluator(self.backend_mock, queue_, configuration=configuration, metric=accuracy, budget=0) - - self.backend_mock.get_model_dir.return_value = True - evaluator.pipeline = 'model' - evaluator.Y_optimization = D.train_tensors[1] - rval = evaluator.file_output( - D.train_tensors[1], - None, - D.test_tensors[1], - ) - - self.assertEqual(rval, (None, {})) - # These targets are not saved as Fit evaluator is not used to make an ensemble - self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, 0) - self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 1) - self.assertEqual(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(), - {'seed', 'idx', 'budget', 'model', 'cv_model', - 'ensemble_predictions', 'valid_predictions', 'test_predictions'}) - self.assertIsNotNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model']) - self.assertIsNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model']) - - # Check for not containing NaNs - that the models don't predict nonsense - # for unseen data - D.test_tensors[1][0] = np.NaN - rval = evaluator.file_output( - D.train_tensors[1], - None, - D.test_tensors[1], - ) - self.assertEqual( - rval, - ( - 1.0, - { - 'error': - 'Model predictions for test set contains NaNs.' - }, - ) - ) - - @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline') - def test_predict_proba_binary_classification(self, mock): - D = get_binary_classification_datamanager(NoResamplingStrategyTypes.no_resampling) - self.backend_mock.load_datamanager.return_value = D - mock.predict_proba.side_effect = lambda y, batch_size=None: np.array( - [[0.1, 0.9]] * y.shape[0] - ) - mock.side_effect = lambda **kwargs: mock - mock.get_default_pipeline_options.return_value = {'budget_type': 'epochs', 'epochs': 10} - configuration = unittest.mock.Mock(spec=Configuration) - queue_ = multiprocessing.Queue() - - evaluator = FitEvaluator(self.backend_mock, queue_, configuration=configuration, metric=accuracy, budget=0) - - evaluator.fit_predict_and_loss() - Y_test_pred = self.backend_mock.save_numrun_to_dir.call_args_list[0][1][ - 'test_predictions'] - - for i in range(7): - self.assertEqual(0.9, Y_test_pred[i][1]) - - def test_get_results(self): - queue_ = multiprocessing.Queue() - for i in range(5): - queue_.put((i * 1, 1 - (i * 0.2), 0, "", StatusType.SUCCESS)) - result = read_queue(queue_) - self.assertEqual(len(result), 5) - self.assertEqual(result[0][0], 0) - self.assertAlmostEqual(result[0][1], 1.0) diff --git a/test/test_pipeline/components/preprocessing/test_encoders.py b/test/test_pipeline/components/preprocessing/test_encoders.py index ac796291c..a901823ba 100644 --- a/test/test_pipeline/components/preprocessing/test_encoders.py +++ b/test/test_pipeline/components/preprocessing/test_encoders.py @@ -10,8 +10,6 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OneHotEncoder import OneHotEncoder -# TODO: fix in preprocessing PR -@unittest.skip("Skipping tests as preprocessing is not finalised") class TestEncoders(unittest.TestCase): def test_one_hot_encoder_no_unknown(self): diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py index d2de6d7d3..18b43bfa6 100644 --- a/test/test_pipeline/components/preprocessing/test_imputers.py +++ b/test/test_pipeline/components/preprocessing/test_imputers.py @@ -11,8 +11,6 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -# TODO: fix in preprocessing PR -@unittest.skip("Skipping tests as preprocessing is not finalised") class TestSimpleImputer(unittest.TestCase): def test_get_config_space(self): diff --git a/test/test_pipeline/components/preprocessing/test_scalers.py b/test/test_pipeline/components/preprocessing/test_scalers.py index cd41308fa..94ba0f2dc 100644 --- a/test/test_pipeline/components/preprocessing/test_scalers.py +++ b/test/test_pipeline/components/preprocessing/test_scalers.py @@ -12,8 +12,6 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.StandardScaler import StandardScaler -# TODO: fix in preprocessing PR -@unittest.skip("Skipping tests as preprocessing is not finalised") class TestNormalizer(unittest.TestCase): def test_l2_norm(self): @@ -131,8 +129,6 @@ def test_max_norm(self): [0.84615385, 0.92307692, 1]])) -# TODO: fix in preprocessing PR -@unittest.skip("Skipping tests as preprocessing is not finalised") class TestMinMaxScaler(unittest.TestCase): def test_minmax_scaler(self): @@ -174,8 +170,6 @@ def test_minmax_scaler(self): [0.76923077, 0.76923077, 0.76923077]])) -# TODO: fix in preprocessing PR -@unittest.skip("Skipping tests as preprocessing is not finalised") class TestStandardScaler(unittest.TestCase): def test_standard_scaler(self): @@ -218,8 +212,6 @@ def test_standard_scaler(self): [0.8396642, 0.8396642, 0.8396642]])) -# TODO: fix in preprocessing PR -@unittest.skip("Skipping tests as preprocessing is not finalised") class TestNoneScaler(unittest.TestCase): def test_none_scaler(self): diff --git a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py index d7a59383c..c4d8ccd50 100644 --- a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py +++ b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py @@ -14,13 +14,14 @@ # TODO: fix in preprocessing PR -@pytest.mark.skip("Skipping tests as preprocessing is not finalised") +# @pytest.mark.skip("Skipping tests as preprocessing is not finalised") @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only', 'classification_categorical_only', 'classification_numerical_and_categorical'], indirect=True) class TestTabularTransformer: def test_tabular_preprocess(self, fit_dictionary_tabular): pipeline = TabularPipeline(dataset_properties=fit_dictionary_tabular['dataset_properties']) + X_train = fit_dictionary_tabular['X_train'].copy() pipeline = pipeline.fit(fit_dictionary_tabular) X = pipeline.transform(fit_dictionary_tabular) column_transformer = X['tabular_transformer'] @@ -32,17 +33,17 @@ def test_tabular_preprocess(self, fit_dictionary_tabular): # as the later is not callable and runs into error in the compose transform assert isinstance(column_transformer, TabularColumnTransformer) - data = column_transformer.preprocessor.fit_transform(X['X_train']) + data = column_transformer.preprocessor.fit_transform(X_train) assert isinstance(data, np.ndarray) # Make sure no columns are unintentionally dropped after preprocessing if len(fit_dictionary_tabular['dataset_properties']["numerical_columns"]) == 0: categorical_pipeline = column_transformer.preprocessor.named_transformers_['categorical_pipeline'] - categorical_data = categorical_pipeline.transform(X['X_train']) + categorical_data = categorical_pipeline.transform(X_train) assert data.shape[1] == categorical_data.shape[1] elif len(fit_dictionary_tabular['dataset_properties']["categorical_columns"]) == 0: numerical_pipeline = column_transformer.preprocessor.named_transformers_['numerical_pipeline'] - numerical_data = numerical_pipeline.transform(X['X_train']) + numerical_data = numerical_pipeline.transform(X_train) assert data.shape[1] == numerical_data.shape[1] def test_sparse_data(self, fit_dictionary_tabular): diff --git a/test/test_pipeline/components/training/test_feature_data_loader.py b/test/test_pipeline/components/training/test_feature_data_loader.py index 7d4c9d80d..7e97494a4 100644 --- a/test/test_pipeline/components/training/test_feature_data_loader.py +++ b/test/test_pipeline/components/training/test_feature_data_loader.py @@ -9,13 +9,13 @@ class TestFeatureDataLoader(unittest.TestCase): - def test_build_transform_small_preprocess_true(self): + def test_build_transform(self): """ Makes sure a proper composition is created """ loader = FeatureDataLoader() - fit_dictionary = {'dataset_properties': {'is_small_preprocess': True}} + fit_dictionary = {'dataset_properties': {}} for thing in ['imputer', 'scaler', 'encoder']: fit_dictionary[thing] = [unittest.mock.Mock()] @@ -25,19 +25,3 @@ def test_build_transform_small_preprocess_true(self): # No preprocessing needed here as it was done before self.assertEqual(len(compose.transforms), 1) - - def test_build_transform_small_preprocess_false(self): - """ - Makes sure a proper composition is created - """ - loader = FeatureDataLoader() - - fit_dictionary = {'dataset_properties': {'is_small_preprocess': False}, - 'preprocess_transforms': [unittest.mock.Mock()]} - - compose = loader.build_transform(fit_dictionary, mode='train') - - self.assertIsInstance(compose, torchvision.transforms.Compose) - - # We expect the to tensor, the preproces transforms and the check_array - self.assertEqual(len(compose.transforms), 4) diff --git a/test/test_pipeline/components/training/test_training.py b/test/test_pipeline/components/training/test_training.py index 034ea71d7..5b7be7f49 100644 --- a/test/test_pipeline/components/training/test_training.py +++ b/test/test_pipeline/components/training/test_training.py @@ -93,12 +93,6 @@ def test_check_requirements(self): 'backend is needed to load the data from'): loader.fit(fit_dictionary) - # Then the is small fit - fit_dictionary.update({'backend': unittest.mock.Mock()}) - with self.assertRaisesRegex(ValueError, - 'is_small_pre-process is required to know if th'): - loader.fit(fit_dictionary) - def test_fit_transform(self): """ Makes sure that fit and transform work as intended """ backend = unittest.mock.Mock()