diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 48f395609..e9344e213 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -101,7 +101,10 @@ class BaseTask: Base class for the tasks that serve as API to the pipelines. Args: seed (int), (default=1): seed to be used for reproducibility. - n_jobs (int), (default=1): number of consecutive processes to spawn. + n_jobs (int), (default=1): + number of consecutive processes to spawn. + n_threads (int), (default=1): + number of threads to use for each process. logging_config (Optional[Dict]): specifies configuration for logging, if None, it is loaded from the logging.yaml ensemble_size (int), (default=50): Number of models added to the ensemble built by @@ -138,6 +141,7 @@ def __init__( self, seed: int = 1, n_jobs: int = 1, + n_threads: int = 1, logging_config: Optional[Dict] = None, ensemble_size: int = 50, ensemble_nbest: int = 50, @@ -158,6 +162,7 @@ def __init__( ) -> None: self.seed = seed self.n_jobs = n_jobs + self.n_threads = n_threads self.ensemble_size = ensemble_size self.ensemble_nbest = ensemble_nbest self.max_models_on_disc = max_models_on_disc @@ -189,6 +194,9 @@ def __init__( self.trajectory: Optional[List] = None self.dataset_name: Optional[str] = None self.cv_models_: Dict = {} + self.precision: Optional[int] = None + self.opt_metric: Optional[str] = None + self.dataset: Optional[BaseDataset] = None # By default try to use the TCP logging port or get a new port self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT @@ -394,6 +402,7 @@ def _clean_logger(self) -> None: self.logging_server.join(timeout=5) self.logging_server.terminate() del self.stop_logging_server + self._logger = None def _create_dask_client(self) -> None: """ @@ -408,7 +417,7 @@ def _create_dask_client(self) -> None: dask.distributed.LocalCluster( n_workers=self.n_jobs, processes=True, - threads_per_worker=1, + threads_per_worker=self.n_threads, # We use the temporal directory to save the # dask workers, because deleting workers # more time than deleting backend directories @@ -488,6 +497,23 @@ def _load_models(self) -> bool: return True + def _cleanup(self) -> None: + """ + Closes the different servers created during api search. + Returns: + None + """ + if hasattr(self, '_logger') and self._logger is not None: + self._logger.info("Closing the dask infrastructure") + self._close_dask_client() + self._logger.info("Finished closing the dask infrastructure") + + # Clean up the logger + self._logger.info("Starting to clean up the logger") + self._clean_logger() + else: + self._close_dask_client() + def _load_best_individual_model(self) -> SingleBest: """ In case of failure during ensemble building, @@ -725,6 +751,38 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: save_external=True) return + def run_traditional_ml( + self, + current_task_name: str, + runtime_limit: int, + func_eval_time_limit_secs: int + ) -> None: + """ + This function can be used to run the suite of traditional machine + learning models during the current task (for e.g, ensemble fit, search) + + Args: + current_task_name (str): name of the current task, + runtime_limit (int): time limit for fitting traditional models, + func_eval_time_limit_secs (int): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. + """ + assert self._logger is not None # for mypy compliancy + if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS: + self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...") + else: + traditional_task_name = 'runTraditional' + self._stopwatch.start_task(traditional_task_name) + elapsed_time = self._stopwatch.wall_elapsed(current_task_name) + time_for_traditional = int(runtime_limit - elapsed_time) + self._do_traditional_prediction( + func_eval_time_limit_secs=func_eval_time_limit_secs, + time_left=time_for_traditional, + ) + self._stopwatch.stop_task(traditional_task_name) + def _search( self, optimize_metric: str, @@ -819,8 +877,10 @@ def _search( if self.task_type != dataset.task_type: raise ValueError("Incompatible dataset entered for current task," - "expected dataset to have task type :{} got " + "expected dataset to have task type :{} but got " ":{}".format(self.task_type, dataset.task_type)) + if precision not in [16, 32, 64]: + raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision)) # Initialise information needed for the experiment experiment_task_name: str = 'runSearch' @@ -866,7 +926,7 @@ def _search( # If no dask client was provided, we create one, so that we can # start a ensemble process in parallel to smbo optimize if ( - self._dask_client is None and (self.ensemble_size > 0 or self.n_jobs is not None and self.n_jobs > 1) + self._dask_client is None and (self.ensemble_size > 0 or self.n_jobs > 1) ): self._create_dask_client() else: @@ -895,31 +955,25 @@ def _search( ) # ============> Run dummy predictions - dummy_task_name = 'runDummy' - self._stopwatch.start_task(dummy_task_name) - self._do_dummy_prediction() - self._stopwatch.stop_task(dummy_task_name) + # We only want to run dummy predictions in case we want to build an ensemble + if self.ensemble_size > 0: + dummy_task_name = 'runDummy' + self._stopwatch.start_task(dummy_task_name) + self._do_dummy_prediction() + self._stopwatch.stop_task(dummy_task_name) # ============> Run traditional ml - - if enable_traditional_pipeline: - if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS: - self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...") - else: - traditional_task_name = 'runTraditional' - self._stopwatch.start_task(traditional_task_name) - elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) - # We want time for at least 1 Neural network in SMAC - time_for_traditional = int( - self._time_for_task - elapsed_time - func_eval_time_limit_secs - ) - self._do_traditional_prediction( - func_eval_time_limit_secs=func_eval_time_limit_secs, - time_left=time_for_traditional, - ) - self._stopwatch.stop_task(traditional_task_name) + # We only want to run traditional predictions in case we want to build an ensemble + # We want time for at least 1 Neural network in SMAC + if enable_traditional_pipeline and self.ensemble_size > 0: + traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs) + self.run_traditional_ml(current_task_name=self.dataset_name, + runtime_limit=traditional_runtime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs) # ============> Starting ensemble + self.precision = precision + self.opt_metric = optimize_metric elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) time_left_for_ensembles = max(0, total_walltime_limit - elapsed_time) proc_ensemble = None @@ -936,26 +990,12 @@ def _search( self._logger.info("Starting ensemble") ensemble_task_name = 'ensemble' self._stopwatch.start_task(ensemble_task_name) - proc_ensemble = EnsembleBuilderManager( - start_time=time.time(), - time_left_for_ensembles=time_left_for_ensembles, - backend=copy.deepcopy(self._backend), - dataset_name=str(dataset.dataset_name), - output_type=STRING_TO_OUTPUT_TYPES[dataset.output_type], - task_type=STRING_TO_TASK_TYPES[self.task_type], - metrics=[self._metric], - opt_metric=optimize_metric, - ensemble_size=self.ensemble_size, - ensemble_nbest=self.ensemble_nbest, - max_models_on_disc=self.max_models_on_disc, - seed=self.seed, - max_iterations=None, - read_at_most=sys.maxsize, - ensemble_memory_limit=self._memory_limit, - random_state=self.seed, - precision=precision, - logger_port=self._logger_port, - ) + proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, + ensemble_size=self.ensemble_size, + ensemble_nbest=self.ensemble_nbest, + precision=precision, + optimize_metric=self.opt_metric + ) self._stopwatch.stop_task(ensemble_task_name) # ==> Run SMAC @@ -1035,18 +1075,12 @@ def _search( pd.DataFrame(self.ensemble_performance_history).to_json( os.path.join(self._backend.internals_directory, 'ensemble_history.json')) - self._logger.info("Closing the dask infrastructure") - self._close_dask_client() - self._logger.info("Finished closing the dask infrastructure") - if load_models: self._logger.info("Loading models...") self._load_models() self._logger.info("Finished loading models...") - # Clean up the logger - self._logger.info("Starting to clean up the logger") - self._clean_logger() + self._cleanup() return self @@ -1241,7 +1275,7 @@ def fit_pipeline(self, dataset_requirements = get_dataset_requirements( info=self._get_required_dataset_properties(dataset)) dataset_properties = dataset.get_dataset_properties(dataset_requirements) - self._backend.save_datamanager(dataset) + self._backend.replace_datamanager(dataset) if self._logger is None: self._logger = self._get_logger(dataset.dataset_name) @@ -1333,6 +1367,207 @@ def fit_pipeline(self, return fitted_pipeline, run_info, run_value, dataset + def fit_ensemble( + self, + optimize_metric: Optional[str] = None, + precision: Optional[int] = None, + ensemble_nbest: int = 50, + ensemble_size: int = 50, + load_models: bool = True, + time_for_task: int = 100, + func_eval_time_limit_secs: int = 50, + enable_traditional_pipeline: bool = True, + ) -> 'BaseTask': + """ + Enables post-hoc fitting of the ensemble after the `search()` + method is finished. This method creates an ensemble using all + the models stored on disk during the smbo run. + + Args: + optimize_metric (str): name of the metric that is used to + evaluate a pipeline. if not specified, value passed to search will be used + precision (int), (default=32): Numeric precision used when loading + ensemble data. Can be either 16, 32 or 64. + ensemble_nbest (Optional[int]): + only consider the ensemble_nbest models to build the ensemble. + If None, uses the value stored in class attribute `ensemble_nbest`. + ensemble_size (int) (default=50): + Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + enable_traditional_pipeline (bool), (default=True): + We fit traditional machine learning algorithms + (LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM) + prior building PyTorch Neural Networks. You can disable this + feature by turning this flag to False. All machine learning + algorithms that are fitted during search() are considered for + ensemble building. + load_models (bool), (default=True): Whether to load the + models after fitting AutoPyTorch. + time_for_task (int), (default=100): Time limit + in seconds for the search of appropriate models. + By increasing this value, autopytorch has a higher + chance of finding better models. + func_eval_time_limit_secs (int), (default=None): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. Set + this value high enough so that typical machine + learning algorithms can be fit on the training + data. + When set to None, this time will automatically be set to + total_walltime_limit // 2 to allow enough time to fit + at least 2 individual machine learning algorithms. + Set to np.inf in case no time limit is desired. + + Returns: + self + """ + # Make sure that input is valid + if self.dataset is None or self.opt_metric is None: + raise ValueError("fit_ensemble() can only be called after `search()`. " + "Please call the `search()` method of {} prior to " + "fit_ensemble().".format(self.__class__.__name__)) + + if precision not in [16, 32, 64]: + raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision)) + + if self._logger is None: + self._logger = self._get_logger(self.dataset.dataset_name) + + # Create a client if needed + if self._dask_client is None: + self._create_dask_client() + else: + self._is_dask_client_internally_created = False + + ensemble_fit_task_name = 'EnsembleFit' + self._stopwatch.start_task(ensemble_fit_task_name) + if enable_traditional_pipeline: + if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_for_task: + self._logger.warning( + 'Time limit for a single run is higher than total time ' + 'limit. Capping the limit for a single run to the total ' + 'time given to Ensemble fit (%f)' % time_for_task + ) + func_eval_time_limit_secs = time_for_task + + # Make sure that at least 2 models are created for the ensemble process + num_models = time_for_task // func_eval_time_limit_secs + if num_models < 2: + func_eval_time_limit_secs = time_for_task // 2 + self._logger.warning( + "Capping the func_eval_time_limit_secs to {} to have " + "time for at least 2 models to ensemble.".format( + func_eval_time_limit_secs + ) + ) + # ============> Run Dummy predictions + dummy_task_name = 'runDummy' + self._stopwatch.start_task(dummy_task_name) + self._do_dummy_prediction() + self._stopwatch.stop_task(dummy_task_name) + + # ============> Run traditional ml + if enable_traditional_pipeline: + self.run_traditional_ml(current_task_name=ensemble_fit_task_name, + runtime_limit=time_for_task, + func_eval_time_limit_secs=func_eval_time_limit_secs) + + elapsed_time = self._stopwatch.wall_elapsed(ensemble_fit_task_name) + time_left_for_ensemble = int(time_for_task - elapsed_time) + manager = self._init_ensemble_builder( + time_left_for_ensembles=time_left_for_ensemble, + optimize_metric=self.opt_metric if optimize_metric is None else optimize_metric, + precision=self.precision if precision is None else precision, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + ) + + manager.build_ensemble(self._dask_client) + future = manager.futures.pop() + result = future.result() + if result is None: + raise ValueError("Errors occurred while building the ensemble - please" + " check the log file and command line output for error messages.") + self.ensemble_performance_history, _, _, _ = result + + if load_models: + self._load_models() + + self._stopwatch.stop_task(ensemble_fit_task_name) + + self._cleanup() + + return self + + def _init_ensemble_builder( + self, + time_left_for_ensembles: float, + optimize_metric: str, + ensemble_nbest: int, + ensemble_size: int, + precision: int = 32, + ) -> EnsembleBuilderManager: + """ + Initializes an `EnsembleBuilderManager`. + Args: + time_left_for_ensembles (float): + Time (in seconds) allocated to building the ensemble + optimize_metric (str): + Name of the metric to optimize the ensemble. + ensemble_nbest (int): + only consider the ensemble_nbest models to build the ensemble. + ensemble_size (int): + Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + precision (int), (default=32): Numeric precision used when loading + ensemble data. Can be either 16, 32 or 64. + + Returns: + EnsembleBuilderManager + """ + if self._logger is None: + raise ValueError("logger should be initialized to fit ensemble") + if self.dataset is None: + raise ValueError("ensemble can only be initialised after or during `search()`. " + "Please call the `search()` method of {}.".format(self.__class__.__name__)) + + self._logger.info("Starting ensemble") + ensemble_task_name = 'ensemble' + self._stopwatch.start_task(ensemble_task_name) + + # Use the current thread to start the ensemble builder process + # The function ensemble_builder_process will internally create a ensemble + # builder in the provide dask client + required_dataset_properties = {'task_type': self.task_type, + 'output_type': self.dataset.output_type} + proc_ensemble = EnsembleBuilderManager( + start_time=time.time(), + time_left_for_ensembles=time_left_for_ensembles, + backend=copy.deepcopy(self._backend), + dataset_name=str(self.dataset.dataset_name), + output_type=STRING_TO_OUTPUT_TYPES[self.dataset.output_type], + task_type=STRING_TO_TASK_TYPES[self.task_type], + metrics=[self._metric] if self._metric is not None else get_metrics( + dataset_properties=required_dataset_properties, names=[optimize_metric]), + opt_metric=optimize_metric, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=self.max_models_on_disc, + seed=self.seed, + max_iterations=None, + read_at_most=sys.maxsize, + ensemble_memory_limit=self._memory_limit, + random_state=self.seed, + precision=precision, + logger_port=self._logger_port, + ) + self._stopwatch.stop_task(ensemble_task_name) + + return proc_ensemble + def predict( self, X_test: np.ndarray, @@ -1382,7 +1617,7 @@ def predict( predictions = self.ensemble_.predict(all_predictions) - self._clean_logger() + self._cleanup() return predictions @@ -1419,10 +1654,7 @@ def __getstate__(self) -> Dict[str, Any]: return self.__dict__ def __del__(self) -> None: - # Clean up the logger - self._clean_logger() - - self._close_dask_client() + self._cleanup() # When a multiprocessing work is done, the # objects are deleted. We don't want to delete run areas diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 209853c6e..b261f3b0f 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -27,11 +27,14 @@ class TabularClassificationTask(BaseTask): """ Tabular Classification API to the pipelines. + Args: seed (int): seed to be used for reproducibility. n_jobs (int), (default=1): number of consecutive processes to spawn. + n_threads (int), (default=1): + number of threads to use for each process. logging_config (Optional[Dict]): specifies configuration for logging, if None, it is loaded from the logging.yaml ensemble_size (int), (default=50): @@ -63,6 +66,7 @@ def __init__( self, seed: int = 1, n_jobs: int = 1, + n_threads: int = 1, logging_config: Optional[Dict] = None, ensemble_size: int = 50, ensemble_nbest: int = 50, @@ -83,6 +87,7 @@ def __init__( super().__init__( seed=seed, n_jobs=n_jobs, + n_threads=n_threads, logging_config=logging_config, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, @@ -277,6 +282,8 @@ def search( y_test=y_test, dataset_name=dataset_name) + if self.dataset is None: + raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__)) return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index ed7e1c4f6..cbaaa9099 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -27,9 +27,13 @@ class TabularRegressionTask(BaseTask): """ Tabular Regression API to the pipelines. + Args: seed (int): seed to be used for reproducibility. - n_jobs (int), (default=1): number of consecutive processes to spawn. + n_jobs (int), (default=1): + number of consecutive processes to spawn. + n_threads (int), (default=1): + number of threads to use for each process. logging_config (Optional[Dict]): specifies configuration for logging, if None, it is loaded from the logging.yaml ensemble_size (int), (default=50): Number of models added to the ensemble built by @@ -50,11 +54,11 @@ class TabularRegressionTask(BaseTask): Otherwise specifies set of components not to use. Incompatible with include components """ - def __init__( self, seed: int = 1, n_jobs: int = 1, + n_threads: int = 1, logging_config: Optional[Dict] = None, ensemble_size: int = 50, ensemble_nbest: int = 50, @@ -75,6 +79,7 @@ def __init__( super().__init__( seed=seed, n_jobs=n_jobs, + n_threads=n_threads, logging_config=logging_config, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, @@ -263,6 +268,8 @@ def search( y_test=y_test, dataset_name=dataset_name) + if self.dataset is None: + raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__)) return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 0106a3aa8..700af1050 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -1,5 +1,5 @@ import logging -import typing +from typing import List, Optional, Set, Tuple, Union import numpy as np @@ -12,8 +12,8 @@ from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_FEAT_TYPES = typing.Union[ - typing.List, +SUPPORTED_FEAT_TYPES = Union[ + List, pd.DataFrame, np.ndarray, scipy.sparse.bsr_matrix, @@ -29,66 +29,64 @@ class BaseFeatureValidator(BaseEstimator): """ A class to pre-process features. In this regards, the format of the data is checked, - and if applicable, features are encoded + and if applicable, features are encoded. + Attributes: feat_type (List[str]): List of the column types found by this estimator during fit. data_type (str): Class name of the data type provided during fit. - encoder (typing.Optional[BaseEstimator]) + encoder (Optional[BaseEstimator]) Host a encoder object if the data requires transformation (for example, - if provided a categorical column in a pandas DataFrame) - enc_columns (typing.List[str]) - List of columns that were encoded. + if provided a categorical column in a pandas DataFrame). """ - def __init__(self, - logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger - ]] = None, - ) -> None: + def __init__( + self, + logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None, + ) -> None: # Register types to detect unsupported data format changes - self.feat_type = None # type: typing.Optional[typing.List[str]] - self.data_type = None # type: typing.Optional[type] - self.dtypes = [] # type: typing.List[str] - self.column_order = [] # type: typing.List[str] + self.feat_type: Optional[List[str]] = None + self.data_type: Optional[type] = None + self.dtypes: List[str] = [] + self.column_order: List[str] = [] - self.encoder = None # type: typing.Optional[BaseEstimator] - self.enc_columns = [] # type: typing.List[str] + self.column_transformer: Optional[BaseEstimator] = None - self.logger: typing.Union[ + self.logger: Union[ PicklableClientLogger, logging.Logger ] = logger if logger is not None else logging.getLogger(__name__) # Required for dataset properties - self.num_features = None # type: typing.Optional[int] - self.categories = [] # type: typing.List[typing.List[int]] - self.categorical_columns: typing.List[int] = [] - self.numerical_columns: typing.List[int] = [] - # column identifiers may be integers or strings - self.null_columns: typing.Set[str] = set() + self.num_features: Optional[int] = None + self.categories: List[List[int]] = [] + self.categorical_columns: List[int] = [] + self.numerical_columns: List[int] = [] + + self.all_nan_columns: Optional[Set[Union[int, str]]] = None self._is_fitted = False def fit( self, X_train: SUPPORTED_FEAT_TYPES, - X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, + X_test: Optional[SUPPORTED_FEAT_TYPES] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. CSR sparse data types are also supported - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (typing.Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SUPPORTED_FEAT_TYPES]): A hold out set of data used for checking """ # If a list was provided, it will be converted to pandas if isinstance(X_train, list): - X_train, X_test = self.list_to_dataframe(X_train, X_test) + X_train, X_test = self.list_to_pandas(X_train, X_test) self._check_data(X_train) @@ -114,7 +112,7 @@ def _fit( X: SUPPORTED_FEAT_TYPES, ) -> BaseEstimator: """ - Arguments: + Args: X (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding @@ -122,6 +120,7 @@ def _fit( self: The fitted base estimator """ + raise NotImplementedError() def _check_data( @@ -131,11 +130,12 @@ def _check_data( """ Feature dimensionality and data type checks - Arguments: + Args: X (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding """ + raise NotImplementedError() def transform( @@ -143,7 +143,7 @@ def transform( X: SUPPORTED_FEAT_TYPES, ) -> np.ndarray: """ - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features, whose categorical features are going to be transformed @@ -152,4 +152,30 @@ def transform( np.ndarray: The transformed array """ + + raise NotImplementedError() + + def list_to_pandas( + self, + X_train: SUPPORTED_FEAT_TYPES, + X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: + """ + Converts a list to a pandas DataFrame. In this process, column types are inferred. + + If test data is provided, we proactively match it to train data + + Args: + X_train (SUPPORTED_FEAT_TYPES): + A set of features that are going to be validated (type and dimensionality + checks) and a encoder fitted in the case the data needs encoding + X_test (Optional[SUPPORTED_FEAT_TYPES]): + A hold out set of data used for checking + Returns: + pd.DataFrame: + transformed train data from list to pandas DataFrame + pd.DataFrame: + transformed test data from list to pandas DataFrame + """ + raise NotImplementedError() diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index dba9c19e3..f191e985b 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -1,5 +1,5 @@ import logging -import typing +from typing import List, Optional, Union, cast import numpy as np @@ -12,8 +12,8 @@ from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_TARGET_TYPES = typing.Union[ - typing.List, +SUPPORTED_TARGET_TYPES = Union[ + List, pd.Series, pd.DataFrame, np.ndarray, @@ -35,48 +35,50 @@ class BaseTargetValidator(BaseEstimator): is_classification (bool): A bool that indicates if the validator should operate in classification mode. During classification, the targets are encoded. - encoder (typing.Optional[BaseEstimator]): + encoder (Optional[BaseEstimator]): Host a encoder object if the data requires transformation (for example, if provided a categorical column in a pandas DataFrame) - enc_columns (typing.List[str]) + enc_columns (List[str]) List of columns that where encoded """ def __init__(self, is_classification: bool = False, - logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger - ]] = None, + logger: Optional[Union[PicklableClientLogger, + logging.Logger + ] + ] = None, ) -> None: self.is_classification = is_classification - self.data_type = None # type: typing.Optional[type] + self.data_type: Optional[type] = None - self.encoder = None # type: typing.Optional[BaseEstimator] + self.encoder: Optional[BaseEstimator] = None - self.out_dimensionality = None # type: typing.Optional[int] - self.type_of_target = None # type: typing.Optional[str] + self.out_dimensionality: Optional[int] = None + self.type_of_target: Optional[str] = None - self.logger: typing.Union[ + self.logger: Union[ PicklableClientLogger, logging.Logger ] = logger if logger is not None else logging.getLogger(__name__) # Store the dtype for remapping to correct type - self.dtype = None # type: typing.Optional[type] + self.dtype: Optional[type] = None self._is_fitted = False def fit( self, y_train: SUPPORTED_TARGET_TYPES, - y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + y_test: Optional[SUPPORTED_TARGET_TYPES] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the targets The supported data types are List, numpy arrays and pandas DataFrames. - Arguments: + Args: y_train (SUPPORTED_TARGET_TYPES) A set of targets set aside for training - y_test (typing.Union[SUPPORTED_TARGET_TYPES]) + y_test (Union[SUPPORTED_TARGET_TYPES]) A hold out set of data used of the targets. It is also used to fit the categories of the encoder. """ @@ -95,8 +97,8 @@ def fit( np.shape(y_test) )) if isinstance(y_train, pd.DataFrame): - y_train = typing.cast(pd.DataFrame, y_train) - y_test = typing.cast(pd.DataFrame, y_test) + y_train = cast(pd.DataFrame, y_train) + y_test = cast(pd.DataFrame, y_test) if y_train.columns.tolist() != y_test.columns.tolist(): raise ValueError( "Train and test targets must both have the same columns, yet " @@ -127,24 +129,24 @@ def fit( def _fit( self, y_train: SUPPORTED_TARGET_TYPES, - y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + y_test: Optional[SUPPORTED_TARGET_TYPES] = None, ) -> BaseEstimator: """ - Arguments: + Args: y_train (SUPPORTED_TARGET_TYPES) The labels of the current task. They are going to be encoded in case of classification - y_test (typing.Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SUPPORTED_TARGET_TYPES]) A holdout set of labels """ raise NotImplementedError() def transform( self, - y: typing.Union[SUPPORTED_TARGET_TYPES], + y: Union[SUPPORTED_TARGET_TYPES], ) -> np.ndarray: """ - Arguments: + Args: y (SUPPORTED_TARGET_TYPES) A set of targets that are going to be encoded if the current task is classification @@ -161,8 +163,8 @@ def inverse_transform( """ Revert any encoding transformation done on a target array - Arguments: - y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): + Args: + y (Union[np.ndarray, pd.DataFrame, pd.Series]): Target array to be transformed back to original form before encoding Returns: np.ndarray: diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py index 7528d56ab..4ef54c665 100644 --- a/autoPyTorch/data/base_validator.py +++ b/autoPyTorch/data/base_validator.py @@ -58,7 +58,7 @@ def fit( + Checks for dimensionality as well as missing values are performed. + If performing a classification task, the data is going to be encoded - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks). If this data contains categorical columns, an encoder is going to @@ -102,7 +102,7 @@ def transform( """ Transform the given target or features to a numpy array - Arguments: + Args: X (SUPPORTED_FEAT_TYPES): A set of features to transform y (typing.Optional[SUPPORTED_TARGET_TYPES]): diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index f978046ad..10736d473 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,5 +1,5 @@ import functools -import typing +from typing import Dict, List, Optional, Tuple, cast import numpy as np @@ -9,15 +9,100 @@ import scipy.sparse import sklearn.utils -from sklearn import preprocessing from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer from sklearn.exceptions import NotFittedError +from sklearn.impute import SimpleImputer +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES +def _create_column_transformer( + preprocessors: Dict[str, List[BaseEstimator]], + numerical_columns: List[str], + categorical_columns: List[str], +) -> ColumnTransformer: + """ + Given a dictionary of preprocessors, this function + creates a sklearn column transformer with appropriate + columns associated with their preprocessors. + + Args: + preprocessors (Dict[str, List[BaseEstimator]]): + Dictionary containing list of numerical and categorical preprocessors. + numerical_columns (List[str]): + List of names of numerical columns + categorical_columns (List[str]): + List of names of categorical columns + + Returns: + ColumnTransformer + """ + + numerical_pipeline = 'drop' + categorical_pipeline = 'drop' + if len(numerical_columns) > 0: + numerical_pipeline = make_pipeline(*preprocessors['numerical']) + if len(categorical_columns) > 0: + categorical_pipeline = make_pipeline(*preprocessors['categorical']) + + return ColumnTransformer([ + ('categorical_pipeline', categorical_pipeline, categorical_columns), + ('numerical_pipeline', numerical_pipeline, numerical_columns)], + remainder='drop' + ) + + +def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: + """ + This function creates a Dictionary containing a list + of numerical and categorical preprocessors + + Returns: + Dict[str, List[BaseEstimator]] + """ + preprocessors: Dict[str, List[BaseEstimator]] = dict() + + # Categorical Preprocessors + onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore') + categorical_imputer = SimpleImputer(strategy='constant', copy=False) + + # Numerical Preprocessors + numerical_imputer = SimpleImputer(strategy='median', copy=False) + standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False) + + preprocessors['categorical'] = [categorical_imputer, onehot_encoder] + preprocessors['numerical'] = [numerical_imputer, standard_scaler] + + return preprocessors + + class TabularFeatureValidator(BaseFeatureValidator): + + @staticmethod + def _comparator(cmp1: str, cmp2: str) -> int: + """Order so that categorical columns come left and numerical columns come right + + Args: + cmp1 (str): First variable to compare + cmp2 (str): Second variable to compare + + Raises: + ValueError: if the values of the variables to compare + are not in 'categorical' or 'numerical' + + Returns: + int: either [0, -1, 1] + """ + choices = ['categorical', 'numerical'] + if cmp1 not in choices or cmp2 not in choices: + raise ValueError('The comparator for the column order only accepts {}, ' + 'but got {} and {}'.format(choices, cmp1, cmp2)) + idx1, idx2 = choices.index(cmp1), choices.index(cmp2) + return idx1 - idx2 + def _fit( self, X: SUPPORTED_FEAT_TYPES, @@ -27,7 +112,7 @@ def _fit( features (from categorical for example) to a numerical value that further stages will be able to use - Arguments: + Args: X (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding @@ -39,79 +124,45 @@ def _fit( # The final output of a validator is a numpy array. But pandas # gives us information about the column dtype if isinstance(X, np.ndarray): - X = self.numpy_array_to_pandas(X) + + X = self.numpy_to_pandas(X) + # Replace the data type from the previously saved type. + self.data_type = type(X) + # save all the information about the column order and data types + self._check_data(X) if hasattr(X, "iloc") and not scipy.sparse.issparse(X): - X = typing.cast(pd.DataFrame, X) - # Treat a column with all instances a NaN as numerical - # This will prevent doing encoding to a categorical column made completely - # out of nan values -- which will trigger a fail, as encoding is not supported - # with nan values. - # Columns that are completely made of NaN values are provided to the pipeline - # so that later stages decide how to handle them - - # Clear whatever null column markers we had previously - self.null_columns.clear() - if np.any(pd.isnull(X)): - for column in X.columns: - if X[column].isna().all(): - self.null_columns.add(column) - X[column] = pd.to_numeric(X[column]) - # Also note this change in self.dtypes - if len(self.dtypes) != 0: - self.dtypes[list(X.columns).index(column)] = X[column].dtype - - if not X.select_dtypes(include='object').empty: - X = self.infer_objects(X) - self._check_data(X) - self.enc_columns, self.feat_type = self._get_columns_to_encode(X) - - if len(self.enc_columns) > 0: - X = self.impute_nan_in_categories(X) - - self.encoder = ColumnTransformer( - [ - ("encoder", - preprocessing.OrdinalEncoder( - handle_unknown='use_encoded_value', - unknown_value=-1, - ), self.enc_columns)], - remainder="passthrough" - ) + X = cast(pd.DataFrame, X) - # Mypy redefinition - assert self.encoder is not None - self.encoder.fit(X) - - # The column transformer reoders the feature types - we therefore need to change - # it as well - # This means columns are shifted to the right - def comparator(cmp1: str, cmp2: str) -> int: - if ( - cmp1 == 'categorical' and cmp2 == 'categorical' - or cmp1 == 'numerical' and cmp2 == 'numerical' - ): - return 0 - elif cmp1 == 'categorical' and cmp2 == 'numerical': - return -1 - elif cmp1 == 'numerical' and cmp2 == 'categorical': - return 1 - else: - raise ValueError((cmp1, cmp2)) - - self.feat_type = sorted( - self.feat_type, - key=functools.cmp_to_key(comparator) - ) + self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()]) - self.categories = [ - # We fit an ordinal encoder, where all categorical - # columns are shifted to the left - list(range(len(cat))) - for cat in self.encoder.transformers_[0][1].categories_ - ] + categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) + + self.enc_columns = categorical_columns + + preprocessors = get_tabular_preprocessors() + self.column_transformer = _create_column_transformer( + preprocessors=preprocessors, + numerical_columns=numerical_columns, + categorical_columns=categorical_columns, + ) + + # Mypy redefinition + assert self.column_transformer is not None + self.column_transformer.fit(X) + # The column transformer reorders the feature types + # therefore, we need to change the order of columns as well + # This means categorical columns are shifted to the left + + self.feat_type = sorted( + feat_type, + key=functools.cmp_to_key(self._comparator) + ) + + # differently to categorical_columns and numerical_columns, + # this saves the index of the column. for i, type_ in enumerate(self.feat_type): if 'numerical' in type_: self.numerical_columns.append(i) @@ -120,6 +171,7 @@ def comparator(cmp1: str, cmp2: str) -> int: # Lastly, store the number of features self.num_features = np.shape(X)[1] + return self def transform( @@ -130,7 +182,7 @@ def transform( Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features, whose categorical features are going to be transformed @@ -138,49 +190,70 @@ def transform( Return: np.ndarray: The transformed array + + Note: + The default transform performs the folloing: + * simple imputation for both + * scaling for numerical + * one-hot encoding for categorical + For example, here is a simple case + of which all the columns are categorical. + data = [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'B': 3, 'C': np.nan}, + {'A': 2, 'B': np.nan, 'C': np.nan} + ] + and suppose all the columns are categorical, + then + * `A` in {np.nan, 1, 2} + * `B` in {np.nan, 3} + * `C` in {np.nan} <=== it will be dropped. + + So in the column A, + * np.nan ==> [1, 0, 0] (always the index 0) + * 1 ==> [0, 1, 0] + * 2 ==> [0, 0, 1] + in the column B, + * np.nan ==> [1, 0] + * 3 ==> [0, 1] + Therefore, by concatenating, + * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0] + * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1] + * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0] + ==> [ + [0, 1, 0, 1, 0], + [1, 0, 0, 0, 1], + [0, 0, 1, 1, 0] + ] """ if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") # If a list was provided, it will be converted to pandas if isinstance(X, list): - X, _ = self.list_to_dataframe(X) + X, _ = self.list_to_pandas(X) if isinstance(X, np.ndarray): - X = self.numpy_array_to_pandas(X) + X = self.numpy_to_pandas(X) if hasattr(X, "iloc") and not scipy.sparse.issparse(X): - X = typing.cast(pd.DataFrame, X) - # If we had null columns in our fit call and we made them numeric, then: - # - If the columns are null even in transform, apply the same procedure. - # - Otherwise, substitute the values with np.NaN and then make the columns numeric. - # If the column is null here, but it was not in fit, it does not matter. - for column in self.null_columns: - # The column is not null, make it null since it was null in fit. - if not X[column].isna().all(): - X[column] = np.NaN - X[column] = pd.to_numeric(X[column]) - - # for the test set, if we have columns with only null values - # they will probably have a numeric type. If these columns were not - # with only null values in the train set, they should be converted - # to the type that they had during fitting. - for column in X.columns: - if X[column].isna().all(): - X[column] = X[column].astype(self.dtypes[list(X.columns).index(column)]) - - # Also remove the object dtype for new data - if not X.select_dtypes(include='object').empty: - X = self.infer_objects(X) + X = cast(pd.DataFrame, X) # Check the data here so we catch problems on new test data self._check_data(X) - # We also need to fillna on the transformation - # in case test data is provided - X = self.impute_nan_in_categories(X) - if self.encoder is not None: - X = self.encoder.transform(X) + # in case of test data being all none and train data + # having a value for a categorical column. + # We need to convert the column in test data to + # object otherwise the test column is interpreted as float + if len(self.categorical_columns) > 0: + categorical_columns = self.column_transformer.transformers_[0][-1] + for column in categorical_columns: + if X[column].isna().all(): + X[column] = X[column].astype('object') + + if self.column_transformer is not None: + X = self.column_transformer.transform(X) # Sparse related transformations # Not all sparse format support index sorting @@ -209,7 +282,7 @@ def _check_data( """ Feature dimensionality and data type checks - Arguments: + Args: X (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding @@ -246,125 +319,120 @@ def _check_data( # Then for Pandas, we do not support Nan in categorical columns if hasattr(X, "iloc"): # If entered here, we have a pandas dataframe - X = typing.cast(pd.DataFrame, X) + X = cast(pd.DataFrame, X) # Handle objects if possible - if not X.select_dtypes(include='object').empty: + exist_object_columns = has_object_columns(X.dtypes.values) + if exist_object_columns: X = self.infer_objects(X) - # Define the column to be encoded here as the feature validator is fitted once - # per estimator - enc_columns, _ = self._get_columns_to_encode(X) - column_order = [column for column in X.columns] if len(self.column_order) > 0: if self.column_order != column_order: - raise ValueError("Changing the column order of the features after fit() is " - "not supported. Fit() method was called with " - "{} whereas the new features have {} as type".format(self.column_order, - column_order,) - ) + raise ValueError("The column order of the features must not be changed after fit(), but" + " the column order are different between training ({}) and" + " test ({}) datasets.".format(self.column_order, column_order)) else: self.column_order = column_order dtypes = [dtype.name for dtype in X.dtypes] - if len(self.dtypes) > 0: - if self.dtypes != dtypes: - raise ValueError("Changing the dtype of the features after fit() is " - "not supported. Fit() method was called with " - "{} whereas the new features have {} as type".format(self.dtypes, - dtypes, - ) - ) - else: - self.dtypes = dtypes - def _get_columns_to_encode( + dtypes_diff = [s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)] + if len(self.dtypes) == 0: + self.dtypes = dtypes + elif ( + any(dtypes_diff) # the dtypes of some columns are different in train and test dataset + and self.all_nan_columns is not None # Ignore all_nan_columns is None + and len(set(X.columns[dtypes_diff]).difference(self.all_nan_columns)) != 0 + ): + # The dtypes can be different if and only if the column belongs + # to all_nan_columns as these columns would be imputed. + raise ValueError("The dtype of the features must not be changed after fit(), but" + " the dtypes of some columns are different between training ({}) and" + " test ({}) datasets.".format(self.dtypes, dtypes)) + + def _get_columns_info( self, X: pd.DataFrame, - ) -> typing.Tuple[typing.List[str], typing.List[str]]: + ) -> Tuple[List[str], List[str], List[str]]: """ Return the columns to be encoded from a pandas dataframe - Arguments: + Args: X (pd.DataFrame) A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding Returns: - enc_columns (List[str]): - Columns to encode, if any - feat_type: + categorical_columns (List[str]) + List of the names of categorical columns. + numerical_columns (List[str]) + List of the names of numerical columns. + feat_type (List[str]) Type of each column numerical/categorical """ # Register if a column needs encoding - enc_columns = [] - + numerical_columns = [] + categorical_columns = [] # Also, register the feature types for the estimator feat_type = [] # Make sure each column is a valid type - for column in X.columns: - if X[column].dtype.name in ['category', 'bool']: - - enc_columns.append(column) + for i, column in enumerate(X.columns): + if self.all_nan_columns is not None and column in self.all_nan_columns: + continue + column_dtype = self.dtypes[i] + err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \ + "but input column {} has an invalid type `{}`.".format(column, column_dtype) + if column_dtype in ['category', 'bool']: + categorical_columns.append(column) feat_type.append('categorical') # Move away from np.issubdtype as it causes # TypeError: data type not understood in certain pandas types - elif not is_numeric_dtype(X[column]): - if X[column].dtype.name == 'object': - raise ValueError( - "Input Column {} has invalid type object. " - "Cast it to a valid dtype before using it in AutoPyTorch. " - "Valid types are numerical, categorical or boolean. " - "You can cast it to a valid dtype using " - "pandas.Series.astype ." - "If working with string objects, the following " - "tutorial illustrates how to work with text data: " - "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( - # noqa: E501 - column, - ) - ) - elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype( - X[column].dtype - ): - raise ValueError( - "AutoPyTorch does not support time and/or date datatype as given " - "in column {}. Please convert the time information to a numerical value " - "first. One example on how to do this can be found on " - "https://stats.stackexchange.com/questions/311494/".format( - column, - ) - ) - else: - raise ValueError( - "Input Column {} has unsupported dtype {}. " - "Supported column types are categorical/bool/numerical dtypes. " - "Make sure your data is formatted in a correct way, " - "before feeding it to AutoPyTorch.".format( - column, - X[column].dtype.name, - ) + elif is_numeric_dtype(column_dtype): + feat_type.append('numerical') + numerical_columns.append(column) + elif column_dtype == 'object': + # TODO verify how would this happen when we always convert the object dtypes to category + raise TypeError( + "{} Cast it to a valid dtype before feeding it to AutoPyTorch. " + "You can cast it to a valid dtype using pandas.Series.astype." + "If you are working with string objects, the following " + "tutorial illustrates how to work with text data: " + "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( + # noqa: E501 + err_msg, ) + ) + elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(column_dtype): + raise TypeError( + "{} Convert the time information to a numerical value" + " before feeding it to AutoPyTorch. " + "One example of the conversion can be found on " + "https://stats.stackexchange.com/questions/311494/".format(err_msg) + ) else: - feat_type.append('numerical') - return enc_columns, feat_type + raise TypeError( + "{} Make sure your data is formatted in a correct way" + "before feeding it to AutoPyTorch.".format(err_msg) + ) + + return categorical_columns, numerical_columns, feat_type - def list_to_dataframe( + def list_to_pandas( self, X_train: SUPPORTED_FEAT_TYPES, - X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, - ) -> typing.Tuple[pd.DataFrame, typing.Optional[pd.DataFrame]]: + X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: """ Converts a list to a pandas DataFrame. In this process, column types are inferred. If test data is provided, we proactively match it to train data - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (typing.Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SUPPORTED_FEAT_TYPES]): A hold out set of data used for checking Returns: pd.DataFrame: @@ -374,7 +442,7 @@ def list_to_dataframe( """ # If a list was provided, it will be converted to pandas - X_train = pd.DataFrame(data=X_train).infer_objects() + X_train = pd.DataFrame(data=X_train).convert_dtypes() self.logger.warning("The provided feature types to AutoPyTorch are of type list." "Features have been interpreted as: {}".format([(col, t) for col, t in zip(X_train.columns, X_train.dtypes)])) @@ -383,17 +451,18 @@ def list_to_dataframe( self.logger.warning("Train features are a list while the provided test data" "is {}. X_test will be casted as DataFrame.".format(type(X_test)) ) - X_test = pd.DataFrame(data=X_test).infer_objects() + X_test = pd.DataFrame(data=X_test).convert_dtypes() + return X_train, X_test - def numpy_array_to_pandas( - self, + @staticmethod + def numpy_to_pandas( X: np.ndarray, ) -> pd.DataFrame: """ Converts a numpy array to pandas for type inference - Arguments: + Args: X (np.ndarray): data to be interpreted. @@ -408,7 +477,7 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: This has to be done once, so the test and train data are treated equally - Arguments: + Args: X (pd.DataFrame): data to be interpreted. @@ -426,66 +495,33 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}') pass else: + # Calling for the first time to infer the categories X = X.infer_objects() - for column in X.columns: - if not is_numeric_dtype(X[column]): + for column, data_type in zip(X.columns, X.dtypes): + if not is_numeric_dtype(data_type): X[column] = X[column].astype('category') - self.object_dtype_mapping = {column: X[column].dtype for column in X.columns} + + # only numerical attributes and categories + self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)} + self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}") + return X - def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame: - """ - impute missing values before encoding, - remove once sklearn natively supports - it in ordinal encoding. Sklearn issue: - "https://github.com/scikit-learn/scikit-learn/issues/17123)" - Arguments: - X (pd.DataFrame): - data to be interpreted. +def has_object_columns( + feature_types: pd.Series, +) -> bool: + """ + Indicate whether on a Series of dtypes for a Pandas DataFrame + there exists one or more object columns. - Returns: - pd.DataFrame - """ + Args: + feature_types (pd.Series): The feature types for a DataFrame. - # To be on the safe side, map always to the same missing - # value per column - if not hasattr(self, 'dict_nancol_to_missing'): - self.dict_missing_value_per_col: typing.Dict[str, typing.Any] = {} - - # First make sure that we do not alter the type of the column which cause: - # TypeError: '<' not supported between instances of 'int' and 'str' - # in the encoding - for column in self.enc_columns: - if X[column].isna().any(): - if column not in self.dict_missing_value_per_col: - try: - float(X[column].dropna().values[0]) - can_cast_as_number = True - except Exception: - can_cast_as_number = False - if can_cast_as_number: - # In this case, we expect to have a number as category - # it might be string, but its value represent a number - missing_value: typing.Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], - str) else -1 - else: - missing_value = 'Missing!' - - # Make sure this missing value is not seen before - # Do this check for categorical columns - # else modify the value - if hasattr(X[column], 'cat'): - while missing_value in X[column].cat.categories: - if isinstance(missing_value, str): - missing_value += '0' - else: - missing_value += missing_value - self.dict_missing_value_per_col[column] = missing_value - - # Convert the frame in place - X[column].cat.add_categories([self.dict_missing_value_per_col[column]], - inplace=True) - X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True) - return X + Returns: + bool: + True if the DataFrame dtypes contain an object column, False + otherwise. + """ + return np.dtype('O') in feature_types diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index 239791768..7cbd88c38 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -28,7 +28,7 @@ def _fit( It does so by also using the classes from the test data, to prevent encoding errors - Arguments: + Args: y_train (SUPPORTED_TARGET_TYPES) The labels of the current task. They are going to be encoded in case of classification @@ -100,7 +100,7 @@ def transform( Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. - Arguments: + Args: y (SUPPORTED_TARGET_TYPES) A set of targets that are going to be encoded if the current task is classification @@ -152,7 +152,7 @@ def inverse_transform( """ Revert any encoding transformation done on a target array - Arguments: + Args: y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): Target array to be transformed back to original form before encoding Returns: @@ -189,7 +189,7 @@ def _check_data( """ Perform dimensionality and data type checks on the targets - Arguments: + Args: y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): A set of features whose dimensionality and data type is going to be checked """ diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 9f9b95369..f041be5ec 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -330,13 +330,22 @@ def get_dataset_for_training(self, split_id: int, train: bool) -> Dataset: to provide training data to fit a pipeline Args: - split (int): The desired subset of the dataset to split and use + split_id (int): which split id to get from the splits + train (bool): whether the dataset is required for training or evaluating. Returns: + Dataset: the reduced dataset to be used for testing """ # Subset creates a dataset. Splits is a (train_indices, test_indices) tuple - return TransformSubset(self, self.splits[split_id][0], train=train) + if split_id >= len(self.splits): # old version: split_id > len(self.splits) + raise IndexError("split_id out of range, got split_id={}" + " (>= num_splits={})".format(split_id, len(self.splits))) + subset = int(not train) + indices = self.splits[split_id][subset] + if indices is None: + raise ValueError("Specified fold (or subset) does not exist") + return TransformSubset(self, indices, train=train) def replace_data(self, X_train: BaseDatasetInputType, X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset': diff --git a/autoPyTorch/ensemble/singlebest_ensemble.py b/autoPyTorch/ensemble/singlebest_ensemble.py index c6fbaf576..6f82cbdf4 100644 --- a/autoPyTorch/ensemble/singlebest_ensemble.py +++ b/autoPyTorch/ensemble/singlebest_ensemble.py @@ -3,7 +3,7 @@ import numpy as np -from smac.runhistory.runhistory import RunHistory +from smac.runhistory.runhistory import RunHistory, StatusType from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.pipeline.base_pipeline import BasePipeline @@ -49,6 +49,9 @@ def get_identifiers_from_run_history(self) -> List[Tuple[int, int, float]]: for run_key in self.run_history.data.keys(): run_value = self.run_history.data[run_key] + if run_value.status == StatusType.CRASHED: + continue + score = self.metric._optimum - (self.metric._sign * run_value.cost) if (score > best_model_score and self.metric._sign > 0) \ diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 842f63271..d98be9bd4 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -21,7 +21,9 @@ get_match_array ) from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates +from autoPyTorch.utils.hyperparameter_search_space_update import ( + HyperparameterSearchSpaceUpdates +) class BasePipeline(Pipeline): @@ -398,6 +400,7 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], raise ValueError("Unknown node name. Expected update node name to be in {} " "got {}".format(self.named_steps.keys(), update.node_name)) node = self.named_steps[update.node_name] + node_name = node.__class__.__name__ # if node is a choice module if hasattr(node, 'get_components'): split_hyperparameter = update.hyperparameter.split(':') @@ -425,18 +428,18 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], if choice in exclude[update.node_name]: raise ValueError("Found {} in exclude".format(choice)) if choice not in components.keys(): - raise ValueError("Unknown hyperparameter for choice {}. " + raise ValueError("Unknown component choice for node {}. " "Expected update hyperparameter " - "to be in {} got {}".format(node.__class__.__name__, - components.keys(), choice)) + "to be in {}, but got {}".format(node_name, + components.keys(), choice)) # check if the component whose hyperparameter # needs to be updated is in components of the # choice module elif split_hyperparameter[0] not in components.keys(): - raise ValueError("Unknown hyperparameter for choice {}. " - "Expected update hyperparameter " - "to be in {} got {}".format(node.__class__.__name__, - components.keys(), split_hyperparameter[0])) + raise ValueError("Unknown component choice for node {}. " + "Expected update component " + "to be in {}, but got {}".format(node_name, + components.keys(), split_hyperparameter[0])) else: # check if hyperparameter is in the search space of the component component = components[split_hyperparameter[0]] @@ -449,14 +452,16 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], component.get_hyperparameter_search_space( dataset_properties=self.dataset_properties).get_hyperparameter_names()]): continue - raise ValueError("Unknown hyperparameter for component {}. " - "Expected update hyperparameter " - "to be in {} got {}".format(node.__class__.__name__, - component. - get_hyperparameter_search_space( - dataset_properties=self.dataset_properties). - get_hyperparameter_names(), - split_hyperparameter[1])) + component_hyperparameters = component.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties).get_hyperparameter_names() + raise ValueError("Unknown hyperparameter for component {} of node {}." + " Expected update hyperparameter " + "to be in {}, but got {}.".format(component.__name__, + node_name, + component_hyperparameters, + split_hyperparameter[1] + ) + ) else: if update.hyperparameter not in node.get_hyperparameter_search_space( dataset_properties=self.dataset_properties): @@ -464,13 +469,13 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], node.get_hyperparameter_search_space( dataset_properties=self.dataset_properties).get_hyperparameter_names()]): continue - raise ValueError("Unknown hyperparameter for component {}. " + node_hyperparameters = node.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties).get_hyperparameter_names() + raise ValueError("Unknown hyperparameter for node {}. " "Expected update hyperparameter " - "to be in {} got {}".format(node.__class__.__name__, - node. - get_hyperparameter_search_space( - dataset_properties=self.dataset_properties). - get_hyperparameter_names(), update.hyperparameter)) + "to be in {}, but got {}".format(node_name, + node_hyperparameters, + update.hyperparameter)) def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]] ) -> List[Tuple[str, autoPyTorchChoice]]: diff --git a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py index 4327d6346..a3be8fa79 100644 --- a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py +++ b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py @@ -2,7 +2,7 @@ import numpy as np -import torch.tensor +import torch from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise.base_normalizer import BaseNormalizer @@ -30,16 +30,16 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> "ImageNormalizer": self.std = X['dataset_properties']['std'] return self - def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]: + def __call__(self, X: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: """ Makes the autoPyTorchPreprocessingComponent Callable. Calling the component calls the transform function of the underlying early_preprocessor and returns the transformed array. Args: - X (Union[np.ndarray, torch.tensor]): input data tensor + X (Union[np.ndarray, torch.Tensor]): input data tensor Returns: - Union[np.ndarray, torch.tensor]: Transformed data tensor + Union[np.ndarray, torch.Tensor]: Transformed data tensor """ X = (X - self.mean) / self.std return X diff --git a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py index 7aeb83a9c..b36a50f4e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py +++ b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py @@ -2,7 +2,7 @@ import numpy as np -import torch.tensor +import torch from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise.base_normalizer import ( BaseNormalizer @@ -34,16 +34,16 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'normalise': self}) return X - def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]: + def __call__(self, X: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: """ Makes the autoPyTorchPreprocessingComponent Callable. Calling the component calls the transform function of the underlying early_preprocessor and returns the transformed array. Args: - X (Union[np.ndarray, torch.tensor]): input data tensor + X (Union[np.ndarray, torch.Tensor]): input data tensor Returns: - Union[np.ndarray, torch.tensor]: Transformed data tensor + Union[np.ndarray, torch.Tensor]: Transformed data tensor """ return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index e1e08e94e..e513b8729 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -3,14 +3,14 @@ import numpy as np from sklearn.compose import ColumnTransformer -from sklearn.pipeline import make_pipeline +# from sklearn.pipeline import make_pipeline import torch from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import ( autoPyTorchTabularPreprocessingComponent ) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers +# from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers from autoPyTorch.utils.common import FitRequirement, subsampler @@ -47,15 +47,16 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": Returns: "TabularColumnTransformer": an instance of self """ + self.check_requirements(X, y) - numerical_pipeline = 'drop' - categorical_pipeline = 'drop' + numerical_pipeline = 'passthrough' + categorical_pipeline = 'passthrough' - preprocessors = get_tabular_preprocessers(X) - if len(X['dataset_properties']['numerical_columns']): - numerical_pipeline = make_pipeline(*preprocessors['numerical']) - if len(X['dataset_properties']['categorical_columns']): - categorical_pipeline = make_pipeline(*preprocessors['categorical']) + # preprocessors = get_tabular_preprocessers(X) + # if len(X['dataset_properties']['numerical_columns']): + # numerical_pipeline = make_pipeline(*preprocessors['numerical']) + # if len(X['dataset_properties']['categorical_columns']): + # categorical_pipeline = make_pipeline(*preprocessors['categorical']) self.preprocessor = ColumnTransformer([ ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']), @@ -71,6 +72,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": X_train = X['backend'].load_datamanager().train_tensors[0] self.preprocessor.fit(X_train) + return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py index eadc0a188..9829cadcd 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py @@ -28,5 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: raise ValueError("cant call transform on {} without fitting first." .format(self.__class__.__name__)) - X.update({'encoder': self.preprocessor}) + # X.update({'encoder': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py index b65f3c229..ac0648481 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py @@ -29,5 +29,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: raise ValueError("cant call transform on {} without fitting first." .format(self.__class__.__name__)) - X.update({'imputer': self.preprocessor}) + # X.update({'imputer': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py index 39834dd2b..270fac246 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py @@ -28,5 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: raise ValueError("cant call transform on {} without fitting first." .format(self.__class__.__name__)) - X.update({'scaler': self.preprocessor}) + # X.update({'scaler': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index c58f25283..512c8360e 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -131,13 +131,15 @@ def _predict(self, network: torch.nn.Module, loader: torch.utils.data.DataLoader # Batch prediction Y_batch_preds = list() - for i, (X_batch, Y_batch) in enumerate(loader): - # Predict on batch - X_batch = X_batch.float().to(self.device) - Y_batch_pred = network(X_batch) - if self.final_activation is not None: - Y_batch_pred = self.final_activation(Y_batch_pred) - Y_batch_preds.append(Y_batch_pred.detach().cpu()) + # `torch.no_grad` reduces memory usage even after `model.eval()` + with torch.no_grad(): + for i, (X_batch, Y_batch) in enumerate(loader): + # Predict on batch + X_batch = X_batch.float().to(self.device) + Y_batch_pred = network(X_batch) + if self.final_activation is not None: + Y_batch_pred = self.final_activation(Y_batch_pred) + Y_batch_preds.append(Y_batch_pred.detach().cpu()) return torch.cat(Y_batch_preds, 0) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py index 7a3350ca8..51269a254 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py @@ -139,6 +139,14 @@ def get_hyperparameter_search_space( value_range=(True, False), default_value=True, ), + shake_shake_update_func: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="shake_shake_update_func", + value_range=('shake-shake', + 'shake-even', + 'even-even', + 'M3'), + default_value='shake-shake', + ), use_shake_drop: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_shake_drop", value_range=(True, False), default_value=True, @@ -180,16 +188,25 @@ def get_hyperparameter_search_space( if skip_connection_flag: + shake_shake_flag = 'shake-shake' in multi_branch_choice.value_range shake_drop_prob_flag = 'shake-drop' in multi_branch_choice.value_range mb_choice = get_hyperparameter(multi_branch_choice, CategoricalHyperparameter) cs.add_hyperparameter(mb_choice) cs.add_condition(CS.EqualsCondition(mb_choice, use_sc, True)) + shake_shake_update_func_conditional: List[str] = list() if shake_drop_prob_flag: shake_drop_prob = get_hyperparameter(max_shake_drop_probability, UniformFloatHyperparameter) cs.add_hyperparameter(shake_drop_prob) cs.add_condition(CS.EqualsCondition(shake_drop_prob, mb_choice, "shake-drop")) + shake_shake_update_func_conditional.append('shake-drop') + if shake_shake_flag: + shake_shake_update_func_conditional.append('shake-shake') + if len(shake_shake_update_func_conditional) > 0: + method = get_hyperparameter(shake_shake_update_func, CategoricalHyperparameter) + cs.add_hyperparameter(method) + cs.add_condition(CS.InCondition(method, mb_choice, shake_shake_update_func_conditional)) # It is the upper bound of the nr of groups, # since the configuration will actually be sampled. @@ -258,7 +275,7 @@ def __init__( # if in != out the shortcut needs a linear layer to match the result dimensions # if the shortcut needs a layer we apply batchnorm and activation to the shortcut # as well (start_norm) - if in_features != out_features: + if in_features != out_features and self.config["use_skip_connection"]: self.shortcut = nn.Linear(in_features, out_features) initial_normalization = list() if self.config['use_batch_norm']: @@ -288,13 +305,6 @@ def _build_block(self, in_features: int, out_features: int) -> nn.Module: if self.config['use_batch_norm']: layers.append(nn.BatchNorm1d(in_features)) layers.append(self.activation()) - elif not self.config['use_skip_connection']: - # if start norm is not None and skip connection is False - # we will never apply the start_norm for the first layer in the block, - # which is why we should account for this case. - if self.config['use_batch_norm']: - layers.append(nn.BatchNorm1d(in_features)) - layers.append(self.activation()) layers.append(nn.Linear(in_features, out_features)) @@ -310,9 +320,6 @@ def _build_block(self, in_features: int, out_features: int) -> nn.Module: def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: - if self.config["use_skip_connection"]: - residual = x - # if shortcut is not none we need a layer such that x matches the output dimension if self.shortcut is not None and self.start_norm is not None: # in this case self.start_norm is also != none @@ -320,38 +327,42 @@ def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: # in front of shortcut and layers. Note that in this case layers # does not start with batchnorm+activation but with the first linear layer # (see _build_block). As a result if in_features == out_features - # -> result = x + W(~D(A(BN(W(A(BN(x)))))) + # -> result = x + W_2(~D(A(BN(W_1(A(BN(x)))))) # if in_features != out_features # -> result = W_shortcut(A(BN(x))) + W_2(~D(A(BN(W_1(A(BN(x)))))) x = self.start_norm(x) - if self.config["use_skip_connection"]: - residual = self.shortcut(x) - - # TODO make the below code better - if self.config["use_skip_connection"]: - if self.config["multi_branch_choice"] == 'shake-shake': - x1 = self.layers(x) - x2 = self.shake_shake_layers(x) - alpha, beta = shake_get_alpha_beta(self.training, x.is_cuda) - x = shake_shake(x1, x2, alpha, beta) - else: - x = self.layers(x) + residual = self.shortcut(x) + elif self.config["use_skip_connection"]: + # We use a skip connection but we do not need to match dimensions + residual = x + else: # Early-return because no need of skip connection + return self.layers(x) + + if self.config["multi_branch_choice"] == 'shake-shake': + x1 = self.layers(x) + x2 = self.shake_shake_layers(x) + alpha, beta = shake_get_alpha_beta( + is_training=self.training, + is_cuda=x.is_cuda, + method=self.config['shake_shake_update_func'], + ) + x = shake_shake(x1, x2, alpha, beta) + elif self.config["multi_branch_choice"] == 'shake-drop': + x = self.layers(x) + alpha, beta = shake_get_alpha_beta( + is_training=self.training, + is_cuda=x.is_cuda, + method=self.config['shake_shake_update_func'], + ) + bl = shake_drop_get_bl( + self.block_index, + 1 - self.config["max_shake_drop_probability"], + self.num_blocks, + self.training, + x.is_cuda, + ) + x = shake_drop(x, alpha, beta, bl) else: x = self.layers(x) - if self.config["use_skip_connection"]: - if self.config["multi_branch_choice"] == 'shake-drop': - alpha, beta = shake_get_alpha_beta(self.training, x.is_cuda) - bl = shake_drop_get_bl( - self.block_index, - 1 - self.config["max_shake_drop_probability"], - self.num_blocks, - self.training, - x.is_cuda, - ) - x = shake_drop(x, alpha, beta, bl) - - if self.config["use_skip_connection"]: - x = x + residual - - return x + return x + residual diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py index 1a37482d7..72046a9b7 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py @@ -72,6 +72,7 @@ def build_backbone(self, input_shape: Tuple[int, ...]) -> None: ) if self.config['use_batch_norm']: layers.append(torch.nn.BatchNorm1d(self.config["num_units_%i" % self.config['num_groups']])) + layers.append(_activations[self.config["activation"]]()) backbone = torch.nn.Sequential(*layers) self.backbone = backbone return backbone @@ -144,6 +145,14 @@ def get_hyperparameter_search_space( # type: ignore[override] 'stairs'), default_value='funnel', ), + shake_shake_update_func: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="shake_shake_update_func", + value_range=('shake-shake', + 'shake-even', + 'even-even', + 'M3'), + default_value='shake-shake', + ), max_shake_drop_probability: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="max_shake_drop_probability", value_range=(0, 1), @@ -187,17 +196,24 @@ def get_hyperparameter_search_space( # type: ignore[override] if skip_connection_flag: - shake_drop_prob_flag = False - if 'shake-drop' in multi_branch_choice.value_range: - shake_drop_prob_flag = True + shake_shake_flag = 'shake-shake' in multi_branch_choice.value_range + shake_drop_prob_flag = 'shake-drop' in multi_branch_choice.value_range mb_choice = get_hyperparameter(multi_branch_choice, CategoricalHyperparameter) cs.add_hyperparameter(mb_choice) cs.add_condition(CS.EqualsCondition(mb_choice, use_sc, True)) + shake_shake_update_func_conditional: List[str] = list() if shake_drop_prob_flag: shake_drop_prob = get_hyperparameter(max_shake_drop_probability, UniformFloatHyperparameter) cs.add_hyperparameter(shake_drop_prob) cs.add_condition(CS.EqualsCondition(shake_drop_prob, mb_choice, "shake-drop")) + shake_shake_update_func_conditional.append('shake-drop') + if shake_shake_flag: + shake_shake_update_func_conditional.append('shake-shake') + if len(shake_shake_update_func_conditional) > 0: + method = get_hyperparameter(shake_shake_update_func, CategoricalHyperparameter) + cs.add_hyperparameter(method) + cs.add_condition(CS.InCondition(method, mb_choice, shake_shake_update_func_conditional)) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index f7c682ef9..70cab66fc 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -28,6 +28,7 @@ def get_output_shape(network: torch.nn.Module, input_shape: typing.Tuple[int, .. placeholder = torch.randn((2, *input_shape), dtype=torch.float) with torch.no_grad(): output = network(placeholder) + return tuple(output.shape[1:]) @@ -44,8 +45,8 @@ def forward( ctx: typing.Any, # No typing for AutogradContext x1: torch.Tensor, x2: torch.Tensor, - alpha: torch.tensor, - beta: torch.tensor, + alpha: torch.Tensor, + beta: torch.Tensor, ) -> torch.Tensor: ctx.save_for_backward(x1, x2, alpha, beta) @@ -85,10 +86,10 @@ class ShakeDropFunction(Function): """ @staticmethod def forward(ctx: typing.Any, - x: torch.tensor, - alpha: torch.tensor, - beta: torch.tensor, - bl: torch.tensor, + x: torch.Tensor, + alpha: torch.Tensor, + beta: torch.Tensor, + bl: torch.Tensor, ) -> torch.Tensor: ctx.save_for_backward(x, alpha, beta, bl) @@ -111,15 +112,53 @@ def backward(ctx: typing.Any, shake_drop = ShakeDropFunction.apply -def shake_get_alpha_beta(is_training: bool, is_cuda: bool - ) -> typing.Tuple[torch.tensor, torch.tensor]: - if is_training: +def shake_get_alpha_beta( + is_training: bool, + is_cuda: bool, + method: str +) -> typing.Tuple[torch.Tensor, torch.Tensor]: + """ + The methods used in this function have been introduced in 'ShakeShake Regularisation' + Each method name is available in the referred paper. + Currently, this function supports `even-even`, `shake-even`, `shake-shake` and `M3`. + + Args: + is_training (bool): Whether the computation for the training + is_cuda (bool): Whether the tensor is on CUDA + method (str): The shake method either `even-even`, `shake-even`, `shake-shake` or `M3` + + Returns: + alpha, beta (Tuple[float, float]): + alpha (in [0, 1]) is the weight coefficient for the forward pass + beta (in [0, 1]) is the weight coefficient for the backward pass + + Reference: + Title: Shake-shake regularization + Author: Xavier Gastaldi + URL: https://arxiv.org/abs/1705.07485 + + The names have been taken from the paper as well. + Currently, this function supports `even-even`, `shake-even`, `shake-shake` and `M3`. + """ + if not is_training: result = (torch.FloatTensor([0.5]), torch.FloatTensor([0.5])) return result if not is_cuda else (result[0].cuda(), result[1].cuda()) # TODO implement other update methods - alpha = torch.rand(1) - beta = torch.rand(1) + # alpha is the weight ratio for the forward pass and beta is that for the backward pass + alpha = torch.FloatTensor([0.5]) if method.startswith('even') else torch.rand(1) + if method.endswith('even'): + beta = torch.FloatTensor([0.5]) + elif method.endswith('shake'): + beta = torch.rand(1) + elif method == 'M3': + # Table 4 in the paper `Shake-Shake regularization` + rnd = torch.rand(1) + beta = torch.FloatTensor( + [rnd * (0.5 - alpha) + alpha if alpha < 0.5 else rnd * (alpha - 0.5) + 0.5] + ) + else: + raise ValueError(f"Unknown method `{method}` for ShakeShakeRegularisation in NetworkBackbone") if is_cuda: alpha = alpha.cuda() @@ -129,19 +168,37 @@ def shake_get_alpha_beta(is_training: bool, is_cuda: bool def shake_drop_get_bl( - block_index: int, - min_prob_no_shake: float, - num_blocks: int, - is_training: bool, - is_cuda: bool -) -> torch.tensor: + block_index: int, + min_prob_no_shake: float, + num_blocks: int, + is_training: bool, + is_cuda: bool +) -> torch.Tensor: + """ + The sampling of Bernoulli random variable + based on Eq. (4) in the paper + Args: + block_index (int): The index of the block from the input layer + min_prob_no_shake (float): The initial shake probability + num_blocks (int): The total number of building blocks + is_training (bool): Whether it is training + is_cuda (bool): Whether the tensor is on CUDA + + Returns: + bl (torch.Tensor): a Bernoulli random variable in {0, 1} + Reference: + ShakeDrop Regularization for Deep Residual Learning + Yoshihiro Yamada et. al. (2020) + paper: https://arxiv.org/pdf/1802.02375.pdf + implementation: https://github.com/imenurok/ShakeDrop + """ pl = 1 - ((block_index + 1) / num_blocks) * (1 - min_prob_no_shake) - if not is_training: - # Move to torch.randn(1) for reproducibility - bl = torch.tensor(1.0) if torch.randn(1) <= pl else torch.tensor(0.0) if is_training: - bl = torch.tensor(pl) + # Move to torch.randn(1) for reproducibility + bl = torch.as_tensor(1.0) if torch.rand(1) <= pl else torch.as_tensor(0.0) + else: + bl = torch.as_tensor(pl) if is_cuda: bl = bl.cuda() diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 5ae2880ed..6feac0fba 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,4 +1,4 @@ -import copy +# import copy from typing import Any, Dict, Optional, Tuple import numpy as np @@ -31,21 +31,22 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: raise NotImplementedError - def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: + def _get_args(self, X: Dict[str, Any]) -> Tuple[None, None]: # Tuple[int, np.ndarray]: # Feature preprocessors can alter numerical columns - if len(X['dataset_properties']['numerical_columns']) == 0: - num_numerical_columns = 0 - else: - X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - - numerical_column_transformer = X['tabular_transformer'].preprocessor. \ - named_transformers_['numerical_pipeline'] - num_numerical_columns = numerical_column_transformer.transform( - X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] - num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), - dtype=int) - categories = X['dataset_properties']['categories'] - - for i, category in enumerate(categories): - num_input_features[num_numerical_columns + i, ] = len(category) - return num_numerical_columns, num_input_features + # if len(X['dataset_properties']['numerical_columns']) == 0: + # num_numerical_columns = 0 + # else: + # X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) + # + # numerical_column_transformer = X['tabular_transformer'].preprocessor. \ + # named_transformers_['numerical_pipeline'] + # num_numerical_columns = numerical_column_transformer.transform( + # X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + # num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), + # dtype=int) + # categories = X['dataset_properties']['categories'] + # + # for i, category in enumerate(categories): + # num_input_features[num_numerical_columns + i, ] = len(category) + # return num_numerical_columns, num_input_features + return None, None diff --git a/autoPyTorch/pipeline/components/setup/network_head/no_head.py b/autoPyTorch/pipeline/components/setup/network_head/no_head.py index 870f680fb..0e711f06c 100644 --- a/autoPyTorch/pipeline/components/setup/network_head/no_head.py +++ b/autoPyTorch/pipeline/components/setup/network_head/no_head.py @@ -23,7 +23,6 @@ def build_head(self, input_shape: Tuple[int, ...], output_shape: Tuple[int, ...] layers = [] in_features = np.prod(input_shape).item() out_features = np.prod(output_shape).item() - layers.append(_activations[self.config["activation"]]()) layers.append(nn.Linear(in_features=in_features, out_features=out_features)) return nn.Sequential(*layers) diff --git a/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py b/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py index 4d11c3026..a415ff1c6 100644 --- a/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py +++ b/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py @@ -95,9 +95,9 @@ def get_hyperparameter_search_space( default_value=True, ), weight_decay: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="weight_decay", - value_range=(1E-7, 0.1), + value_range=(1E-5, 0.1), default_value=1E-4, - log=True), + log=False), ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index 21a3b9022..f568c3dc4 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -115,7 +115,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: shuffle=True, num_workers=X.get('num_workers', 0), pin_memory=X.get('pin_memory', True), - drop_last=X.get('drop_last', True), + drop_last=X.get('drop_last', False), collate_fn=custom_collate_fn, ) @@ -148,6 +148,8 @@ def get_loader(self, X: np.ndarray, y: Optional[np.ndarray] = None, batch_size: dataset = BaseDataset( train_tensors=(X, y), # This dataset is used for loading test data in a batched format + seed=self.random_state.get_state()[1][0], + shuffle=False, train_transforms=self.test_transform, val_transforms=self.test_transform, ) diff --git a/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py b/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py index a24aac328..a95832ab1 100644 --- a/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py @@ -31,7 +31,7 @@ class AdversarialTrainer(BaseTrainerComponent): def __init__( self, epsilon: float, - weighted_loss: bool = False, + weighted_loss: int = 0, random_state: Optional[np.random.RandomState] = None, use_stochastic_weight_averaging: bool = False, use_snapshot_ensemble: bool = False, @@ -157,7 +157,7 @@ def get_properties(dataset_properties: Optional[Dict[str, Any]] = None 'shortname': 'AdversarialTrainer', 'name': 'AdversarialTrainer', 'handles_tabular': True, - 'handles_image': False, + 'handles_image': True, 'handles_time_series': False, } @@ -166,8 +166,8 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict] = None, weighted_loss: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="weighted_loss", - value_range=(True, False), - default_value=True), + value_range=(1, ), + default_value=1), la_steps: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="la_steps", value_range=(5, 10), @@ -192,16 +192,21 @@ def get_hyperparameter_search_space( default_value=True), se_lastk: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="se_lastk", - value_range=(3,), + value_range=(3, ), default_value=3), epsilon: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="epsilon", - value_range=(0.05, 0.2), - default_value=0.2), + value_range=(0.001, 0.15), + default_value=0.007, + log=True), ) -> ConfigurationSpace: cs = ConfigurationSpace() + epsilon = HyperparameterSearchSpace(hyperparameter="epsilon", + value_range=(0.007, 0.007), + default_value=0.007) add_hyperparameter(cs, epsilon, UniformFloatHyperparameter) + add_hyperparameter(cs, use_stochastic_weight_averaging, CategoricalHyperparameter) snapshot_ensemble_flag = any(use_snapshot_ensemble.value_range) @@ -229,9 +234,17 @@ def get_hyperparameter_search_space( parent_hyperparameter=parent_hyperparameter ) + """ # TODO, decouple the weighted loss from the trainer if dataset_properties is not None: if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS: add_hyperparameter(cs, weighted_loss, CategoricalHyperparameter) + """ + # TODO, decouple the weighted loss from the trainer. Uncomment the code above and + # remove the code below. Also update the method signature, so the weighted loss + # is not a constant. + if dataset_properties is not None: + if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS: + add_hyperparameter(cs, weighted_loss, Constant) return cs diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index e36faf121..3f7866f3c 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -1,4 +1,4 @@ -import typing +from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -11,7 +11,7 @@ class RowCutMixTrainer(MixUp, BaseTrainerComponent): def data_preparation(self, X: np.ndarray, y: np.ndarray, - ) -> typing.Tuple[np.ndarray, typing.Dict[str, np.ndarray]]: + ) -> Tuple[np.ndarray, Dict[str, np.ndarray]]: """ Depending on the trainer choice, data fed to the network might be pre-processed on a different way. That is, in standard training we provide the data to the @@ -26,39 +26,38 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, np.ndarray: that processes data typing.Dict[str, np.ndarray]: arguments to the criterion function """ - alpha, beta = 1.0, 1.0 - lam = self.random_state.beta(alpha, beta) - batch_size = X.shape[0] - device = torch.device('cuda' if X.is_cuda else 'cpu') - permed_indices = torch.randperm(batch_size).to(device) + beta = 1.0 + lam = self.random_state.beta(beta, beta) + batch_size, n_columns = np.shape(X) + # shuffled_indices: Shuffled version of torch.arange(batch_size) + shuffled_indices = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size) r = self.random_state.rand(1) if beta <= 0 or r > self.alpha: - return X, {'y_a': y, 'y_b': y[permed_indices], 'lam': 1} + return X, {'y_a': y, 'y_b': y[shuffled_indices], 'lam': 1} - # batch_size (permutation of rows), col_size = X.shape - col_size = X.shape[1] - col_indices = torch.tensor( + cut_column_indices = torch.as_tensor( self.random_state.choice( - range(col_size), - max(1, int(col_size * lam)), - replace=False - ) + range(n_columns), + max(1, np.int32(n_columns * lam)), + replace=False, + ), ) - # Replace selected columns with columns from another data point - X[:, col_indices] = X[permed_indices, :][:, col_indices] + # Replace the values in `cut_indices` columns with + # the values from `permed_indices` + X[:, cut_column_indices] = X[shuffled_indices, :][:, cut_column_indices] - # Adjust lam - lam = 1 - len(col_indices) / X.shape[1] + # Since we cannot cut exactly `lam x 100 %` of rows, we need to adjust the `lam` + lam = 1 - (len(cut_column_indices) / n_columns) - y_a, y_b = y, y[permed_indices] + y_a, y_b = y, y[shuffled_indices] return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam} @staticmethod - def get_properties(dataset_properties: typing.Optional[typing.Dict[str, typing.Any]] = None - ) -> typing.Dict[str, typing.Union[str, bool]]: + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None + ) -> Dict[str, Union[str, bool]]: return { 'shortname': 'RowCutMixTrainer', 'name': 'MixUp Regularized with Cutoff Tabular Trainer', diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py index 5e1b9a1a3..4578082cb 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py @@ -1,9 +1,7 @@ -import typing +from typing import Any, Dict, Optional, Tuple, Union import numpy as np -import torch - from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent from autoPyTorch.pipeline.components.training.trainer.cutout_utils import CutOut @@ -17,13 +15,8 @@ class RowCutOutTrainer(CutOut, BaseTrainerComponent): Github URL: https://github.com/hysts/pytorch_cutout/blob/master/dataloader.py#L36-L68 """ - # 0 is non-informative in image data - NUMERICAL_VALUE = 0 - # -1 is the conceptually equivalent to 0 in a image, i.e. 0-pad - CATEGORICAL_VALUE = -1 - def data_preparation(self, X: np.ndarray, y: np.ndarray, - ) -> typing.Tuple[np.ndarray, typing.Dict[str, np.ndarray]]: + ) -> Tuple[np.ndarray, Dict[str, np.ndarray]]: """ Depending on the trainer choice, data fed to the network might be pre-processed on a different way. That is, in standard training we provide the data to the @@ -36,9 +29,8 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, Returns: np.ndarray: that processes data - typing.Dict[str, np.ndarray]: arguments to the criterion function + Dict[str, np.ndarray]: arguments to the criterion function """ - r = self.random_state.rand(1) if r > self.cutout_prob: y_a = y @@ -46,30 +38,23 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, lam = 1 return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam} - # (batch_size (permutation of rows), col_size) = X.shape - col_size = X.shape[1] - col_indices = self.random_state.choice(range(col_size), max(1, int(col_size * self.patch_ratio)), - replace=False) - - if not isinstance(self.numerical_columns, typing.Iterable): - raise ValueError("numerical_columns in {} must be iterable, " - "but got {}.".format(self.__class__.__name__, - self.numerical_columns)) - - numerical_indices = torch.tensor(self.numerical_columns) - categorical_indices = torch.tensor([idx for idx in col_indices if idx not in self.numerical_columns]) - - X[:, categorical_indices.long()] = self.CATEGORICAL_VALUE - X[:, numerical_indices.long()] = self.NUMERICAL_VALUE + size: int = np.shape(X)[1] + cut_column_indices = self.random_state.choice( + range(size), + max(1, np.int32(size * self.patch_ratio)), + replace=False, + ) + # Mask the selected features as 0 + X[:, cut_column_indices] = 0 lam = 1 y_a = y y_b = y return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam} @staticmethod - def get_properties(dataset_properties: typing.Optional[typing.Dict[str, typing.Any]] = None - ) -> typing.Dict[str, typing.Union[str, bool]]: + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None + ) -> Dict[str, Union[str, bool]]: return { 'shortname': 'RowCutOutTrainer', 'name': 'RowCutOutTrainer', diff --git a/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py b/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py index 02db03f1d..1f6bbd224 100644 --- a/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py @@ -7,7 +7,7 @@ class StandardTrainer(BaseTrainerComponent): def __init__(self, - weighted_loss: bool = False, + weighted_loss: int = 0, use_stochastic_weight_averaging: bool = False, use_snapshot_ensemble: bool = False, se_lastk: int = 3, @@ -18,7 +18,7 @@ def __init__(self, This class handles the training of a network for a single given epoch. Args: - weighted_loss (bool): whether to use weighted loss + weighted_loss (int): whether to use weighted loss """ super().__init__(random_state=random_state, diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index a8023bee5..8c5df84b2 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -175,7 +175,7 @@ class BaseTrainerComponent(autoPyTorchTrainingComponent): """ Base class for training Args: - weighted_loss (bool, default=False): In case for classification, whether to weight + weighted_loss (int, default=0): In case for classification, whether to weight the loss function according to the distribution of classes in the target use_stochastic_weight_averaging (bool, default=True): whether to use stochastic weight averaging. Stochastic weight averaging is a simple average of @@ -190,7 +190,7 @@ class BaseTrainerComponent(autoPyTorchTrainingComponent): random_state: **lookahead_config: """ - def __init__(self, weighted_loss: bool = False, + def __init__(self, weighted_loss: int = 0, use_stochastic_weight_averaging: bool = True, use_snapshot_ensemble: bool = True, se_lastk: int = 3, @@ -319,8 +319,12 @@ def on_epoch_end(self, X: Dict[str, Any], epoch: int) -> bool: if self.use_snapshot_ensemble: assert self.model_snapshots is not None, "model snapshots container can't be " \ "none when snapshot ensembling is enabled" - model_copy = deepcopy(self.swa_model) if self.use_stochastic_weight_averaging \ - else deepcopy(self.model) + is_last_epoch = (epoch == self.budget_tracker.max_epochs) + if is_last_epoch and self.use_stochastic_weight_averaging: + model_copy = deepcopy(self.swa_model) + else: + model_copy = deepcopy(self.model) + assert model_copy is not None model_copy.cpu() self.model_snapshots.append(model_copy) @@ -538,8 +542,8 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict] = None, weighted_loss: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="weighted_loss", - value_range=(True, False), - default_value=True), + value_range=(1, ), + default_value=1), la_steps: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="la_steps", value_range=(5, 10), @@ -564,7 +568,7 @@ def get_hyperparameter_search_space( default_value=True), se_lastk: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="se_lastk", - value_range=(3,), + value_range=(3, ), default_value=3), ) -> ConfigurationSpace: cs = ConfigurationSpace() @@ -596,9 +600,17 @@ def get_hyperparameter_search_space( parent_hyperparameter=parent_hyperparameter ) + """ # TODO, decouple the weighted loss from the trainer if dataset_properties is not None: if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS: add_hyperparameter(cs, weighted_loss, CategoricalHyperparameter) + """ + # TODO, decouple the weighted loss from the trainer. Uncomment the code above and + # remove the code below. Also update the method signature, so the weighted loss + # is not a constant. + if dataset_properties is not None: + if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS: + add_hyperparameter(cs, weighted_loss, Constant) return cs diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py index 1d63b6141..ec2ef6106 100755 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py @@ -285,7 +285,6 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom self.choice: autoPyTorchComponent = cast(autoPyTorchComponent, self.choice) if self.choice.use_snapshot_ensemble: X['network_snapshots'].extend(self.choice.model_snapshots) - return self.choice def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice': @@ -402,14 +401,15 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic torch.cuda.empty_cache() if self.choice.use_stochastic_weight_averaging and self.choice.swa_updated: + # update batch norm statistics swa_utils.update_bn(loader=X['train_data_loader'], model=self.choice.swa_model.double()) # change model update_model_state_dict_from_swa(X['network'], self.choice.swa_model.state_dict()) if self.choice.use_snapshot_ensemble: - for model in self.choice.model_snapshots: - swa_utils.update_bn(loader=X['train_data_loader'], model=model.double()) + # we update only the last network which pertains to the stochastic weight averaging model + swa_utils.update_bn(X['train_data_loader'], self.choice.model_snapshots[-1].double()) # wrap up -- add score if not evaluating every epoch if not self.eval_valid_each_epoch(X): diff --git a/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py b/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py index 1b987d599..4feedf5cb 100644 --- a/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py +++ b/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py @@ -20,7 +20,7 @@ class CutOut: def __init__(self, patch_ratio: float, cutout_prob: float, - weighted_loss: bool = False, + weighted_loss: int = 0, random_state: Optional[np.random.RandomState] = None, use_stochastic_weight_averaging: bool = False, use_snapshot_ensemble: bool = False, @@ -63,8 +63,8 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict] = None, weighted_loss: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="weighted_loss", - value_range=(True, False), - default_value=True), + value_range=(1, ), + default_value=1), la_steps: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="la_steps", value_range=(5, 10), @@ -136,9 +136,17 @@ def get_hyperparameter_search_space( parent_hyperparameter=parent_hyperparameter ) + """ # TODO, decouple the weighted loss from the trainer if dataset_properties is not None: if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS: add_hyperparameter(cs, weighted_loss, CategoricalHyperparameter) + """ + # TODO, decouple the weighted loss from the trainer. Uncomment the code above and + # remove the code below. Also update the method signature, so the weighted loss + # is not a constant. + if dataset_properties is not None: + if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS: + add_hyperparameter(cs, weighted_loss, Constant) return cs diff --git a/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py b/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py index e33011bf5..e2ea25148 100644 --- a/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py +++ b/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py @@ -26,7 +26,7 @@ class MixUp: Github URL: https://github.com/facebookresearch/mixup-cifar10/blob/master/train.py#L119-L138 """ def __init__(self, alpha: float, - weighted_loss: bool = False, + weighted_loss: int = 0, random_state: Optional[np.random.RandomState] = None, use_stochastic_weight_averaging: bool = False, use_snapshot_ensemble: bool = False, @@ -68,8 +68,8 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict] = None, weighted_loss: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="weighted_loss", - value_range=(True, False), - default_value=True), + value_range=(1, ), + default_value=1), la_steps: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="la_steps", value_range=(5, 10), @@ -94,7 +94,7 @@ def get_hyperparameter_search_space( default_value=True), se_lastk: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="se_lastk", - value_range=(3,), + value_range=(3, ), default_value=3), alpha: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="alpha", @@ -134,9 +134,18 @@ def get_hyperparameter_search_space( la_config_space, parent_hyperparameter=parent_hyperparameter ) + + """ # TODO, decouple the weighted loss from the trainer if dataset_properties is not None: if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS: add_hyperparameter(cs, weighted_loss, CategoricalHyperparameter) + """ + # TODO, decouple the weighted loss from the trainer. Uncomment the code above and + # remove the code below. Also update the method signature, so the weighted loss + # is not a constant. + if dataset_properties is not None: + if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS: + add_hyperparameter(cs, weighted_loss, Constant) return cs diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index bb4cb10ac..b611cd8b3 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -7,7 +7,6 @@ import numpy as np -import sklearn.preprocessing from sklearn.base import ClassifierMixin import torch @@ -91,13 +90,8 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray: loader = self.named_steps['data_loader'].get_loader(X=X) pred = self.named_steps['network'].predict(loader) if isinstance(self.dataset_properties['output_shape'], int): - proba = pred[:, :self.dataset_properties['output_shape']] - normalizer = proba.sum(axis=1)[:, np.newaxis] - normalizer[normalizer == 0.0] = 1.0 - proba /= normalizer - - return proba - + # The final layer is always softmax now (`pred` already gives pseudo proba) + return pred else: all_proba = [] @@ -108,7 +102,7 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray: proba_k /= normalizer all_proba.append(proba_k) - return all_proba + return np.array(all_proba) def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: """predict_proba. @@ -145,11 +139,6 @@ def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.n pred_prob = self.predict_proba(X[batch_from:batch_to], batch_size=None) y[batch_from:batch_to] = pred_prob.astype(np.float32) - # Neural networks might not be fit to produce a [0-1] output - # For instance, after small number of epochs. - y = np.clip(y, 0, 1) - y = sklearn.preprocessing.normalize(y, axis=1, norm='l1') - return y def _get_hyperparameter_search_space(self, diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py index c9681adb3..5348bd11c 100644 --- a/autoPyTorch/utils/backend.py +++ b/autoPyTorch/utils/backend.py @@ -204,13 +204,14 @@ def temporary_directory(self) -> str: return self.context.temporary_directory def _make_internals_directory(self) -> None: + # TODO: make exist_ok a function argument try: - os.makedirs(self.internals_directory) + os.makedirs(self.internals_directory, exist_ok=True) except Exception as e: if self._logger is not None: self._logger.debug("_make_internals_directory: %s" % e) try: - os.makedirs(self.get_runs_directory()) + os.makedirs(self.get_runs_directory(), exist_ok=True) except Exception as e: if self._logger is not None: self._logger.debug("_make_internals_directory: %s" % e) @@ -328,6 +329,20 @@ def load_datamanager(self) -> BaseDataset: with open(filepath, 'rb') as fh: return pickle.load(fh) + def replace_datamanager(self, datamanager: BaseDataset) -> None: + """ + This function is called to replace the old datamanager with a datamanager + in case it is required. + + Args: + datamanager (BaseDataset): the new datamanager to replace the old. + """ + warnings.warn("Original dataset will be overwritten with the provided dataset") + datamanager_pickle_file = self._get_datamanager_pickle_filename() + if os.path.exists(datamanager_pickle_file): + os.remove(datamanager_pickle_file) + self.save_datamanager(datamanager=datamanager) + def get_runs_directory(self) -> str: return os.path.join(self.internals_directory, 'runs') diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index a2ec4d580..892b5a1ad 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -96,7 +96,7 @@ def replace_prefix_in_config_dict(config: Dict[str, Any], prefix: str, replace: k.startswith(prefix)} -def custom_collate_fn(batch: List) -> List[Optional[torch.tensor]]: +def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: """ In the case of not providing a y tensor, in a dataset of form {X, y}, y would be None. diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py index 2130cfd6b..40ceb85d1 100644 --- a/autoPyTorch/utils/implementations.py +++ b/autoPyTorch/utils/implementations.py @@ -35,7 +35,6 @@ def __call__(self, y: Union[np.ndarray, torch.Tensor]) -> np.ndarray: weights = (np.ones(y.shape[1]) * weight_per_class) / np.maximum(counts, 1) else: classes, counts = np.unique(y, axis=0, return_counts=True) - classes, counts = classes[::-1], counts[::-1] weight_per_class = total_weight / classes.shape[0] weights = (np.ones(classes.shape[0]) * weight_per_class) / counts diff --git a/examples/tabular/40_advanced/example_custom_configuration_space.py b/examples/tabular/40_advanced/example_custom_configuration_space.py index 6a3764b94..b95ceeaa5 100644 --- a/examples/tabular/40_advanced/example_custom_configuration_space.py +++ b/examples/tabular/40_advanced/example_custom_configuration_space.py @@ -54,6 +54,15 @@ def get_search_space_updates(): hyperparameter='ResNetBackbone:dropout', value_range=[0, 0.5], default_value=0.2) + updates.append(node_name='network_backbone', + hyperparameter='ResNetBackbone:multi_branch_choice', + value_range=['shake-shake'], + default_value='shake-shake') + updates.append(node_name='network_backbone', + hyperparameter='ResNetBackbone:shake_shake_method', + value_range=['M3'], + default_value='M3' + ) return updates @@ -74,7 +83,7 @@ def get_search_space_updates(): # ================================================== api = TabularClassificationTask( search_space_updates=get_search_space_updates(), - include_components={'network_backbone': ['MLPBackbone', 'ResNetBackbone'], + include_components={'network_backbone': ['ResNetBackbone'], 'encoder': ['OneHotEncoder']} ) diff --git a/examples/tabular/40_advanced/example_posthoc_ensemble_fit.py b/examples/tabular/40_advanced/example_posthoc_ensemble_fit.py new file mode 100644 index 000000000..b9383b2a6 --- /dev/null +++ b/examples/tabular/40_advanced/example_posthoc_ensemble_fit.py @@ -0,0 +1,81 @@ +""" +===================================================== +Tabular Classification with Post-Hoc Ensemble Fitting +===================================================== + +The following example shows how to fit a sample classification model +and create an ensemble post-hoc with AutoPyTorch +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import sklearn.datasets +import sklearn.model_selection + +from autoPyTorch.api.tabular_classification import TabularClassificationTask + + +if __name__ == '__main__': + + ############################################################################ + # Data Loading + # ============ + X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=42, + ) + + ############################################################################ + # Build and fit a classifier + # ========================== + api = TabularClassificationTask( + ensemble_size=0, + seed=42, + ) + + ############################################################################ + # Search for the best neural network + # ================================== + api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=250, + func_eval_time_limit_secs=50 + ) + + ############################################################################ + # Print the final performance of the incumbent neural network + # =========================================================== + print(api.run_history, api.trajectory) + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) + + ############################################################################ + # Fit an ensemble with the neural networks fitted during the search + # ================================================================= + + api.fit_ensemble(ensemble_size=5, + # Set the enable_traditional_pipeline=True + # to also include traditional models + # in the ensemble + enable_traditional_pipeline=False) + # Print the final ensemble built by AutoPyTorch + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) + print(api.show_models()) diff --git a/requirements.txt b/requirements.txt index c79104461..f4a913789 100755 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ imgaug>=0.4.0 ConfigSpace>=0.4.14,<0.5 pynisher>=0.6.3 pyrfr>=0.7,<0.9 -smac>=0.13.1,<0.14 +smac dask distributed>=2.2.0 catboost diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index f9ba2855e..3a70549b0 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -1,4 +1,5 @@ import copy +import functools import numpy as np @@ -283,9 +284,9 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest): if isinstance(input_data_featuretest, pd.DataFrame): pytest.skip("Column order change in pandas is not supported") elif isinstance(input_data_featuretest, np.ndarray): - complementary_type = validator.numpy_array_to_pandas(input_data_featuretest) + complementary_type = validator.numpy_to_pandas(input_data_featuretest) elif isinstance(input_data_featuretest, list): - complementary_type, _ = validator.list_to_dataframe(input_data_featuretest) + complementary_type, _ = validator.list_to_pandas(input_data_featuretest) elif sparse.issparse(input_data_featuretest): complementary_type = sparse.csr_matrix(input_data_featuretest.todense()) else: @@ -311,10 +312,121 @@ def test_featurevalidator_get_columns_to_encode(): for col in df.columns: df[col] = df[col].astype(col) - enc_columns, feature_types = validator._get_columns_to_encode(df) + validator.fit(df) - assert enc_columns == ['category', 'bool'] - assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical'] + categorical_columns, numerical_columns, feat_type = validator._get_columns_info(df) + + assert numerical_columns == ['int', 'float'] + assert categorical_columns == ['category', 'bool'] + assert feat_type == ['numerical', 'numerical', 'categorical', 'categorical'] + + +def feature_validator_remove_nan_catcolumns(df_train: pd.DataFrame, df_test: pd.DataFrame, + ans_train: np.ndarray, ans_test: np.ndarray) -> None: + validator = TabularFeatureValidator() + validator.fit(df_train) + transformed_df_train = validator.transform(df_train) + transformed_df_test = validator.transform(df_test) + + assert np.array_equal(transformed_df_train, ans_train) + assert np.array_equal(transformed_df_test, ans_test) + + +def test_feature_validator_remove_nan_catcolumns(): + """ + Make sure categorical columns that have only nan values are removed. + Transform performs the folloing: + * simple imputation for both + * scaling for numerical + * one-hot encoding for categorical + For example, + data = [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'B': 3, 'C': np.nan}, + {'A': 2, 'B': np.nan, 'C': np.nan} + ] + and suppose all the columns are categorical, + then + * `A` in {np.nan, 1, 2} + * `B` in {np.nan, 3} + * `C` in {np.nan} <=== it will be dropped. + + So in the column A, + * np.nan ==> [1, 0, 0] + * 1 ==> [0, 1, 0] + * 2 ==> [0, 0, 1] + in the column B, + * np.nan ==> [1, 0] + * 3 ==> [0, 1] + Therefore, by concatenating, + * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0] + * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1] + * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0] + """ + # First case, there exist null columns (B and C) in the train set + # and a same column (C) are not all null for the test set. + + df_train = pd.DataFrame( + [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + ans_train = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan, 'C': 5}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + ans_test = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float64) + feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test) + + # Second case, there exist null columns (B and C) in the training set and + # the same columns (B and C) are null in the test set. + df_train = pd.DataFrame( + [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + ans_train = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + ans_test = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float64) + feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test) + + # Third case, there exist no null columns in the training set and + # null columns exist in the test set. + df_train = pd.DataFrame( + [ + {'A': 1, 'B': 1}, + {'A': 2, 'B': 2} + ], + dtype='category', + ) + ans_train = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=np.float64) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan}, + {'A': np.nan, 'B': np.nan} + ], + dtype='category', + ) + ans_test = np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=np.float64) + feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test) def test_features_unsupported_calls_are_raised(): @@ -550,15 +662,16 @@ def test_feature_validator_imbalanced_data(): validator.fit(X_train) train_feature_types = copy.deepcopy(validator.feat_type) - assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical'] + assert train_feature_types == ['numerical'] # validator will throw an error if the column types are not the same transformed_X_test = validator.transform(X_test) transformed_X_test = pd.DataFrame(transformed_X_test) - null_columns = [] - for column in transformed_X_test.columns: - if transformed_X_test[column].isna().all(): - null_columns.append(column) - assert null_columns == [0, 2, 3] + assert sorted(validator.all_nan_columns) == sorted(['A', 'C', 'D']) + # as there are no categorical columns, we can make such an + # assertion. We only expect to drop the all nan columns + total_all_nan_columns = len(validator.all_nan_columns) + total_columns = len(validator.column_order) + assert total_columns - total_all_nan_columns == len(transformed_X_test.columns) # Columns with not all null values in the train split and # completely null on the test split. @@ -577,14 +690,33 @@ def test_feature_validator_imbalanced_data(): X_test = pd.DataFrame.from_dict(test_features) validator = TabularFeatureValidator() validator.fit(X_train) + train_feature_types = copy.deepcopy(validator.feat_type) assert train_feature_types == ['categorical', 'numerical', 'numerical'] transformed_X_test = validator.transform(X_test) transformed_X_test = pd.DataFrame(transformed_X_test) - null_columns = [] - for column in transformed_X_test.columns: - if transformed_X_test[column].isna().all(): - null_columns.append(column) - - assert null_columns == [1] + assert not len(validator.all_nan_columns) + + +def test_comparator(): + numerical = 'numerical' + categorical = 'categorical' + + validator = TabularFeatureValidator + + feat_type = [numerical, categorical] * 10 + ans = [categorical] * 10 + [numerical] * 10 + feat_type = sorted( + feat_type, + key=functools.cmp_to_key(validator._comparator) + ) + assert ans == feat_type + + feat_type = [numerical] * 10 + [categorical] * 10 + ans = [categorical] * 10 + [numerical] * 10 + feat_type = sorted( + feat_type, + key=functools.cmp_to_key(validator._comparator) + ) + assert ans == feat_type