From 751d08688f99f1c3c54700baf6c6b5d1d4caa821 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 22 Feb 2022 11:20:41 +0100 Subject: [PATCH 01/12] Initial implementation without tests --- autoPyTorch/api/tabular_classification.py | 28 +- autoPyTorch/data/tabular_feature_validator.py | 25 +- autoPyTorch/data/tabular_validator.py | 8 +- autoPyTorch/data/utils.py | 302 ++++++++++++++++++ 4 files changed, 358 insertions(+), 5 deletions(-) create mode 100644 autoPyTorch/data/utils.py diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 03519bef8..ef8a80b79 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np @@ -11,6 +11,11 @@ TASK_TYPES_TO_STRING, ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.utils import ( + DatasetCompressionSpec, + default_dataset_compression_arg, + validate_dataset_compression_arg +) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( HoldoutValTypes, @@ -163,6 +168,7 @@ def _get_dataset_input_validator( resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, ) -> Tuple[TabularDataset, TabularInputValidator]: """ Returns an object of `TabularDataset` and an object of @@ -202,6 +208,7 @@ def _get_dataset_input_validator( InputValidator = TabularInputValidator( is_classification=True, logger_port=self._logger_port, + dataset_compression=dataset_compression ) # Fit a input validator to check the provided data @@ -242,6 +249,7 @@ def search( disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
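For reference, a minimal sketch of how the new ``dataset_compression`` argument is intended to be passed through the public API. This is illustrative only: the class name, the ``X_train``/``y_train`` data and the remaining ``search()`` arguments are assumed here, and the only accepted keys are those of ``default_dataset_compression_arg`` introduced later in this patch.

.. code-block:: python

    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    # X_train / y_train: an existing tabular dataset (e.g. a pandas DataFrame
    # of features and an array of labels) -- assumed to be defined elsewhere.
    api = TabularClassificationTask()
    api.search(
        X_train=X_train,
        y_train=y_train,
        optimize_metric='accuracy',
        memory_limit=4096,  # MB
        # try to fit the transformed features into 0.1 * memory_limit by
        # reducing floating point precision
        dataset_compression={'memory_allocation': 0.1, 'methods': ['precision']},
    )

Passing ``dataset_compression=True`` (the boolean form handled in the ``search()`` body in the following hunk) selects the same defaults, ``{"memory_allocation": 0.1, "methods": ["precision"]}``.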
@@ -374,6 +382,20 @@ def search( """ + self._dataset_compression: Optional[DatasetCompressionSpec] + + if isinstance(dataset_compression, bool): + if dataset_compression is True: + self._dataset_compression = default_dataset_compression_arg + else: + self._dataset_compression = None + else: + self._dataset_compression = dataset_compression + + if self._dataset_compression is not None: + self._dataset_compression = validate_dataset_compression_arg( + self._dataset_compression, memory_limit=memory_limit) + self.dataset, self.InputValidator = self._get_dataset_input_validator( X_train=X_train, y_train=y_train, @@ -381,7 +403,9 @@ def search( y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - dataset_name=dataset_name) + dataset_name=dataset_name, + dataset_compression=self._dataset_compression) + return self._search( dataset=self.dataset, diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 4bab001c6..8927ff013 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,5 +1,6 @@ import functools -from typing import Dict, List, Optional, Tuple, cast +from logging import Logger +from typing import Any, Dict, List, Mapping, Optional, Tuple, Type, Union, cast import numpy as np @@ -17,6 +18,8 @@ from sklearn.pipeline import make_pipeline from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes +from autoPyTorch.data.utils import DatasetDTypeContainerType, reduce_dataset_size_if_too_large +from autoPyTorch.utils.logging_ import PicklableClientLogger def _create_column_transformer( @@ -92,6 +95,15 @@ class TabularFeatureValidator(BaseFeatureValidator): categorical_columns (List[int]): List of indices of categorical columns """ + def __init__( + self, + logger: Optional[Union[PicklableClientLogger, Logger]] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, + ) -> None: + self._dataset_compression = dataset_compression + self._precision: Optional[DatasetDTypeContainerType] = None + super().__init__(logger) + @staticmethod def _comparator(cmp1: str, cmp2: str) -> int: """Order so that categorical columns come left and numerical columns come right @@ -259,6 +271,17 @@ def transform( if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'): X.sort_indices() + if ( + ( + isinstance(X, np.ndarray) or scipy.sparse.issparse(X) or hasattr(X, 'iloc') + ) + and self._dataset_compression is not None + ): + if self._precision is not None: + X.astype(self._precision) + else: + X, self._precision = reduce_dataset_size_if_too_large(X, **self._dataset_compression) + try: X = sklearn.utils.check_array( X, diff --git a/autoPyTorch/data/tabular_validator.py b/autoPyTorch/data/tabular_validator.py index 677b55d4b..4db415f93 100644 --- a/autoPyTorch/data/tabular_validator.py +++ b/autoPyTorch/data/tabular_validator.py @@ -1,6 +1,6 @@ # -*- encoding: utf-8 -*- import logging -from typing import Optional, Union +from typing import Any, Mapping, Optional, Union from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator @@ -32,9 +32,11 @@ def __init__( self, is_classification: bool = False, logger_port: Optional[int] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, ) -> None: self.is_classification = is_classification self.logger_port = logger_port + self.dataset_compression = 
dataset_compression if self.logger_port is not None: self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( name='Validation', @@ -43,7 +45,9 @@ def __init__( else: self.logger = logging.getLogger('Validation') - self.feature_validator = TabularFeatureValidator(logger=self.logger) + self.feature_validator = TabularFeatureValidator( + dataset_compression=self.dataset_compression, + logger=self.logger) self.target_validator = TabularTargetValidator( is_classification=self.is_classification, logger=self.logger diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py new file mode 100644 index 000000000..fde1df95e --- /dev/null +++ b/autoPyTorch/data/utils.py @@ -0,0 +1,302 @@ +# Implementation used from https://github.com/automl/auto-sklearn/blob/development/autosklearn/util/data.py +from math import floor +import warnings +from typing import ( + Any, + Dict, + Iterator, + List, + Mapping, + Optional, + Sequence, + Tuple, + Type, + Union, + cast +) + +import numpy as np + +import pandas as pd + +from scipy.sparse import spmatrix, issparse + + +# TODO: TypedDict with python 3.8 +# +# When upgrading to python 3.8 as minimum version, this should be a TypedDict +# so that mypy can identify the fields types +DatasetCompressionSpec = Dict[str, Union[int, float, List[str]]] +DatasetDTypeContainerType = Union[Type, Dict[str, Type]] +DatasetCompressionInputType = Union[np.ndarray, spmatrix, pd.DataFrame] + +# Default specification for arg `dataset_compression` +default_dataset_compression_arg: DatasetCompressionSpec = { + "memory_allocation": 0.1, + "methods": ["precision"] +} + + +def validate_dataset_compression_arg( + dataset_compression: Mapping[str, Any], + memory_limit: int +) -> DatasetCompressionSpec: + """Validates and return a correct dataset_compression argument + + The returned value can be safely used with `reduce_dataset_size_if_too_large`. + + Parameters + ---------- + dataset_compression: Mapping[str, Any] + The argumnents to validate + + Returns + ------- + DatasetCompressionSpec + The validated and correct dataset compression spec + """ + if isinstance(dataset_compression, Mapping): + # Fill with defaults if they don't exist + dataset_compression = { + **default_dataset_compression_arg, + **dataset_compression + } + + # Must contain known keys + if set(dataset_compression.keys()) != set(default_dataset_compression_arg.keys()): + raise ValueError( + f"Unknown key in dataset_compression, {list(dataset_compression.keys())}." 
+ f"\nPossible keys are {list(default_dataset_compression_arg.keys())}" + ) + + memory_allocation = dataset_compression["memory_allocation"] + + # "memory_allocation" must be float or int + if not (isinstance(memory_allocation, float) or isinstance(memory_allocation, int)): + raise ValueError( + "key 'memory_allocation' must be an `int` or `float`" + f"\ntype = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "memory_allocation" if absolute, should be > 0 and < memory_limit + if isinstance(memory_allocation, int) and not (0 < memory_allocation < memory_limit): + raise ValueError( + f"key 'memory_allocation' if int must be in (0, memory_limit={memory_limit})" + f"\nmemory_allocation = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "memory_allocation" must be in (0,1) if float + if isinstance(memory_allocation, float): + if not (0.0 < memory_allocation < 1.0): + raise ValueError( + "key 'memory_allocation' if float must be in (0, 1)" + f"\nmemory_allocation = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + # convert to int so we can directly use + dataset_compression["memory_allocation"] = floor(memory_allocation * memory_limit) + + # "methods" must be non-empty sequence + if ( + not isinstance(dataset_compression["methods"], Sequence) + or len(dataset_compression["methods"]) <= 0 + ): + raise ValueError( + "key 'methods' must be a non-empty list" + f"\nmethods = {dataset_compression['methods']}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "methods" must contain known methods + if any( + method not in cast(Sequence, default_dataset_compression_arg["methods"]) # mypy + for method in dataset_compression["methods"] + ): + raise ValueError( + f"key 'methods' can only contain {default_dataset_compression_arg['methods']}" + f"\nmethods = {dataset_compression['methods']}" + f"\ndataset_compression = {dataset_compression}" + ) + + return cast(DatasetCompressionSpec, dataset_compression) + else: + raise ValueError( + f"Unknown type for `dataset_compression` {type(dataset_compression)}" + f"\ndataset_compression = {dataset_compression}" + ) + + +class _DtypeReductionMapping(Mapping): + """ + Unfortuantly, mappings compare by hash(item) and not the __eq__ operator + between the key and the item. + + Hence we wrap the dict in a Mapping class and implement our own __getitem__ + such that we do use __eq__ between keys and query items. + + >>> np.float32 == dtype('float32') # True, they are considered equal + >>> + >>> mydict = { np.float32: 'hello' } + >>> + >>> # Equal by __eq__ but dict operations fail + >>> np.dtype('float32') in mydict # False + >>> mydict[dtype('float32')] # KeyError + + This mapping class fixes that supporting the `in` operator as well as `__getitem__` + + >>> reduction_mapping = _DtypeReductionMapping() + >>> + >>> reduction_mapping[np.dtype('float64')] # np.float32 + >>> np.dtype('float32') in reduction_mapping # True + """ + + # Information about dtype support + _mapping: Dict[type, type] = { + np.float32: np.float32, + np.float64: np.float32, + np.int32: np.int32, + np.int64: np.int32 + } + + # In spite of the names, np.float96 and np.float128 + # provide only as much precision as np.longdouble, + # that is, 80 bits on most x86 machines and 64 bits + # in standard Windows builds. 
+ if hasattr(np, 'float96'): + _mapping[np.float96] = np.float64 + + if hasattr(np, 'float128'): + _mapping[np.float128] = np.float64 + + @classmethod + def __getitem__(cls, item: type) -> type: + for k, v in cls._mapping.items(): + if k == item: + return v + raise KeyError(item) + + @classmethod + def __iter__(cls) -> Iterator[type]: + return iter(cls._mapping.keys()) + + @classmethod + def __len__(cls) -> int: + return len(cls._mapping) + + +reduction_mapping = _DtypeReductionMapping() +supported_precision_reductions = list(reduction_mapping) + + +def reduce_precision( + X: DatasetCompressionInputType +) -> Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]: + """ Reduces the precision of a dataset containing floats or ints + + Parameters + ---------- + X: DatasetCompressionInputType + The data to reduce precision of. + + Returns + ------- + Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType] + Returns the reduced data X along with the dtypes it and the dtypes it was reduced to. + """ + precision: Optional[DatasetDTypeContainerType] = None + if isinstance(X, np.ndarray) or issparse(X): + dtypes = X.dtype + if X.dtype not in supported_precision_reductions: + raise ValueError(f"X.dtype = {X.dtype} not equal to any supported" + f" {supported_precision_reductions}") + precision = reduction_mapping[X.dtype] + X = X.astype(precision) + elif hasattr(X, 'iloc'): + dtypes = {col: X[col].dtype for col in X.columns} + precision = {col: reduction_mapping[dtype] for col, dtype in dtypes.items() + if dtype in supported_precision_reductions} + X = X.astype(precision) + else: + raise ValueError(f"Unrecognised data type of X, expected data type to " + f"be in (ndarray, spmatrix, pd.DataFrame), but got :{type(X)}") + + return X, precision, dtypes + + +def reduce_dataset_size_if_too_large( + X: DatasetCompressionInputType, + memory_allocation: int, + methods: List[str] = ['precision'], +) -> Tuple[DatasetCompressionInputType, Optional[DatasetDTypeContainerType]]: + f""" Reduces the size of the dataset if it's too close to the memory limit. + + Follows the order of the operations passed in and retains the type of its + input. + + Precision reduction will only work on the following data types: + - {supported_precision_reductions} + + Precision reduction will only perform one level of precision reduction. + Technically, you could supply multiple rounds of precision reduction, i.e. + to reduce np.float128 to np.float32 you could use `methods = ['precision'] * 2`. + + However, if that's the use case, it'd be advised to simply use the function + `autoPyTorch.data.utils.reduce_precision`. + + Parameters + ---------- + X: DatasetCompressionInputType + The features of the dataset. + + methods: List[str] = ['precision'] + A list of operations that are permitted to be performed to reduce + the size of the dataset. + + **precision** + + Reduce the precision of float types + + memory_allocation: int + The amount of memory to allocate to the dataset. It should specify an + absolute amount. + + Returns + ------- + DatasetCompressionInputType + The reduced X if reductions were needed + Optional[DatasetDTypeContainerType] + If the precision of the dataset is reduced, + we return the precision dtype container that can be + used for any other dataset in the current experiment. 
+ """ + + def megabytes(arr: DatasetCompressionInputType) -> float: + memory_in_bytes: Optional[int] = None + if isinstance(arr, np.ndarray): + memory_in_bytes = arr.nbytes + elif issparse(arr): + memory_in_bytes = arr.data.nbytes + elif hasattr(arr, 'iloc'): + memory_in_bytes = arr.memory_usage(index=True, deep=True).sum() + else: + return 0 + return memory_in_bytes / (2**20) + + precision: Optional[DatasetDTypeContainerType] = None + for method in methods: + + if method == 'precision': + # If the dataset is too big for the allocated memory, + # we then try to reduce the precision if it's a high precision dataset + if megabytes(X) > memory_allocation: + X, precision, dtypes = reduce_precision(X) + warnings.warn( + f'Dataset too large for allocated memory {memory_allocation}MB, ' + f'reduced the precision from {dtypes} to {precision}', + ) + else: + raise ValueError(f"Unknown operation `{method}`") + + return X, precision From ac98a8d1c4c3061316dc496f103dfe298b29946e Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 22 Feb 2022 15:59:41 +0100 Subject: [PATCH 02/12] add tests and make necessary changes --- autoPyTorch/api/tabular_classification.py | 41 ++++++++-- autoPyTorch/api/tabular_regression.py | 30 ++++++- autoPyTorch/data/tabular_feature_validator.py | 26 +++--- autoPyTorch/data/utils.py | 63 +++++++++------ test/test_data/test_feature_validator.py | 45 +++++++++++ test/test_data/test_utils.py | 81 +++++++++++++++++++ 6 files changed, 238 insertions(+), 48 deletions(-) create mode 100644 test/test_data/test_utils.py diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index ef8a80b79..8bfa446c4 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -12,7 +12,6 @@ ) from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import ( - DatasetCompressionSpec, default_dataset_compression_arg, validate_dataset_compression_arg ) @@ -241,7 +240,7 @@ def search( total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, enable_traditional_pipeline: bool = True, - memory_limit: Optional[int] = 4096, + memory_limit: int = 4096, smac_scenario_args: Optional[Dict[str, Any]] = None, get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, @@ -249,7 +248,7 @@ def search( disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, - dataset_compression: Optional[Mapping[str, Any]] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -318,7 +317,7 @@ def search( feature by turning this flag to False. All machine learning algorithms that are fitted during search() are considered for ensemble building. - memory_limit (Optional[int]: default=4096): + memory_limit (int: default=4096): Memory limit in MB for the machine learning algorithm. Autopytorch will stop fitting the machine learning algorithm if it tries to allocate more than memory_limit MB. If None @@ -376,13 +375,42 @@ def search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_. + dataset_compression: Union[bool, Mapping[str, Any]] = True + We compress datasets so that they fit into some predefined amount of memory. + **NOTE** + + Default configuration when left as ``True``: + .. 
code-block:: python + { + "memory_allocation": 0.1, + "methods": ["precision"] + } + You can also pass your own configuration with the same keys and choosing + from the available ``"methods"``. + The available options are described here: + **memory_allocation** + By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This + float value can be set with ``"memory_allocation": 0.1``. We also allow for + specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``. + The memory used by the dataset is checked after each reduction method is + performed. If the dataset fits into the allocated memory, any further methods + listed in ``"methods"`` will not be performed. + + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. + * ``"precision"`` - We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. Returns: self """ - - self._dataset_compression: Optional[DatasetCompressionSpec] + self._dataset_compression: Optional[Mapping[str, Any]] if isinstance(dataset_compression, bool): if dataset_compression is True: @@ -406,7 +434,6 @@ def search( dataset_name=dataset_name, dataset_compression=self._dataset_compression) - return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 8c0637e39..b4c0c3e1c 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np @@ -11,6 +11,10 @@ TASK_TYPES_TO_STRING ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.utils import ( + default_dataset_compression_arg, + validate_dataset_compression_arg +) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( HoldoutValTypes, @@ -164,6 +168,7 @@ def _get_dataset_input_validator( resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, ) -> Tuple[TabularDataset, TabularInputValidator]: """ Returns an object of `TabularDataset` and an object of @@ -203,6 +208,7 @@ def _get_dataset_input_validator( InputValidator = TabularInputValidator( is_classification=False, logger_port=self._logger_port, + dataset_compression=dataset_compression ) # Fit a input validator to check the provided data @@ -235,7 +241,7 @@ def search( total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, enable_traditional_pipeline: bool = True, - memory_limit: Optional[int] = 4096, + memory_limit: int = 4096, smac_scenario_args: Optional[Dict[str, Any]] = None, get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, @@ -243,6 +249,7 @@ def search( disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, ) -> 
'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -311,7 +318,7 @@ def search( feature by turning this flag to False. All machine learning algorithms that are fitted during search() are considered for ensemble building. - memory_limit (Optional[int]: default=4096): + memory_limit (int: default=4096): Memory limit in MB for the machine learning algorithm. Autopytorch will stop fitting the machine learning algorithm if it tries to allocate more than memory_limit MB. If None @@ -374,6 +381,20 @@ def search( self """ + self._dataset_compression: Optional[Mapping[str, Any]] + + if isinstance(dataset_compression, bool): + if dataset_compression is True: + self._dataset_compression = default_dataset_compression_arg + else: + self._dataset_compression = None + else: + self._dataset_compression = dataset_compression + + if self._dataset_compression is not None: + self._dataset_compression = validate_dataset_compression_arg( + self._dataset_compression, memory_limit=memory_limit) + self.dataset, self.InputValidator = self._get_dataset_input_validator( X_train=X_train, y_train=y_train, @@ -381,7 +402,8 @@ def search( y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - dataset_name=dataset_name) + dataset_name=dataset_name, + dataset_compression=self._dataset_compression) return self._search( dataset=self.dataset, diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 8927ff013..b9f211283 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,6 +1,6 @@ import functools from logging import Logger -from typing import Any, Dict, List, Mapping, Optional, Tuple, Type, Union, cast +from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast import numpy as np @@ -271,17 +271,6 @@ def transform( if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'): X.sort_indices() - if ( - ( - isinstance(X, np.ndarray) or scipy.sparse.issparse(X) or hasattr(X, 'iloc') - ) - and self._dataset_compression is not None - ): - if self._precision is not None: - X.astype(self._precision) - else: - X, self._precision = reduce_dataset_size_if_too_large(X, **self._dataset_compression) - try: X = sklearn.utils.check_array( X, @@ -295,6 +284,19 @@ def transform( "Please try to manually cast it to a supported " "numerical or categorical values.") raise e + + if ( + ( + isinstance(X, np.ndarray) or scipy.sparse.issparse(X) or hasattr(X, 'iloc') + ) + and self._dataset_compression is not None + ): + if self._precision is not None: + X = X.astype(self._precision) + else: + X = reduce_dataset_size_if_too_large(X, **self._dataset_compression) + self._precision = dict(X.dtypes) if hasattr(X, 'iloc') else X.dtype + return X def _check_data( diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index fde1df95e..16b0439f6 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -1,6 +1,6 @@ # Implementation used from https://github.com/automl/auto-sklearn/blob/development/autosklearn/util/data.py -from math import floor import warnings +from math import floor from typing import ( Any, Dict, @@ -18,8 +18,9 @@ import numpy as np import pandas as pd +from pandas.api.types import is_float_dtype, is_numeric_dtype -from scipy.sparse import spmatrix, issparse +from scipy.sparse import issparse, spmatrix # TODO: TypedDict with python 3.8 @@ -195,6 +196,9 @@ def 
reduce_precision( ) -> Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]: """ Reduces the precision of a dataset containing floats or ints + Note: + For dataframe, the column's precision is reduced using pd.to_numeric. + Parameters ---------- X: DatasetCompressionInputType @@ -214,22 +218,47 @@ def reduce_precision( precision = reduction_mapping[X.dtype] X = X.astype(precision) elif hasattr(X, 'iloc'): - dtypes = {col: X[col].dtype for col in X.columns} - precision = {col: reduction_mapping[dtype] for col, dtype in dtypes.items() - if dtype in supported_precision_reductions} - X = X.astype(precision) + dtypes = dict(X.dtypes) + + integer_columns = [] + float_columns = [] + + for col, dtype in dtypes.items(): + if is_numeric_dtype(dtype): + if is_float_dtype(dtype): + float_columns.append(col) + else: + integer_columns.append(col) + + if len(integer_columns) > 0: + X[integer_columns] = X[integer_columns].apply(lambda column: pd.to_numeric(column, downcast='integer')) + if len(float_columns) > 0: + X[float_columns] = X[float_columns].apply(lambda column: pd.to_numeric(column, downcast='float')) + precision = dict(X.dtypes) else: raise ValueError(f"Unrecognised data type of X, expected data type to " - f"be in (ndarray, spmatrix, pd.DataFrame), but got :{type(X)}") + f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(X)}") return X, precision, dtypes +def megabytes(arr: DatasetCompressionInputType) -> float: + if isinstance(arr, np.ndarray): + memory_in_bytes = arr.nbytes + elif issparse(arr): + memory_in_bytes = arr.data.nbytes + elif hasattr(arr, 'iloc'): + memory_in_bytes = arr.memory_usage(index=True, deep=True).sum() + else: + return 0 + return float(memory_in_bytes / (2**20)) + + def reduce_dataset_size_if_too_large( X: DatasetCompressionInputType, memory_allocation: int, methods: List[str] = ['precision'], -) -> Tuple[DatasetCompressionInputType, Optional[DatasetDTypeContainerType]]: +) -> DatasetCompressionInputType: f""" Reduces the size of the dataset if it's too close to the memory limit. Follows the order of the operations passed in and retains the type of its @@ -266,24 +295,8 @@ def reduce_dataset_size_if_too_large( ------- DatasetCompressionInputType The reduced X if reductions were needed - Optional[DatasetDTypeContainerType] - If the precision of the dataset is reduced, - we return the precision dtype container that can be - used for any other dataset in the current experiment. """ - def megabytes(arr: DatasetCompressionInputType) -> float: - memory_in_bytes: Optional[int] = None - if isinstance(arr, np.ndarray): - memory_in_bytes = arr.nbytes - elif issparse(arr): - memory_in_bytes = arr.data.nbytes - elif hasattr(arr, 'iloc'): - memory_in_bytes = arr.memory_usage(index=True, deep=True).sum() - else: - return 0 - return memory_in_bytes / (2**20) - precision: Optional[DatasetDTypeContainerType] = None for method in methods: @@ -299,4 +312,4 @@ def megabytes(arr: DatasetCompressionInputType) -> float: else: raise ValueError(f"Unknown operation `{method}`") - return X, precision + return X diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 7f2ff2507..5eb28309d 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -13,6 +13,7 @@ import sklearn.model_selection from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator +from autoPyTorch.data.utils import megabytes # Fixtures to be used in this class. 
By default all elements have 100 datapoints @@ -557,3 +558,47 @@ def test_comparator(): key=functools.cmp_to_key(validator._comparator) ) assert ans == feat_type + + +# Actual checks for the features +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'numpy_numericalonly_nonan', + 'numpy_numericalonly_nan', + 'numpy_mixed_nan', + 'pandas_numericalonly_nan', + 'sparse_bsr_nonan', + 'sparse_bsr_nan', + 'sparse_coo_nonan', + 'sparse_coo_nan', + 'sparse_csc_nonan', + 'sparse_csc_nan', + 'sparse_csr_nonan', + 'sparse_csr_nan', + 'sparse_dia_nonan', + 'sparse_dia_nan', + 'sparse_dok_nonan', + 'sparse_dok_nan', + 'openml_40981', # Australian + ), + indirect=True +) +def test_featurevalidator_reduce_precision(input_data_featuretest): + X_train, X_test = sklearn.model_selection.train_test_split( + input_data_featuretest, test_size=0.1, random_state=1) + validator = TabularFeatureValidator(dataset_compression={'memory_allocation': 0, 'methods': ['precision']}) + validator.fit(X_train=X_train) + transformed_X_train = validator.transform(X_train.copy()) + + assert validator._precision is not None + assert megabytes(transformed_X_train) < megabytes(X_train) + + transformed_X_test = validator.transform(X_test.copy()) + assert megabytes(transformed_X_test) < megabytes(X_test) + if hasattr(transformed_X_train, 'iloc'): + assert all(transformed_X_train.dtypes == transformed_X_test.dtypes) + assert all(transformed_X_train.dtypes == validator._precision) + else: + assert transformed_X_train.dtype == transformed_X_test.dtype + assert transformed_X_test.dtype == validator._precision diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py new file mode 100644 index 000000000..2c7b31419 --- /dev/null +++ b/test/test_data/test_utils.py @@ -0,0 +1,81 @@ +import numpy as np + +from pandas.testing import assert_frame_equal + +import pytest + +from sklearn.datasets import fetch_openml +from sklearn.model_selection import train_test_split + +from autoPyTorch.data.utils import ( + megabytes, + reduce_dataset_size_if_too_large, + reduce_precision, + validate_dataset_compression_arg +) +from autoPyTorch.utils.common import subsampler + + +@pytest.mark.parametrize('openmlid', [2, 40984]) +@pytest.mark.parametrize('as_frame', [True, False]) +def test_data_validation_for_classification(openmlid, as_frame, n_samples): + X, _ = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) + X = subsampler(data=X, x=range(n_samples)) + X_train, X_test = train_test_split( + X, test_size=0.33, random_state=0) + X_converted, precision = reduce_dataset_size_if_too_large(X.copy(), memory_allocation=0) + np.allclose(X, X_converted) if not as_frame else assert_frame_equal(X, X_converted, check_dtype=False) + assert megabytes(X_converted) < megabytes(X) + if as_frame: + assert isinstance(precision, dict) + assert isinstance(list(precision.values())[0], type) + else: + assert isinstance(precision, type) + + +def test_validate_dataset_compression_arg(): + + data_compression_args = validate_dataset_compression_arg({}, 10) + # check whether the function uses default args + # to fill in case args is empty + assert data_compression_args is not None + + # assert memory allocation is an integer after validation + assert isinstance(data_compression_args['memory_allocation'], int) + + # check whether the function raises an error + # in case an unknown key is in args + with pytest.raises(ValueError, match=r'Unknown key in dataset_compression, .*'): + validate_dataset_compression_arg({'not_there': 1}, 1) + + # 
check whether the function raises an error + # in case memory_allocation is not int or float is in args + with pytest.raises(ValueError, match=r"key 'memory_allocation' must be an `int` or `float`.*"): + validate_dataset_compression_arg({'memory_allocation': 'not int'}, 1) + + # check whether the function raises an error + # in case memory_allocation is an int greater than memory limit + with pytest.raises(ValueError, match=r"key 'memory_allocation' if int must be in.*"): + validate_dataset_compression_arg({'memory_allocation': 1}, 0) + + # check whether the function raises an error + # in case memory_allocation is a float greater than 1 + with pytest.raises(ValueError, match=r"key 'memory_allocation' if float must be in.*"): + validate_dataset_compression_arg({'memory_allocation': 1.5}, 0) + + # check whether the function raises an error + # in case an unknown method is passed in args + with pytest.raises(ValueError, match=r"key 'methods' can only contain .*"): + validate_dataset_compression_arg({'methods': 'unknown'}, 1) + + # check whether the function raises an error + # in case an unknown key is in args + with pytest.raises(ValueError, match=r'Unknown type for `dataset_compression` .*'): + validate_dataset_compression_arg(1, 1) + + +def test_error_raised_reduce_precision(): + # check whether the function raises an error + # in case X is not an expected type + with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to .*'): + reduce_precision(X='not expected') From 1eda40d34fa7ec43bdb4230946dcb6704f068c6c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 22 Feb 2022 17:20:51 +0100 Subject: [PATCH 03/12] improve documentation --- autoPyTorch/api/tabular_classification.py | 18 +++---- autoPyTorch/api/tabular_regression.py | 30 ++++++++++++ autoPyTorch/data/utils.py | 60 ++++++++++------------- test/test_data/test_utils.py | 4 +- 4 files changed, 67 insertions(+), 45 deletions(-) diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 8bfa446c4..f37b8b228 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -396,15 +396,15 @@ def search( performed. If the dataset fits into the allocated memory, any further methods listed in ``"methods"`` will not be performed. - **methods** - We currently provide the following methods for reducing the dataset size. - These can be provided in a list and are performed in the order as given. - * ``"precision"`` - We reduce floating point precision as follows: - * ``np.float128 -> np.float64`` - * ``np.float96 -> np.float64`` - * ``np.float64 -> np.float32`` - * pandas dataframes are reduced using the downcast option of `pd.to_numeric` - to the lowest possible precision. + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. + * ``"precision"`` - We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. 
Returns: self diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index b4c0c3e1c..cdbf49339 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -376,6 +376,36 @@ def search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_. + dataset_compression: Union[bool, Mapping[str, Any]] = True + We compress datasets so that they fit into some predefined amount of memory. + **NOTE** + + Default configuration when left as ``True``: + .. code-block:: python + { + "memory_allocation": 0.1, + "methods": ["precision"] + } + You can also pass your own configuration with the same keys and choosing + from the available ``"methods"``. + The available options are described here: + **memory_allocation** + By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This + float value can be set with ``"memory_allocation": 0.1``. We also allow for + specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``. + The memory used by the dataset is checked after each reduction method is + performed. If the dataset fits into the allocated memory, any further methods + listed in ``"methods"`` will not be performed. + + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. + * ``"precision"`` - We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. Returns: self diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index 16b0439f6..f38a64c3f 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -46,15 +46,13 @@ def validate_dataset_compression_arg( The returned value can be safely used with `reduce_dataset_size_if_too_large`. - Parameters - ---------- - dataset_compression: Mapping[str, Any] - The argumnents to validate - - Returns - ------- - DatasetCompressionSpec - The validated and correct dataset compression spec + Args: + dataset_compression: Mapping[str, Any] + The argumnents to validate + + Returns: + DatasetCompressionSpec + The validated and correct dataset compression spec """ if isinstance(dataset_compression, Mapping): # Fill with defaults if they don't exist @@ -199,15 +197,13 @@ def reduce_precision( Note: For dataframe, the column's precision is reduced using pd.to_numeric. - Parameters - ---------- - X: DatasetCompressionInputType - The data to reduce precision of. + Args: + X: DatasetCompressionInputType + The data to reduce precision of. - Returns - ------- - Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType] - Returns the reduced data X along with the dtypes it and the dtypes it was reduced to. + Returns: + Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType] + Returns the reduced data X along with the dtypes it and the dtypes it was reduced to. """ precision: Optional[DatasetDTypeContainerType] = None if isinstance(X, np.ndarray) or issparse(X): @@ -274,27 +270,25 @@ def reduce_dataset_size_if_too_large( However, if that's the use case, it'd be advised to simply use the function `autoPyTorch.data.utils.reduce_precision`. - Parameters - ---------- - X: DatasetCompressionInputType - The features of the dataset. 
+ Args: + X: DatasetCompressionInputType + The features of the dataset. - methods: List[str] = ['precision'] - A list of operations that are permitted to be performed to reduce - the size of the dataset. + methods: List[str] = ['precision'] + A list of operations that are permitted to be performed to reduce + the size of the dataset. - **precision** + **precision** - Reduce the precision of float types + Reduce the precision of float types - memory_allocation: int - The amount of memory to allocate to the dataset. It should specify an - absolute amount. + memory_allocation: int + The amount of memory to allocate to the dataset. It should specify an + absolute amount. - Returns - ------- - DatasetCompressionInputType - The reduced X if reductions were needed + Returns: + DatasetCompressionInputType + The reduced X if reductions were needed """ precision: Optional[DatasetDTypeContainerType] = None diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index 2c7b31419..f599cb604 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -5,7 +5,6 @@ import pytest from sklearn.datasets import fetch_openml -from sklearn.model_selection import train_test_split from autoPyTorch.data.utils import ( megabytes, @@ -21,8 +20,7 @@ def test_data_validation_for_classification(openmlid, as_frame, n_samples): X, _ = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) X = subsampler(data=X, x=range(n_samples)) - X_train, X_test = train_test_split( - X, test_size=0.33, random_state=0) + X_converted, precision = reduce_dataset_size_if_too_large(X.copy(), memory_allocation=0) np.allclose(X, X_converted) if not as_frame else assert_frame_equal(X, X_converted, check_dtype=False) assert megabytes(X_converted) < megabytes(X) From ed53a1fb29f32e8abf958144292fc6d35a2e7411 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 22 Feb 2022 18:34:05 +0100 Subject: [PATCH 04/12] fix tests --- test/test_data/test_utils.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index f599cb604..7f47469ef 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -17,18 +17,13 @@ @pytest.mark.parametrize('openmlid', [2, 40984]) @pytest.mark.parametrize('as_frame', [True, False]) -def test_data_validation_for_classification(openmlid, as_frame, n_samples): +def test_reduce_dataset_if_too_large(openmlid, as_frame, n_samples): X, _ = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) X = subsampler(data=X, x=range(n_samples)) - X_converted, precision = reduce_dataset_size_if_too_large(X.copy(), memory_allocation=0) + X_converted = reduce_dataset_size_if_too_large(X.copy(), memory_allocation=0) np.allclose(X, X_converted) if not as_frame else assert_frame_equal(X, X_converted, check_dtype=False) assert megabytes(X_converted) < megabytes(X) - if as_frame: - assert isinstance(precision, dict) - assert isinstance(list(precision.values())[0], type) - else: - assert isinstance(precision, type) def test_validate_dataset_compression_arg(): From 6a74d9f8661ae1d72ad4b340455246635c62db29 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Thu, 24 Feb 2022 11:34:35 +0100 Subject: [PATCH 05/12] Apply suggestions from code review Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/data/utils.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git 
a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index f38a64c3f..3915c2009 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -42,7 +42,7 @@ def validate_dataset_compression_arg( dataset_compression: Mapping[str, Any], memory_limit: int ) -> DatasetCompressionSpec: - """Validates and return a correct dataset_compression argument + """Validate and return a correct dataset_compression argument The returned value can be safely used with `reduce_dataset_size_if_too_large`. @@ -163,18 +163,11 @@ class _DtypeReductionMapping(Mapping): # provide only as much precision as np.longdouble, # that is, 80 bits on most x86 machines and 64 bits # in standard Windows builds. - if hasattr(np, 'float96'): - _mapping[np.float96] = np.float64 - - if hasattr(np, 'float128'): - _mapping[np.float128] = np.float64 + _mapping.update({getattr(np, s): np.float64 for s in ['float96', 'float128'] if hasattr(np, s)}) @classmethod def __getitem__(cls, item: type) -> type: - for k, v in cls._mapping.items(): - if k == item: - return v - raise KeyError(item) + return cls._mapping[item] @classmethod def __iter__(cls) -> Iterator[type]: @@ -192,7 +185,7 @@ def __len__(cls) -> int: def reduce_precision( X: DatasetCompressionInputType ) -> Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]: - """ Reduces the precision of a dataset containing floats or ints + """ Reduce the precision of a dataset containing floats or ints Note: For dataframe, the column's precision is reduced using pd.to_numeric. From 8353e221c78c488eeb7199d7daf164dcb07a9e6a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 24 Feb 2022 12:28:54 +0100 Subject: [PATCH 06/12] undo change in as it causes tests to fail --- autoPyTorch/data/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index 3915c2009..ce69e9d70 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -167,7 +167,10 @@ class _DtypeReductionMapping(Mapping): @classmethod def __getitem__(cls, item: type) -> type: - return cls._mapping[item] + for k, v in cls._mapping.items(): + if k == item: + return v + raise KeyError(item) @classmethod def __iter__(cls) -> Iterator[type]: From e61b9cb90cee55adba1757fe8dbea0207d4af3fb Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 24 Feb 2022 16:14:59 +0100 Subject: [PATCH 07/12] change name from InputValidator to input_validator --- autoPyTorch/api/base_task.py | 2 +- autoPyTorch/api/tabular_classification.py | 22 +++++++++++----------- autoPyTorch/api/tabular_regression.py | 16 ++++++++-------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 905d795fd..a048e2054 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -243,7 +243,7 @@ def __init__( if self.n_jobs == 1: self._multiprocessing_context = 'fork' - self.InputValidator: Optional[BaseInputValidator] = None + self.input_validator: Optional[BaseInputValidator] = None self.search_space_updates = search_space_updates if search_space_updates is not None: diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index f37b8b228..61dc68151 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -204,7 +204,7 @@ def _get_dataset_input_validator( # Create a validator object to make sure that the data provided by # the user matches the 
autopytorch requirements - InputValidator = TabularInputValidator( + input_validator = TabularInputValidator( is_classification=True, logger_port=self._logger_port, dataset_compression=dataset_compression @@ -213,18 +213,18 @@ def _get_dataset_input_validator( # Fit a input validator to check the provided data # Also, an encoder is fit to both train and test data, # to prevent unseen categories during inference - InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) dataset = TabularDataset( X=X_train, Y=y_train, X_test=X_test, Y_test=y_test, - validator=InputValidator, + validator=input_validator, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, dataset_name=dataset_name ) - return dataset, InputValidator + return dataset, input_validator def search( self, @@ -424,7 +424,7 @@ def search( self._dataset_compression = validate_dataset_compression_arg( self._dataset_compression, memory_limit=memory_limit) - self.dataset, self.InputValidator = self._get_dataset_input_validator( + self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, y_train=y_train, X_test=X_test, @@ -469,28 +469,28 @@ def predict( Returns: Array with estimator predictions. """ - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. Kindly call first " "the estimator search() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + X_test = self.input_validator.feature_validator.transform(X_test) predicted_probabilities = super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) - if self.InputValidator.target_validator.is_single_column_target(): + if self.input_validator.target_validator.is_single_column_target(): predicted_indexes = np.argmax(predicted_probabilities, axis=1) else: predicted_indexes = (predicted_probabilities > 0.5).astype(int) # Allow to predict in the original domain -- that is, the user is not interested # in our encoded values - return self.InputValidator.target_validator.inverse_transform(predicted_indexes) + return self.input_validator.target_validator.inverse_transform(predicted_indexes) def predict_proba(self, X_test: Union[np.ndarray, pd.DataFrame, List], batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. 
Kindly call first " "the estimator search() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + X_test = self.input_validator.feature_validator.transform(X_test) return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index cdbf49339..ec0ebdcad 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -205,7 +205,7 @@ def _get_dataset_input_validator( # Create a validator object to make sure that the data provided by # the user matches the autopytorch requirements - InputValidator = TabularInputValidator( + input_validator = TabularInputValidator( is_classification=False, logger_port=self._logger_port, dataset_compression=dataset_compression @@ -214,18 +214,18 @@ def _get_dataset_input_validator( # Fit a input validator to check the provided data # Also, an encoder is fit to both train and test data, # to prevent unseen categories during inference - InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) dataset = TabularDataset( X=X_train, Y=y_train, X_test=X_test, Y_test=y_test, - validator=InputValidator, + validator=input_validator, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, dataset_name=dataset_name ) - return dataset, InputValidator + return dataset, input_validator def search( self, @@ -425,7 +425,7 @@ def search( self._dataset_compression = validate_dataset_compression_arg( self._dataset_compression, memory_limit=memory_limit) - self.dataset, self.InputValidator = self._get_dataset_input_validator( + self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, y_train=y_train, X_test=X_test, @@ -460,14 +460,14 @@ def predict( batch_size: Optional[int] = None, n_jobs: int = 1 ) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. 
Kindly call first " "the estimator search() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + X_test = self.input_validator.feature_validator.transform(X_test) predicted_values = super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) # Allow to predict in the original domain -- that is, the user is not interested # in our encoded values - return self.InputValidator.target_validator.inverse_transform(predicted_values) + return self.input_validator.target_validator.inverse_transform(predicted_values) From 95f1c85856568fed7d1dec7d75cd724e4df9e3c0 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 24 Feb 2022 17:06:23 +0100 Subject: [PATCH 08/12] extract statements to methods --- autoPyTorch/api/tabular_classification.py | 52 ++++++++++++---- autoPyTorch/api/tabular_regression.py | 51 ++++++++++++---- autoPyTorch/data/tabular_feature_validator.py | 60 +++++++++++++------ autoPyTorch/data/utils.py | 22 +++---- test/test_data/test_feature_validator.py | 4 +- 5 files changed, 133 insertions(+), 56 deletions(-) diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 61dc68151..a69471da4 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -12,6 +12,7 @@ ) from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import ( + DatasetCompressionSpec, default_dataset_compression_arg, validate_dataset_compression_arg ) @@ -410,19 +411,7 @@ def search( self """ - self._dataset_compression: Optional[Mapping[str, Any]] - - if isinstance(dataset_compression, bool): - if dataset_compression is True: - self._dataset_compression = default_dataset_compression_arg - else: - self._dataset_compression = None - else: - self._dataset_compression = dataset_compression - - if self._dataset_compression is not None: - self._dataset_compression = validate_dataset_compression_arg( - self._dataset_compression, memory_limit=memory_limit) + self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression) self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, @@ -453,6 +442,43 @@ def search( portfolio_selection=portfolio_selection, ) + def _get_dataset_compression_mapping( + self, + memory_limit: int, + dataset_compression: Union[bool, Mapping[str, Any]] + ) -> Optional[DatasetCompressionSpec]: + """ + Internal function to get value for `self._dataset_compression` + based on the value of `dataset_compression` passed. + + If True, it returns the default_dataset_compression_arg. In case + of a mapping, it is validated and returned as a `DatasetCompressionSpec`. + + If False, it returns None. + + Args: + memory_limit (int): + memory limit of the current search. + dataset_compression (Union[bool, Mapping[str, Any]]): + mapping passed to the `search` function. + + Returns: + Optional[DatasetCompressionSpec]: + Validated data compression spec or None. 
+ """ + dataset_compression_mapping: Optional[Mapping[str, Any]] = None + + if not isinstance(dataset_compression, bool): + dataset_compression_mapping = dataset_compression + elif dataset_compression: + dataset_compression_mapping = default_dataset_compression_arg + + if dataset_compression_mapping is not None: + dataset_compression_mapping = validate_dataset_compression_arg( + dataset_compression_mapping, memory_limit=memory_limit) + + return dataset_compression_mapping + def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index ec0ebdcad..6ea2def0d 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -12,6 +12,7 @@ ) from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import ( + DatasetCompressionSpec, default_dataset_compression_arg, validate_dataset_compression_arg ) @@ -411,19 +412,8 @@ def search( self """ - self._dataset_compression: Optional[Mapping[str, Any]] - if isinstance(dataset_compression, bool): - if dataset_compression is True: - self._dataset_compression = default_dataset_compression_arg - else: - self._dataset_compression = None - else: - self._dataset_compression = dataset_compression - - if self._dataset_compression is not None: - self._dataset_compression = validate_dataset_compression_arg( - self._dataset_compression, memory_limit=memory_limit) + self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression) self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, @@ -454,6 +444,43 @@ def search( portfolio_selection=portfolio_selection, ) + def _get_dataset_compression_mapping( + self, + memory_limit: int, + dataset_compression: Union[bool, Mapping[str, Any]] + ) -> Optional[DatasetCompressionSpec]: + """ + Internal function to get value for `self._dataset_compression` + based on the value of `dataset_compression` passed. + + If True, it returns the default_dataset_compression_arg. In case + of a mapping, it is validated and returned as a `DatasetCompressionSpec`. + + If False, it returns None. + + Args: + memory_limit (int): + memory limit of the current search. + dataset_compression (Union[bool, Mapping[str, Any]]): + mapping passed to the `search` function. + + Returns: + Optional[DatasetCompressionSpec]: + Validated data compression spec or None. 
+ """ + dataset_compression_mapping: Optional[Mapping[str, Any]] = None + + if not isinstance(dataset_compression, bool): + dataset_compression_mapping = dataset_compression + elif dataset_compression: + dataset_compression_mapping = default_dataset_compression_arg + + if dataset_compression_mapping is not None: + dataset_compression_mapping = validate_dataset_compression_arg( + dataset_compression_mapping, memory_limit=memory_limit) + + return dataset_compression_mapping + def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index b9f211283..7da2bd8ed 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -7,7 +7,7 @@ import pandas as pd from pandas.api.types import is_numeric_dtype -import scipy.sparse +from scipy.sparse import issparse, spmatrix import sklearn.utils from sklearn import preprocessing @@ -18,7 +18,11 @@ from sklearn.pipeline import make_pipeline from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes -from autoPyTorch.data.utils import DatasetDTypeContainerType, reduce_dataset_size_if_too_large +from autoPyTorch.data.utils import ( + DatasetCompressionInputType, + DatasetDTypeContainerType, + reduce_dataset_size_if_too_large +) from autoPyTorch.utils.logging_ import PicklableClientLogger @@ -101,7 +105,7 @@ def __init__( dataset_compression: Optional[Mapping[str, Any]] = None, ) -> None: self._dataset_compression = dataset_compression - self._precision: Optional[DatasetDTypeContainerType] = None + self._reduced_dtype: Optional[DatasetDTypeContainerType] = None super().__init__(logger) @staticmethod @@ -151,7 +155,7 @@ def _fit( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if hasattr(X, "iloc") and not scipy.sparse.issparse(X): + if hasattr(X, "iloc") and not issparse(X): X = cast(pd.DataFrame, X) # Treat a column with all instances a NaN as numerical # This will prevent doing encoding to a categorical column made completely @@ -217,7 +221,7 @@ def _fit( def transform( self, X: SupportedFeatTypes, - ) -> np.ndarray: + ) -> Union[np.ndarray, spmatrix, pd.DataFrame]: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. @@ -241,7 +245,7 @@ def transform( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if hasattr(X, "iloc") and not scipy.sparse.issparse(X): + if hasattr(X, "iloc") and not issparse(X): if np.any(pd.isnull(X)): for column in X.columns: if X[column].isna().all(): @@ -268,7 +272,7 @@ def transform( # Sparse related transformations # Not all sparse format support index sorting - if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'): + if issparse(X) and hasattr(X, 'sort_indices'): X.sort_indices() try: @@ -285,20 +289,38 @@ def transform( "numerical or categorical values.") raise e - if ( - ( - isinstance(X, np.ndarray) or scipy.sparse.issparse(X) or hasattr(X, 'iloc') - ) - and self._dataset_compression is not None - ): - if self._precision is not None: - X = X.astype(self._precision) - else: - X = reduce_dataset_size_if_too_large(X, **self._dataset_compression) - self._precision = dict(X.dtypes) if hasattr(X, 'iloc') else X.dtype + X = self._compress_dataset(X) return X + # TODO: modify once we have added subsampling as well. 
+    def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressionInputType:
+        """
+        Compress the dataset. This function ensures that
+        the testing data is converted to the same dtype as
+        the training data.
+
+
+        Args:
+            X (DatasetCompressionInputType):
+                Dataset
+
+        Returns:
+            DatasetCompressionInputType:
+                Compressed dataset.
+        """
+        is_dataframe = hasattr(X, 'iloc')
+        is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
+        if not is_reducible_type or self._dataset_compression is None:
+            return X
+        elif self._reduced_dtype is not None:
+            X = X.astype(self._reduced_dtype)
+            return X
+        else:
+            X = reduce_dataset_size_if_too_large(X, **self._dataset_compression)
+            self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
+            return X
+
     def _check_data(
         self,
         X: SupportedFeatTypes,
@@ -312,7 +334,7 @@ def _check_data(
                 checks) and an encoder fitted in the case the data needs encoding
         """

-        if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X):
+        if not isinstance(X, (np.ndarray, pd.DataFrame)) and not issparse(X):
             raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
                              " scipy sparse and Python Lists, yet, the provided input is"
                              " of type {}".format(type(X))
diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py
index ce69e9d70..a46d55f61 100644
--- a/autoPyTorch/data/utils.py
+++ b/autoPyTorch/data/utils.py
@@ -194,21 +194,21 @@ def reduce_precision(
     For dataframe, the column's precision is reduced using pd.to_numeric.

     Args:
-        X: DatasetCompressionInputType
+        X (DatasetCompressionInputType):
             The data to reduce precision of.

     Returns:
         Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]
            Returns the reduced data X along with the dtypes it had and the dtypes it was reduced to.
""" - precision: Optional[DatasetDTypeContainerType] = None + reduced_dtypes: Optional[DatasetDTypeContainerType] = None if isinstance(X, np.ndarray) or issparse(X): dtypes = X.dtype if X.dtype not in supported_precision_reductions: raise ValueError(f"X.dtype = {X.dtype} not equal to any supported" f" {supported_precision_reductions}") - precision = reduction_mapping[X.dtype] - X = X.astype(precision) + reduced_dtypes = reduction_mapping[X.dtype] + X = X.astype(reduced_dtypes) elif hasattr(X, 'iloc'): dtypes = dict(X.dtypes) @@ -226,15 +226,16 @@ def reduce_precision( X[integer_columns] = X[integer_columns].apply(lambda column: pd.to_numeric(column, downcast='integer')) if len(float_columns) > 0: X[float_columns] = X[float_columns].apply(lambda column: pd.to_numeric(column, downcast='float')) - precision = dict(X.dtypes) + reduced_dtypes = dict(X.dtypes) else: raise ValueError(f"Unrecognised data type of X, expected data type to " f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(X)}") - return X, precision, dtypes + return X, reduced_dtypes, dtypes def megabytes(arr: DatasetCompressionInputType) -> float: + if isinstance(arr, np.ndarray): memory_in_bytes = arr.nbytes elif issparse(arr): @@ -242,7 +243,9 @@ def megabytes(arr: DatasetCompressionInputType) -> float: elif hasattr(arr, 'iloc'): memory_in_bytes = arr.memory_usage(index=True, deep=True).sum() else: - return 0 + raise ValueError(f"Unrecognised data type of X, expected data type to " + f"be in (np.ndarray, spmatrix, pd.DataFrame) but got :{type(arr)}") + return float(memory_in_bytes / (2**20)) @@ -287,17 +290,16 @@ def reduce_dataset_size_if_too_large( The reduced X if reductions were needed """ - precision: Optional[DatasetDTypeContainerType] = None for method in methods: if method == 'precision': # If the dataset is too big for the allocated memory, # we then try to reduce the precision if it's a high precision dataset if megabytes(X) > memory_allocation: - X, precision, dtypes = reduce_precision(X) + X, reduced_dtypes, dtypes = reduce_precision(X) warnings.warn( f'Dataset too large for allocated memory {memory_allocation}MB, ' - f'reduced the precision from {dtypes} to {precision}', + f'reduced the precision from {dtypes} to {reduced_dtypes}', ) else: raise ValueError(f"Unknown operation `{method}`") diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 5eb28309d..3d352d765 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -591,7 +591,7 @@ def test_featurevalidator_reduce_precision(input_data_featuretest): validator.fit(X_train=X_train) transformed_X_train = validator.transform(X_train.copy()) - assert validator._precision is not None + assert validator._reduced_dtype is not None assert megabytes(transformed_X_train) < megabytes(X_train) transformed_X_test = validator.transform(X_test.copy()) @@ -601,4 +601,4 @@ def test_featurevalidator_reduce_precision(input_data_featuretest): assert all(transformed_X_train.dtypes == validator._precision) else: assert transformed_X_train.dtype == transformed_X_test.dtype - assert transformed_X_test.dtype == validator._precision + assert transformed_X_test.dtype == validator._reduced_dtype From a0c9f71a96b5ebb5103ef46ad0d91f8c28114d0f Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 25 Feb 2022 12:48:55 +0100 Subject: [PATCH 09/12] refactor code --- autoPyTorch/api/tabular_classification.py | 43 +----- autoPyTorch/api/tabular_regression.py | 43 +----- autoPyTorch/data/utils.py 
| 155 ++++++++++++++-------- test/test_data/test_utils.py | 33 +++++ 4 files changed, 133 insertions(+), 141 deletions(-) diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index a69471da4..684c22a7b 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -12,9 +12,7 @@ ) from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import ( - DatasetCompressionSpec, - default_dataset_compression_arg, - validate_dataset_compression_arg + get_dataset_compression_mapping ) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( @@ -411,7 +409,7 @@ def search( self """ - self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression) + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, @@ -442,43 +440,6 @@ def search( portfolio_selection=portfolio_selection, ) - def _get_dataset_compression_mapping( - self, - memory_limit: int, - dataset_compression: Union[bool, Mapping[str, Any]] - ) -> Optional[DatasetCompressionSpec]: - """ - Internal function to get value for `self._dataset_compression` - based on the value of `dataset_compression` passed. - - If True, it returns the default_dataset_compression_arg. In case - of a mapping, it is validated and returned as a `DatasetCompressionSpec`. - - If False, it returns None. - - Args: - memory_limit (int): - memory limit of the current search. - dataset_compression (Union[bool, Mapping[str, Any]]): - mapping passed to the `search` function. - - Returns: - Optional[DatasetCompressionSpec]: - Validated data compression spec or None. 
- """ - dataset_compression_mapping: Optional[Mapping[str, Any]] = None - - if not isinstance(dataset_compression, bool): - dataset_compression_mapping = dataset_compression - elif dataset_compression: - dataset_compression_mapping = default_dataset_compression_arg - - if dataset_compression_mapping is not None: - dataset_compression_mapping = validate_dataset_compression_arg( - dataset_compression_mapping, memory_limit=memory_limit) - - return dataset_compression_mapping - def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 6ea2def0d..d766bad68 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -12,9 +12,7 @@ ) from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import ( - DatasetCompressionSpec, - default_dataset_compression_arg, - validate_dataset_compression_arg + get_dataset_compression_mapping ) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( @@ -413,7 +411,7 @@ def search( """ - self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression) + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, @@ -444,43 +442,6 @@ def search( portfolio_selection=portfolio_selection, ) - def _get_dataset_compression_mapping( - self, - memory_limit: int, - dataset_compression: Union[bool, Mapping[str, Any]] - ) -> Optional[DatasetCompressionSpec]: - """ - Internal function to get value for `self._dataset_compression` - based on the value of `dataset_compression` passed. - - If True, it returns the default_dataset_compression_arg. In case - of a mapping, it is validated and returned as a `DatasetCompressionSpec`. - - If False, it returns None. - - Args: - memory_limit (int): - memory limit of the current search. - dataset_compression (Union[bool, Mapping[str, Any]]): - mapping passed to the `search` function. - - Returns: - Optional[DatasetCompressionSpec]: - Validated data compression spec or None. - """ - dataset_compression_mapping: Optional[Mapping[str, Any]] = None - - if not isinstance(dataset_compression, bool): - dataset_compression_mapping = dataset_compression - elif dataset_compression: - dataset_compression_mapping = default_dataset_compression_arg - - if dataset_compression_mapping is not None: - dataset_compression_mapping = validate_dataset_compression_arg( - dataset_compression_mapping, memory_limit=memory_limit) - - return dataset_compression_mapping - def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index a46d55f61..a2d12e85c 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -38,6 +38,43 @@ } +def get_dataset_compression_mapping( + memory_limit: int, + dataset_compression: Union[bool, Mapping[str, Any]] +) -> Optional[DatasetCompressionSpec]: + """ + Internal function to get value for `BaseTask._dataset_compression` + based on the value of `dataset_compression` passed. + + If True, it returns the default_dataset_compression_arg. In case + of a mapping, it is validated and returned as a `DatasetCompressionSpec`. + + If False, it returns None. + + Args: + memory_limit (int): + memory limit of the current search. 
+ dataset_compression (Union[bool, Mapping[str, Any]]): + mapping passed to the `search` function. + + Returns: + Optional[DatasetCompressionSpec]: + Validated data compression spec or None. + """ + dataset_compression_mapping: Optional[Mapping[str, Any]] = None + + if not isinstance(dataset_compression, bool): + dataset_compression_mapping = dataset_compression + elif dataset_compression: + dataset_compression_mapping = default_dataset_compression_arg + + if dataset_compression_mapping is not None: + dataset_compression_mapping = validate_dataset_compression_arg( + dataset_compression_mapping, memory_limit=memory_limit) + + return dataset_compression_mapping + + def validate_dataset_compression_arg( dataset_compression: Mapping[str, Any], memory_limit: int @@ -54,78 +91,78 @@ def validate_dataset_compression_arg( DatasetCompressionSpec The validated and correct dataset compression spec """ - if isinstance(dataset_compression, Mapping): - # Fill with defaults if they don't exist - dataset_compression = { - **default_dataset_compression_arg, - **dataset_compression - } - - # Must contain known keys - if set(dataset_compression.keys()) != set(default_dataset_compression_arg.keys()): - raise ValueError( - f"Unknown key in dataset_compression, {list(dataset_compression.keys())}." - f"\nPossible keys are {list(default_dataset_compression_arg.keys())}" - ) + if not isinstance(dataset_compression, Mapping): + raise ValueError( + f"Unknown type for `dataset_compression` {type(dataset_compression)}" + f"\ndataset_compression = {dataset_compression}" + ) - memory_allocation = dataset_compression["memory_allocation"] + # Fill with defaults if they don't exist + dataset_compression = { + **default_dataset_compression_arg, + **dataset_compression + } - # "memory_allocation" must be float or int - if not (isinstance(memory_allocation, float) or isinstance(memory_allocation, int)): - raise ValueError( - "key 'memory_allocation' must be an `int` or `float`" - f"\ntype = {memory_allocation}" - f"\ndataset_compression = {dataset_compression}" - ) + # Must contain known keys + if set(dataset_compression.keys()) != set(default_dataset_compression_arg.keys()): + raise ValueError( + f"Unknown key in dataset_compression, {list(dataset_compression.keys())}." 
+ f"\nPossible keys are {list(default_dataset_compression_arg.keys())}" + ) - # "memory_allocation" if absolute, should be > 0 and < memory_limit - if isinstance(memory_allocation, int) and not (0 < memory_allocation < memory_limit): - raise ValueError( - f"key 'memory_allocation' if int must be in (0, memory_limit={memory_limit})" - f"\nmemory_allocation = {memory_allocation}" - f"\ndataset_compression = {dataset_compression}" - ) + memory_allocation = dataset_compression["memory_allocation"] - # "memory_allocation" must be in (0,1) if float - if isinstance(memory_allocation, float): - if not (0.0 < memory_allocation < 1.0): - raise ValueError( - "key 'memory_allocation' if float must be in (0, 1)" - f"\nmemory_allocation = {memory_allocation}" - f"\ndataset_compression = {dataset_compression}" - ) - # convert to int so we can directly use - dataset_compression["memory_allocation"] = floor(memory_allocation * memory_limit) - - # "methods" must be non-empty sequence - if ( - not isinstance(dataset_compression["methods"], Sequence) - or len(dataset_compression["methods"]) <= 0 - ): - raise ValueError( - "key 'methods' must be a non-empty list" - f"\nmethods = {dataset_compression['methods']}" - f"\ndataset_compression = {dataset_compression}" - ) + # "memory_allocation" must be float or int + if not (isinstance(memory_allocation, float) or isinstance(memory_allocation, int)): + raise ValueError( + "key 'memory_allocation' must be an `int` or `float`" + f"\ntype = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) - # "methods" must contain known methods - if any( - method not in cast(Sequence, default_dataset_compression_arg["methods"]) # mypy - for method in dataset_compression["methods"] - ): + # "memory_allocation" if absolute, should be > 0 and < memory_limit + if isinstance(memory_allocation, int) and not (0 < memory_allocation < memory_limit): + raise ValueError( + f"key 'memory_allocation' if int must be in (0, memory_limit={memory_limit})" + f"\nmemory_allocation = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "memory_allocation" must be in (0,1) if float + if isinstance(memory_allocation, float): + if not (0.0 < memory_allocation < 1.0): raise ValueError( - f"key 'methods' can only contain {default_dataset_compression_arg['methods']}" - f"\nmethods = {dataset_compression['methods']}" + "key 'memory_allocation' if float must be in (0, 1)" + f"\nmemory_allocation = {memory_allocation}" f"\ndataset_compression = {dataset_compression}" ) + # convert to int so we can directly use + dataset_compression["memory_allocation"] = floor(memory_allocation * memory_limit) + + # "methods" must be non-empty sequence + if ( + not isinstance(dataset_compression["methods"], Sequence) + or len(dataset_compression["methods"]) <= 0 + ): + raise ValueError( + "key 'methods' must be a non-empty list" + f"\nmethods = {dataset_compression['methods']}" + f"\ndataset_compression = {dataset_compression}" + ) - return cast(DatasetCompressionSpec, dataset_compression) - else: + # "methods" must contain known methods + if any( + method not in cast(Sequence, default_dataset_compression_arg["methods"]) # mypy + for method in dataset_compression["methods"] + ): raise ValueError( - f"Unknown type for `dataset_compression` {type(dataset_compression)}" + f"key 'methods' can only contain {default_dataset_compression_arg['methods']}" + f"\nmethods = {dataset_compression['methods']}" f"\ndataset_compression = {dataset_compression}" ) + return 
cast(DatasetCompressionSpec, dataset_compression) + class _DtypeReductionMapping(Mapping): """ diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index 7f47469ef..a598b139f 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -1,3 +1,5 @@ +from tkinter.tix import Tree +from typing import Mapping import numpy as np from pandas.testing import assert_frame_equal @@ -7,6 +9,8 @@ from sklearn.datasets import fetch_openml from autoPyTorch.data.utils import ( + DatasetCompressionSpec, + get_dataset_compression_mapping, megabytes, reduce_dataset_size_if_too_large, reduce_precision, @@ -72,3 +76,32 @@ def test_error_raised_reduce_precision(): # in case X is not an expected type with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to .*'): reduce_precision(X='not expected') + + +def _verify_dataset_compression_mapping(mapping): + assert isinstance(mapping, Mapping) + assert 'methods' in mapping + assert 'memory_allocation' in mapping + + +@pytest.mark.parametrize('memory_limit', [2048]) +def test_get_dataset_compression_mapping(memory_limit): + """ + Tests the functionalities of `get_dataset_compression_mapping` + """ + dataset_compression_mapping = get_dataset_compression_mapping( + dataset_compression=True, + memory_limit=memory_limit) + _verify_dataset_compression_mapping(dataset_compression_mapping) + + dataset_compression_mapping = get_dataset_compression_mapping( + dataset_compression={'memory_allocation': 0.01, 'methods': ['precision']}, + memory_limit=memory_limit + ) + _verify_dataset_compression_mapping(dataset_compression_mapping) + + dataset_compression_mapping = get_dataset_compression_mapping( + dataset_compression=False, + memory_limit=memory_limit + ) + assert dataset_compression_mapping is None From a67ac2ae230194dad83b3d0ae6dea3cafdf1c4c0 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 25 Feb 2022 13:25:33 +0100 Subject: [PATCH 10/12] check if mapping is the same as expected --- test/test_data/test_utils.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index a598b139f..c26ad9fb3 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -1,5 +1,5 @@ -from tkinter.tix import Tree from typing import Mapping + import numpy as np from pandas.testing import assert_frame_equal @@ -9,7 +9,7 @@ from sklearn.datasets import fetch_openml from autoPyTorch.data.utils import ( - DatasetCompressionSpec, + default_dataset_compression_arg, get_dataset_compression_mapping, megabytes, reduce_dataset_size_if_too_large, @@ -78,10 +78,11 @@ def test_error_raised_reduce_precision(): reduce_precision(X='not expected') -def _verify_dataset_compression_mapping(mapping): +def _verify_dataset_compression_mapping(mapping, expected_mapping): assert isinstance(mapping, Mapping) assert 'methods' in mapping assert 'memory_allocation' in mapping + assert mapping == expected_mapping @pytest.mark.parametrize('memory_limit', [2048]) @@ -92,13 +93,17 @@ def test_get_dataset_compression_mapping(memory_limit): dataset_compression_mapping = get_dataset_compression_mapping( dataset_compression=True, memory_limit=memory_limit) - _verify_dataset_compression_mapping(dataset_compression_mapping) + # validation converts the memory allocation from float to integer based on the memory limit + expected_mapping = validate_dataset_compression_arg(default_dataset_compression_arg, memory_limit) + 
_verify_dataset_compression_mapping(dataset_compression_mapping, expected_mapping) + mapping = {'memory_allocation': 0.01, 'methods': ['precision']} dataset_compression_mapping = get_dataset_compression_mapping( - dataset_compression={'memory_allocation': 0.01, 'methods': ['precision']}, + dataset_compression=mapping, memory_limit=memory_limit ) - _verify_dataset_compression_mapping(dataset_compression_mapping) + expected_mapping = validate_dataset_compression_arg(mapping, memory_limit) + _verify_dataset_compression_mapping(dataset_compression_mapping, expected_mapping) dataset_compression_mapping = get_dataset_compression_mapping( dataset_compression=False, From 687a74a3e2176a8bc90e17435132c273765caf5a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 25 Feb 2022 14:42:24 +0100 Subject: [PATCH 11/12] update precision reduction for dataframes and tests --- autoPyTorch/data/utils.py | 19 ++++++------------- test/test_data/test_utils.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index a2d12e85c..43dacf543 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -18,7 +18,6 @@ import numpy as np import pandas as pd -from pandas.api.types import is_float_dtype, is_numeric_dtype from scipy.sparse import issparse, spmatrix @@ -246,23 +245,17 @@ def reduce_precision( f" {supported_precision_reductions}") reduced_dtypes = reduction_mapping[X.dtype] X = X.astype(reduced_dtypes) + elif hasattr(X, 'iloc'): dtypes = dict(X.dtypes) - integer_columns = [] - float_columns = [] + col_names = X.dtypes.index - for col, dtype in dtypes.items(): - if is_numeric_dtype(dtype): - if is_float_dtype(dtype): - float_columns.append(col) - else: - integer_columns.append(col) + float_cols = col_names[[dt.name.startswith("float") for dt in X.dtypes.values]] + int_cols = col_names[[dt.name.startswith("int") for dt in X.dtypes.values]] + X[int_cols] = X[int_cols].apply(lambda column: pd.to_numeric(column, downcast='integer')) + X[float_cols] = X[float_cols].apply(lambda column: pd.to_numeric(column, downcast='float')) - if len(integer_columns) > 0: - X[integer_columns] = X[integer_columns].apply(lambda column: pd.to_numeric(column, downcast='integer')) - if len(float_columns) > 0: - X[float_columns] = X[float_columns].apply(lambda column: pd.to_numeric(column, downcast='float')) reduced_dtypes = dict(X.dtypes) else: raise ValueError(f"Unrecognised data type of X, expected data type to " diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index c26ad9fb3..ce584197b 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -110,3 +110,18 @@ def test_get_dataset_compression_mapping(memory_limit): memory_limit=memory_limit ) assert dataset_compression_mapping is None + + +def test_unsupported_errors(): + """ + Checks if errors are raised when unsupported data is passed to reduce + """ + X = np.array([ + ['a', 'b', 'c', 'a', 'b', 'c'], + ['a', 'b', 'd', 'r', 'b', 'c']]) + with pytest.raises(ValueError, match=r'X.dtype = .*'): + reduce_dataset_size_if_too_large(X, 0) + + X = [[1, 2], [2, 3]] + with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'): + reduce_dataset_size_if_too_large(X, 0) \ No newline at end of file From b10465142f0c1aa82d9737cbe4725954f1d57dfe Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 25 Feb 2022 14:52:20 +0100 Subject: [PATCH 12/12] fix flake --- test/test_data/test_utils.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index ce584197b..505860a94 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -124,4 +124,4 @@ def test_unsupported_errors(): X = [[1, 2], [2, 3]] with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'): - reduce_dataset_size_if_too_large(X, 0) \ No newline at end of file + reduce_dataset_size_if_too_large(X, 0)
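
For reference, a minimal sketch of how the utilities introduced in autoPyTorch/data/utils.py fit together; it is not taken from the patches above, and the memory limit, array shape, and the float64-to-float32 downcast target are illustrative assumptions rather than values read from the diff:

import numpy as np

from autoPyTorch.data.utils import (
    get_dataset_compression_mapping,
    megabytes,
    reduce_dataset_size_if_too_large,
)

# Resolve the user-facing `dataset_compression` argument into a validated spec.
# A float 'memory_allocation' is converted into an absolute budget in MB,
# here floor(0.05 * 2048) = 102 MB; 'methods' may only contain known methods.
spec = get_dataset_compression_mapping(
    memory_limit=2048,
    dataset_compression={'memory_allocation': 0.05, 'methods': ['precision']},
)

# ~152 MB of float64 data exceeds the 102 MB budget, so the 'precision'
# method reduces it (float64 -> float32 is the assumed reduction target)
# and a warning describing the reduction is emitted.
X = np.ones((200_000, 100), dtype=np.float64)
X_reduced = reduce_dataset_size_if_too_large(X, **spec)

print(X_reduced.dtype)       # float32, under the assumed mapping
print(megabytes(X_reduced))  # roughly half of megabytes(X)

The same resolved spec is what search(..., dataset_compression=True) obtains via get_dataset_compression_mapping and forwards to TabularInputValidator, so that TabularFeatureValidator._compress_dataset can re-apply the fitted dtypes when transforming the test split.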