[ADD] dataset compression #387

Merged: 12 commits, Feb 25, 2022
52 changes: 39 additions & 13 deletions autoPyTorch/api/tabular_classification.py
@@ -12,6 +12,7 @@
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
DatasetCompressionSpec,
default_dataset_compression_arg,
validate_dataset_compression_arg
)
@@ -410,19 +411,7 @@ def search(
self

"""
self._dataset_compression: Optional[Mapping[str, Any]]

if isinstance(dataset_compression, bool):
if dataset_compression is True:
self._dataset_compression = default_dataset_compression_arg
else:
self._dataset_compression = None
else:
self._dataset_compression = dataset_compression

if self._dataset_compression is not None:
self._dataset_compression = validate_dataset_compression_arg(
self._dataset_compression, memory_limit=memory_limit)
self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression)

self.dataset, self.input_validator = self._get_dataset_input_validator(
X_train=X_train,
@@ -453,6 +442,43 @@ def search(
portfolio_selection=portfolio_selection,
)

def _get_dataset_compression_mapping(
self,
memory_limit: int,
dataset_compression: Union[bool, Mapping[str, Any]]
) -> Optional[DatasetCompressionSpec]:
"""
Internal function to get the value for `self._dataset_compression`,
based on the `dataset_compression` argument passed to `search`.

If True, it returns the default_dataset_compression_arg. In case
of a mapping, it is validated and returned as a `DatasetCompressionSpec`.

If False, it returns None.

Args:
memory_limit (int):
memory limit of the current search.
dataset_compression (Union[bool, Mapping[str, Any]]):
bool or mapping passed to the `search` function.

Returns:
Optional[DatasetCompressionSpec]:
Validated data compression spec or None.
"""
dataset_compression_mapping: Optional[Mapping[str, Any]] = None

if not isinstance(dataset_compression, bool):
dataset_compression_mapping = dataset_compression
elif dataset_compression:
dataset_compression_mapping = default_dataset_compression_arg

if dataset_compression_mapping is not None:
dataset_compression_mapping = validate_dataset_compression_arg(
dataset_compression_mapping, memory_limit=memory_limit)

return dataset_compression_mapping

def predict(
self,
X_test: np.ndarray,
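
For context, a minimal usage sketch of the new `dataset_compression` argument on the classification side. It assumes the public TabularClassificationTask/search API (`optimize_metric`, `memory_limit`, `total_walltime_limit`) and a toy sklearn dataset, so treat it as an illustration rather than code from this PR.

import numpy as np
from sklearn.datasets import make_classification

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Toy, deliberately high-precision data so that precision reduction can trigger.
X, y = make_classification(n_samples=1000, n_features=20, random_state=1)
X = X.astype(np.float64)

api = TabularClassificationTask()

# dataset_compression=True  -> use default_dataset_compression_arg
# dataset_compression=False -> no compression
# dataset_compression={...} -> custom spec, validated against memory_limit
api.search(
    X_train=X,
    y_train=y,
    optimize_metric='accuracy',
    memory_limit=4096,
    total_walltime_limit=60,
    dataset_compression=True,
)
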
51 changes: 39 additions & 12 deletions autoPyTorch/api/tabular_regression.py
@@ -12,6 +12,7 @@
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
DatasetCompressionSpec,
default_dataset_compression_arg,
validate_dataset_compression_arg
)
@@ -411,19 +412,8 @@ def search(
self

"""
self._dataset_compression: Optional[Mapping[str, Any]]

if isinstance(dataset_compression, bool):
if dataset_compression is True:
self._dataset_compression = default_dataset_compression_arg
else:
self._dataset_compression = None
else:
self._dataset_compression = dataset_compression

if self._dataset_compression is not None:
self._dataset_compression = validate_dataset_compression_arg(
self._dataset_compression, memory_limit=memory_limit)
self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression)

self.dataset, self.input_validator = self._get_dataset_input_validator(
X_train=X_train,
@@ -454,6 +444,43 @@ def search(
portfolio_selection=portfolio_selection,
)

def _get_dataset_compression_mapping(
self,
memory_limit: int,
dataset_compression: Union[bool, Mapping[str, Any]]
) -> Optional[DatasetCompressionSpec]:
"""
Internal function to get the value for `self._dataset_compression`,
based on the `dataset_compression` argument passed to `search`.

If True, it returns the default_dataset_compression_arg. In case
of a mapping, it is validated and returned as a `DatasetCompressionSpec`.

If False, it returns None.

Args:
memory_limit (int):
memory limit of the current search.
dataset_compression (Union[bool, Mapping[str, Any]]):
bool or mapping passed to the `search` function.

Returns:
Optional[DatasetCompressionSpec]:
Validated data compression spec or None.
"""
dataset_compression_mapping: Optional[Mapping[str, Any]] = None

if not isinstance(dataset_compression, bool):
dataset_compression_mapping = dataset_compression
elif dataset_compression:
dataset_compression_mapping = default_dataset_compression_arg

if dataset_compression_mapping is not None:
dataset_compression_mapping = validate_dataset_compression_arg(
dataset_compression_mapping, memory_limit=memory_limit)

return dataset_compression_mapping

def predict(
self,
X_test: np.ndarray,
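
Since the same helper is added to both API classes, here is a self-contained sketch of just its branch logic: True selects the defaults, False disables compression, and a mapping is validated against the memory limit. `DEFAULT_SPEC` and `validate_spec` below are simplified, hypothetical stand-ins for `default_dataset_compression_arg` and `validate_dataset_compression_arg`, not autoPyTorch's real implementations.

from typing import Any, Mapping, Optional, Union

DEFAULT_SPEC: Mapping[str, Any] = {"memory_allocation": 0.1, "methods": ["precision"]}


def validate_spec(spec: Mapping[str, Any], memory_limit: int) -> Mapping[str, Any]:
    # Stand-in validator: turn a fractional memory_allocation into an absolute MB budget.
    spec = dict(spec)
    if isinstance(spec["memory_allocation"], float):
        spec["memory_allocation"] = spec["memory_allocation"] * memory_limit
    return spec


def get_compression_mapping(
    memory_limit: int,
    dataset_compression: Union[bool, Mapping[str, Any]],
) -> Optional[Mapping[str, Any]]:
    mapping: Optional[Mapping[str, Any]] = None
    if not isinstance(dataset_compression, bool):
        mapping = dataset_compression          # user-provided spec
    elif dataset_compression:
        mapping = DEFAULT_SPEC                 # True -> defaults
    # False -> stays None
    return validate_spec(mapping, memory_limit) if mapping is not None else None


assert get_compression_mapping(4096, False) is None
assert get_compression_mapping(4096, True)["memory_allocation"] == 0.1 * 4096
assert get_compression_mapping(2048, {"memory_allocation": 0.5, "methods": ["precision"]})["memory_allocation"] == 1024
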
60 changes: 41 additions & 19 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -7,7 +7,7 @@
import pandas as pd
from pandas.api.types import is_numeric_dtype

import scipy.sparse
from scipy.sparse import issparse, spmatrix

import sklearn.utils
from sklearn import preprocessing
@@ -18,7 +18,11 @@
from sklearn.pipeline import make_pipeline

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
from autoPyTorch.data.utils import DatasetDTypeContainerType, reduce_dataset_size_if_too_large
from autoPyTorch.data.utils import (
DatasetCompressionInputType,
DatasetDTypeContainerType,
reduce_dataset_size_if_too_large
)
from autoPyTorch.utils.logging_ import PicklableClientLogger


@@ -101,7 +105,7 @@ def __init__(
dataset_compression: Optional[Mapping[str, Any]] = None,
) -> None:
self._dataset_compression = dataset_compression
self._precision: Optional[DatasetDTypeContainerType] = None
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
super().__init__(logger)

@staticmethod
@@ -151,7 +155,7 @@ def _fit(
if isinstance(X, np.ndarray):
X = self.numpy_array_to_pandas(X)

if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
if hasattr(X, "iloc") and not issparse(X):
X = cast(pd.DataFrame, X)
# Treat a column with all instances a NaN as numerical
# This will prevent doing encoding to a categorical column made completely
@@ -217,7 +221,7 @@
def transform(
self,
X: SupportedFeatTypes,
) -> np.ndarray:
) -> Union[np.ndarray, spmatrix, pd.DataFrame]:
"""
Validates and fit a categorical encoder (if needed) to the features.
The supported data types are List, numpy arrays and pandas DataFrames.
@@ -241,7 +245,7 @@ def transform(
if isinstance(X, np.ndarray):
X = self.numpy_array_to_pandas(X)

if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
if hasattr(X, "iloc") and not issparse(X):
if np.any(pd.isnull(X)):
for column in X.columns:
if X[column].isna().all():
@@ -268,7 +272,7 @@

# Sparse related transformations
# Not all sparse format support index sorting
if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'):
if issparse(X) and hasattr(X, 'sort_indices'):
X.sort_indices()

try:
@@ -285,20 +289,38 @@
"numerical or categorical values.")
raise e

if (
(
isinstance(X, np.ndarray) or scipy.sparse.issparse(X) or hasattr(X, 'iloc')
)
and self._dataset_compression is not None
):
if self._precision is not None:
X = X.astype(self._precision)
else:
X = reduce_dataset_size_if_too_large(X, **self._dataset_compression)
self._precision = dict(X.dtypes) if hasattr(X, 'iloc') else X.dtype
X = self._compress_dataset(X)

return X

# TODO: modify once we have added subsampling as well.
def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressionInputType:
"""
Compress the dataset. This function ensures that
the testing data is converted to the same dtype as
the training data.


Args:
X (DatasetCompressionInputType):
Dataset

Returns:
DatasetCompressionInputType:
Compressed dataset.
"""
is_dataframe = hasattr(X, 'iloc')
is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
if not is_reducible_type or self._dataset_compression is None:
return X
elif self._reduced_dtype is not None:
X = X.astype(self._reduced_dtype)
return X
else:
X = reduce_dataset_size_if_too_large(X, **self._dataset_compression)
self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
return X

def _check_data(
self,
X: SupportedFeatTypes,
@@ -312,7 +334,7 @@ def _check_data(
checks) and an encoder fitted in the case the data needs encoding
"""

if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X):
if not isinstance(X, (np.ndarray, pd.DataFrame)) and not issparse(X):
raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
" scipy sparse and Python Lists, yet, the provided input is"
" of type {}".format(type(X))
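
The point of `_compress_dataset` is that the dtype chosen while reducing the training data is cached in `self._reduced_dtype` and reused on every later `transform` call, so test data always ends up with the same dtype as the training data. A minimal, self-contained sketch of that pattern follows; `reduce_if_too_large` is a simplified stand-in for `reduce_dataset_size_if_too_large`, not the real implementation.

from typing import Optional

import numpy as np


class DtypeCachingReducer:
    def __init__(self, memory_allocation_mb: float) -> None:
        self.memory_allocation_mb = memory_allocation_mb
        self.reduced_dtype: Optional[np.dtype] = None

    def reduce_if_too_large(self, X: np.ndarray) -> np.ndarray:
        # Stand-in reduction: downcast float64 only when the array exceeds the budget.
        if X.nbytes / 2**20 > self.memory_allocation_mb and X.dtype == np.float64:
            X = X.astype(np.float32)
        return X

    def compress(self, X: np.ndarray) -> np.ndarray:
        if self.reduced_dtype is not None:
            return X.astype(self.reduced_dtype)   # test path: reuse the training dtype
        X = self.reduce_if_too_large(X)           # training path: reduce once
        self.reduced_dtype = X.dtype              # cache whatever dtype came out
        return X


reducer = DtypeCachingReducer(memory_allocation_mb=1.0)
X_train = np.random.rand(500_000, 4)              # ~15 MB of float64, gets reduced
X_test = np.random.rand(100, 4)                   # small, but must match the train dtype
assert reducer.compress(X_train).dtype == np.float32
assert reducer.compress(X_test).dtype == np.float32
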
22 changes: 12 additions & 10 deletions autoPyTorch/data/utils.py
@@ -194,21 +194,21 @@ def reduce_precision(
For dataframes, each column's precision is reduced using pd.to_numeric.

Args:
X: DatasetCompressionInputType
X (DatasetCompressionInputType):
The data to reduce precision of.

Returns:
Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]
Returns the reduced data X, the dtypes it was reduced to, and the dtypes it originally had.
"""
precision: Optional[DatasetDTypeContainerType] = None
reduced_dtypes: Optional[DatasetDTypeContainerType] = None
if isinstance(X, np.ndarray) or issparse(X):
dtypes = X.dtype
if X.dtype not in supported_precision_reductions:
raise ValueError(f"X.dtype = {X.dtype} not equal to any supported"
f" {supported_precision_reductions}")
precision = reduction_mapping[X.dtype]
X = X.astype(precision)
reduced_dtypes = reduction_mapping[X.dtype]
X = X.astype(reduced_dtypes)
elif hasattr(X, 'iloc'):
dtypes = dict(X.dtypes)

@@ -226,23 +226,26 @@
X[integer_columns] = X[integer_columns].apply(lambda column: pd.to_numeric(column, downcast='integer'))
if len(float_columns) > 0:
X[float_columns] = X[float_columns].apply(lambda column: pd.to_numeric(column, downcast='float'))
precision = dict(X.dtypes)
reduced_dtypes = dict(X.dtypes)
else:
raise ValueError(f"Unrecognised data type of X, expected data type to "
f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(X)}")

return X, precision, dtypes
return X, reduced_dtypes, dtypes


def megabytes(arr: DatasetCompressionInputType) -> float:

if isinstance(arr, np.ndarray):
memory_in_bytes = arr.nbytes
elif issparse(arr):
memory_in_bytes = arr.data.nbytes
elif hasattr(arr, 'iloc'):
memory_in_bytes = arr.memory_usage(index=True, deep=True).sum()
else:
return 0
raise ValueError(f"Unrecognised data type of X, expected data type to "
f"be in (np.ndarray, spmatrix, pd.DataFrame) but got :{type(arr)}")

return float(memory_in_bytes / (2**20))


@@ -287,17 +290,16 @@ def reduce_dataset_size_if_too_large(
The reduced X if reductions were needed
"""

precision: Optional[DatasetDTypeContainerType] = None
for method in methods:

if method == 'precision':
# If the dataset is too big for the allocated memory,
# we then try to reduce the precision if it's a high precision dataset
if megabytes(X) > memory_allocation:
X, precision, dtypes = reduce_precision(X)
X, reduced_dtypes, dtypes = reduce_precision(X)
warnings.warn(
f'Dataset too large for allocated memory {memory_allocation}MB, '
f'reduced the precision from {dtypes} to {precision}',
f'reduced the precision from {dtypes} to {reduced_dtypes}',
)
else:
raise ValueError(f"Unknown operation `{method}`")
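
To illustrate what the `precision` method does end to end: measure the size in MB and, only if it exceeds the allocation, downcast the high-precision columns. The sketch below mirrors the helpers in the diff but `megabytes_of` and `reduce_precision_of` are simplified reimplementations for illustration, not the library's own functions, and they skip the sparse-matrix branch.

import numpy as np
import pandas as pd


def megabytes_of(x) -> float:
    # Size of an ndarray or DataFrame in MB (sparse matrices omitted for brevity).
    if isinstance(x, np.ndarray):
        nbytes = x.nbytes
    elif hasattr(x, "iloc"):
        nbytes = x.memory_usage(index=True, deep=True).sum()
    else:
        raise ValueError(f"Unsupported type: {type(x)}")
    return float(nbytes / 2**20)


def reduce_precision_of(df: pd.DataFrame) -> pd.DataFrame:
    # Downcast integer and float columns with pd.to_numeric, as in the diff.
    df = df.copy()
    ints = df.select_dtypes(include="integer").columns
    floats = df.select_dtypes(include="float").columns
    df[ints] = df[ints].apply(lambda c: pd.to_numeric(c, downcast="integer"))
    df[floats] = df[floats].apply(lambda c: pd.to_numeric(c, downcast="float"))
    return df


df = pd.DataFrame({
    "a": np.random.rand(200_000),             # float64 -> float32
    "b": np.random.randint(0, 100, 200_000),  # int64 -> int8
})
memory_allocation = 1.0  # MB, illustrative budget
if megabytes_of(df) > memory_allocation:
    df = reduce_precision_of(df)
print(dict(df.dtypes))  # e.g. {'a': float32, 'b': int8}
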
4 changes: 2 additions & 2 deletions test/test_data/test_feature_validator.py
@@ -591,7 +591,7 @@ def test_featurevalidator_reduce_precision(input_data_featuretest):
validator.fit(X_train=X_train)
transformed_X_train = validator.transform(X_train.copy())

assert validator._precision is not None
assert validator._reduced_dtype is not None
assert megabytes(transformed_X_train) < megabytes(X_train)

transformed_X_test = validator.transform(X_test.copy())
@@ -601,4 +601,4 @@
assert all(transformed_X_train.dtypes == validator._precision)
else:
assert transformed_X_train.dtype == transformed_X_test.dtype
assert transformed_X_test.dtype == validator._precision
assert transformed_X_test.dtype == validator._reduced_dtype