[ADD] dataset compression #387

Merged: 12 commits, Feb 25, 2022
52 changes: 39 additions & 13 deletions autoPyTorch/api/tabular_classification.py
@@ -12,6 +12,7 @@
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
DatasetCompressionSpec,
default_dataset_compression_arg,
validate_dataset_compression_arg
)
@@ -410,19 +411,7 @@ def search(
self

"""
self._dataset_compression: Optional[Mapping[str, Any]]

if isinstance(dataset_compression, bool):
if dataset_compression is True:
self._dataset_compression = default_dataset_compression_arg
else:
self._dataset_compression = None
else:
self._dataset_compression = dataset_compression

if self._dataset_compression is not None:
self._dataset_compression = validate_dataset_compression_arg(
self._dataset_compression, memory_limit=memory_limit)
self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression)

self.dataset, self.input_validator = self._get_dataset_input_validator(
X_train=X_train,
@@ -453,6 +442,43 @@ def search(
portfolio_selection=portfolio_selection,
)

def _get_dataset_compression_mapping(
self,
memory_limit: int,
dataset_compression: Union[bool, Mapping[str, Any]]
) -> Optional[DatasetCompressionSpec]:
"""
Internal function to get the value for `self._dataset_compression`,
based on the `dataset_compression` argument passed to `search`.

If True, it returns the default_dataset_compression_arg. In case
of a mapping, it is validated and returned as a `DatasetCompressionSpec`.

If False, it returns None.

Args:
memory_limit (int):
memory limit of the current search.
dataset_compression (Union[bool, Mapping[str, Any]]):
bool or mapping passed to the `search` function.

Returns:
Optional[DatasetCompressionSpec]:
Validated data compression spec or None.
"""
dataset_compression_mapping: Optional[Mapping[str, Any]] = None

if not isinstance(dataset_compression, bool):
dataset_compression_mapping = dataset_compression
elif dataset_compression:
dataset_compression_mapping = default_dataset_compression_arg

if dataset_compression_mapping is not None:
dataset_compression_mapping = validate_dataset_compression_arg(
dataset_compression_mapping, memory_limit=memory_limit)

return dataset_compression_mapping

def predict(
self,
X_test: np.ndarray,
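
For context, a minimal usage sketch of the new `dataset_compression` argument on the classification side. It assumes the public TabularClassificationTask/search API (`optimize_metric`, `memory_limit`, `total_walltime_limit`) and a toy sklearn dataset, so treat it as an illustration rather than code from this PR.

import numpy as np
from sklearn.datasets import make_classification

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Toy, deliberately high-precision data so that precision reduction can trigger.
X, y = make_classification(n_samples=1000, n_features=20, random_state=1)
X = X.astype(np.float64)

api = TabularClassificationTask()

# dataset_compression=True  -> use default_dataset_compression_arg
# dataset_compression=False -> no compression
# dataset_compression={...} -> custom spec, validated against memory_limit
api.search(
    X_train=X,
    y_train=y,
    optimize_metric='accuracy',
    memory_limit=4096,
    total_walltime_limit=60,
    dataset_compression=True,
)
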
51 changes: 39 additions & 12 deletions autoPyTorch/api/tabular_regression.py
@@ -12,6 +12,7 @@
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.data.utils import (
DatasetCompressionSpec,
default_dataset_compression_arg,
validate_dataset_compression_arg
)
@@ -411,19 +412,8 @@ def search(
self

"""
self._dataset_compression: Optional[Mapping[str, Any]]

if isinstance(dataset_compression, bool):
if dataset_compression is True:
self._dataset_compression = default_dataset_compression_arg
else:
self._dataset_compression = None
else:
self._dataset_compression = dataset_compression

if self._dataset_compression is not None:
self._dataset_compression = validate_dataset_compression_arg(
self._dataset_compression, memory_limit=memory_limit)
self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression)

self.dataset, self.input_validator = self._get_dataset_input_validator(
X_train=X_train,
@@ -454,6 +444,43 @@ def search(
portfolio_selection=portfolio_selection,
)

def _get_dataset_compression_mapping(
self,
memory_limit: int,
dataset_compression: Union[bool, Mapping[str, Any]]
) -> Optional[DatasetCompressionSpec]:
"""
Internal function to get the value for `self._dataset_compression`,
based on the `dataset_compression` argument passed to `search`.

If True, it returns the default_dataset_compression_arg. In case
of a mapping, it is validated and returned as a `DatasetCompressionSpec`.

If False, it returns None.

Args:
memory_limit (int):
memory limit of the current search.
dataset_compression (Union[bool, Mapping[str, Any]]):
bool or mapping passed to the `search` function.

Returns:
Optional[DatasetCompressionSpec]:
Validated data compression spec or None.
"""
dataset_compression_mapping: Optional[Mapping[str, Any]] = None

if not isinstance(dataset_compression, bool):
dataset_compression_mapping = dataset_compression
elif dataset_compression:
dataset_compression_mapping = default_dataset_compression_arg

if dataset_compression_mapping is not None:
dataset_compression_mapping = validate_dataset_compression_arg(
dataset_compression_mapping, memory_limit=memory_limit)

return dataset_compression_mapping

def predict(
self,
X_test: np.ndarray,
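
Since the same helper is added to both API classes, here is a self-contained sketch of just its branch logic: True selects the defaults, False disables compression, and a mapping is validated against the memory limit. `DEFAULT_SPEC` and `validate_spec` below are simplified, hypothetical stand-ins for `default_dataset_compression_arg` and `validate_dataset_compression_arg`, not autoPyTorch's real implementations.

from typing import Any, Mapping, Optional, Union

DEFAULT_SPEC: Mapping[str, Any] = {"memory_allocation": 0.1, "methods": ["precision"]}


def validate_spec(spec: Mapping[str, Any], memory_limit: int) -> Mapping[str, Any]:
    # Stand-in validator: turn a fractional memory_allocation into an absolute MB budget.
    spec = dict(spec)
    if isinstance(spec["memory_allocation"], float):
        spec["memory_allocation"] = spec["memory_allocation"] * memory_limit
    return spec


def get_compression_mapping(
    memory_limit: int,
    dataset_compression: Union[bool, Mapping[str, Any]],
) -> Optional[Mapping[str, Any]]:
    mapping: Optional[Mapping[str, Any]] = None
    if not isinstance(dataset_compression, bool):
        mapping = dataset_compression          # user-provided spec
    elif dataset_compression:
        mapping = DEFAULT_SPEC                 # True -> defaults
    # False -> stays None
    return validate_spec(mapping, memory_limit) if mapping is not None else None


assert get_compression_mapping(4096, False) is None
assert get_compression_mapping(4096, True)["memory_allocation"] == 0.1 * 4096
assert get_compression_mapping(2048, {"memory_allocation": 0.5, "methods": ["precision"]})["memory_allocation"] == 1024
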
60 changes: 41 additions & 19 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -7,7 +7,7 @@
import pandas as pd
from pandas.api.types import is_numeric_dtype

import scipy.sparse
from scipy.sparse import issparse, spmatrix

import sklearn.utils
from sklearn import preprocessing
@@ -18,7 +18,11 @@
from sklearn.pipeline import make_pipeline

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
from autoPyTorch.data.utils import DatasetDTypeContainerType, reduce_dataset_size_if_too_large
from autoPyTorch.data.utils import (
DatasetCompressionInputType,
DatasetDTypeContainerType,
reduce_dataset_size_if_too_large
)
from autoPyTorch.utils.logging_ import PicklableClientLogger


@@ -101,7 +105,7 @@ def __init__(
dataset_compression: Optional[Mapping[str, Any]] = None,
) -> None:
self._dataset_compression = dataset_compression
self._precision: Optional[DatasetDTypeContainerType] = None
self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
super().__init__(logger)

@staticmethod
@@ -151,7 +155,7 @@ def _fit(
if isinstance(X, np.ndarray):
X = self.numpy_array_to_pandas(X)

if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
if hasattr(X, "iloc") and not issparse(X):
X = cast(pd.DataFrame, X)
# Treat a column with all instances a NaN as numerical
# This will prevent doing encoding to a categorical column made completely
@@ -217,7 +221,7 @@
def transform(
self,
X: SupportedFeatTypes,
) -> np.ndarray:
) -> Union[np.ndarray, spmatrix, pd.DataFrame]:
"""
Validates and fit a categorical encoder (if needed) to the features.
The supported data types are List, numpy arrays and pandas DataFrames.
@@ -241,7 +245,7 @@ def transform(
if isinstance(X, np.ndarray):
X = self.numpy_array_to_pandas(X)

if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
if hasattr(X, "iloc") and not issparse(X):
if np.any(pd.isnull(X)):
for column in X.columns:
if X[column].isna().all():
@@ -268,7 +272,7 @@

# Sparse related transformations
# Not all sparse format support index sorting
if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'):
if issparse(X) and hasattr(X, 'sort_indices'):
X.sort_indices()

try:
@@ -285,20 +289,38 @@
"numerical or categorical values.")
raise e

if (
(
isinstance(X, np.ndarray) or scipy.sparse.issparse(X) or hasattr(X, 'iloc')
)
and self._dataset_compression is not None
):
if self._precision is not None:
X = X.astype(self._precision)
else:
X = reduce_dataset_size_if_too_large(X, **self._dataset_compression)
self._precision = dict(X.dtypes) if hasattr(X, 'iloc') else X.dtype
X = self._compress_dataset(X)

return X

# TODO: modify once we have added subsampling as well.
def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressionInputType:
"""
Compress the dataset. This function ensures that
the testing data is converted to the same dtype as
the training data.


Args:
X (DatasetCompressionInputType):
Dataset

Returns:
DatasetCompressionInputType:
Compressed dataset.
"""
is_dataframe = hasattr(X, 'iloc')
is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
if not is_reducible_type or self._dataset_compression is None:
return X
elif self._reduced_dtype is not None:
X = X.astype(self._reduced_dtype)
return X
else:
X = reduce_dataset_size_if_too_large(X, **self._dataset_compression)
self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
return X

def _check_data(
self,
X: SupportedFeatTypes,
@@ -312,7 +334,7 @@ def _check_data(
checks) and an encoder fitted in the case the data needs encoding
"""

if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X):
if not isinstance(X, (np.ndarray, pd.DataFrame)) and not issparse(X):
raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
" scipy sparse and Python Lists, yet, the provided input is"
" of type {}".format(type(X))
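
The point of `_compress_dataset` is that the dtype chosen while reducing the training data is cached in `self._reduced_dtype` and reused on every later `transform` call, so test data always ends up with the same dtype as the training data. A minimal, self-contained sketch of that pattern follows; `reduce_if_too_large` is a simplified stand-in for `reduce_dataset_size_if_too_large`, not the real implementation.

from typing import Optional

import numpy as np


class DtypeCachingReducer:
    def __init__(self, memory_allocation_mb: float) -> None:
        self.memory_allocation_mb = memory_allocation_mb
        self.reduced_dtype: Optional[np.dtype] = None

    def reduce_if_too_large(self, X: np.ndarray) -> np.ndarray:
        # Stand-in reduction: downcast float64 only when the array exceeds the budget.
        if X.nbytes / 2**20 > self.memory_allocation_mb and X.dtype == np.float64:
            X = X.astype(np.float32)
        return X

    def compress(self, X: np.ndarray) -> np.ndarray:
        if self.reduced_dtype is not None:
            return X.astype(self.reduced_dtype)   # test path: reuse the training dtype
        X = self.reduce_if_too_large(X)           # training path: reduce once
        self.reduced_dtype = X.dtype              # cache whatever dtype came out
        return X


reducer = DtypeCachingReducer(memory_allocation_mb=1.0)
X_train = np.random.rand(500_000, 4)              # ~15 MB of float64, gets reduced
X_test = np.random.rand(100, 4)                   # small, but must match the train dtype
assert reducer.compress(X_train).dtype == np.float32
assert reducer.compress(X_test).dtype == np.float32
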
22 changes: 12 additions & 10 deletions autoPyTorch/data/utils.py
@@ -194,21 +194,21 @@ def reduce_precision(
For dataframes, each column's precision is reduced using pd.to_numeric.

Args:
X: DatasetCompressionInputType
X (DatasetCompressionInputType):
The data to reduce precision of.

Returns:
Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]
Returns the reduced data X, the dtypes it was reduced to, and the dtypes it originally had.
"""
precision: Optional[DatasetDTypeContainerType] = None
reduced_dtypes: Optional[DatasetDTypeContainerType] = None
if isinstance(X, np.ndarray) or issparse(X):
dtypes = X.dtype
if X.dtype not in supported_precision_reductions:
raise ValueError(f"X.dtype = {X.dtype} not equal to any supported"
f" {supported_precision_reductions}")
precision = reduction_mapping[X.dtype]
X = X.astype(precision)
reduced_dtypes = reduction_mapping[X.dtype]
X = X.astype(reduced_dtypes)
elif hasattr(X, 'iloc'):
dtypes = dict(X.dtypes)

@@ -226,23 +226,26 @@
X[integer_columns] = X[integer_columns].apply(lambda column: pd.to_numeric(column, downcast='integer'))
if len(float_columns) > 0:
X[float_columns] = X[float_columns].apply(lambda column: pd.to_numeric(column, downcast='float'))
precision = dict(X.dtypes)
reduced_dtypes = dict(X.dtypes)
else:
raise ValueError(f"Unrecognised data type of X, expected data type to "
f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(X)}")

return X, precision, dtypes
return X, reduced_dtypes, dtypes


def megabytes(arr: DatasetCompressionInputType) -> float:

if isinstance(arr, np.ndarray):
memory_in_bytes = arr.nbytes
elif issparse(arr):
memory_in_bytes = arr.data.nbytes
elif hasattr(arr, 'iloc'):
memory_in_bytes = arr.memory_usage(index=True, deep=True).sum()
else:
return 0
raise ValueError(f"Unrecognised data type of X, expected data type to "
f"be in (np.ndarray, spmatrix, pd.DataFrame) but got :{type(arr)}")

return float(memory_in_bytes / (2**20))


@@ -287,17 +290,16 @@ def reduce_dataset_size_if_too_large(
The reduced X if reductions were needed
"""

precision: Optional[DatasetDTypeContainerType] = None
for method in methods:

if method == 'precision':
# If the dataset is too big for the allocated memory,
# we then try to reduce the precision if it's a high precision dataset
if megabytes(X) > memory_allocation:
X, precision, dtypes = reduce_precision(X)
X, reduced_dtypes, dtypes = reduce_precision(X)
warnings.warn(
f'Dataset too large for allocated memory {memory_allocation}MB, '
f'reduced the precision from {dtypes} to {precision}',
f'reduced the precision from {dtypes} to {reduced_dtypes}',
)
else:
raise ValueError(f"Unknown operation `{method}`")
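
To illustrate what the `precision` method does end to end: measure the size in MB and, only if it exceeds the allocation, downcast the high-precision columns. The sketch below mirrors the helpers in the diff but `megabytes_of` and `reduce_precision_of` are simplified reimplementations for illustration, not the library's own functions, and they skip the sparse-matrix branch.

import numpy as np
import pandas as pd


def megabytes_of(x) -> float:
    # Size of an ndarray or DataFrame in MB (sparse matrices omitted for brevity).
    if isinstance(x, np.ndarray):
        nbytes = x.nbytes
    elif hasattr(x, "iloc"):
        nbytes = x.memory_usage(index=True, deep=True).sum()
    else:
        raise ValueError(f"Unsupported type: {type(x)}")
    return float(nbytes / 2**20)


def reduce_precision_of(df: pd.DataFrame) -> pd.DataFrame:
    # Downcast integer and float columns with pd.to_numeric, as in the diff.
    df = df.copy()
    ints = df.select_dtypes(include="integer").columns
    floats = df.select_dtypes(include="float").columns
    df[ints] = df[ints].apply(lambda c: pd.to_numeric(c, downcast="integer"))
    df[floats] = df[floats].apply(lambda c: pd.to_numeric(c, downcast="float"))
    return df


df = pd.DataFrame({
    "a": np.random.rand(200_000),             # float64 -> float32
    "b": np.random.randint(0, 100, 200_000),  # int64 -> int8
})
memory_allocation = 1.0  # MB, illustrative budget
if megabytes_of(df) > memory_allocation:
    df = reduce_precision_of(df)
print(dict(df.dtypes))  # e.g. {'a': float32, 'b': int8}
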
4 changes: 2 additions & 2 deletions test/test_data/test_feature_validator.py
@@ -591,7 +591,7 @@ def test_featurevalidator_reduce_precision(input_data_featuretest):
validator.fit(X_train=X_train)
transformed_X_train = validator.transform(X_train.copy())

assert validator._precision is not None
assert validator._reduced_dtype is not None
assert megabytes(transformed_X_train) < megabytes(X_train)

transformed_X_test = validator.transform(X_test.copy())
@@ -601,4 +601,4 @@
assert all(transformed_X_train.dtypes == validator._precision)
else:
assert transformed_X_train.dtype == transformed_X_test.dtype
assert transformed_X_test.dtype == validator._precision
assert transformed_X_test.dtype == validator._reduced_dtype