From 751d08688f99f1c3c54700baf6c6b5d1d4caa821 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 22 Feb 2022 11:20:41 +0100 Subject: [PATCH 01/12] Initial implementation without tests --- autoPyTorch/api/tabular_classification.py | 28 +- autoPyTorch/data/tabular_feature_validator.py | 25 +- autoPyTorch/data/tabular_validator.py | 8 +- autoPyTorch/data/utils.py | 302 ++++++++++++++++++ 4 files changed, 358 insertions(+), 5 deletions(-) create mode 100644 autoPyTorch/data/utils.py diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 03519bef8..ef8a80b79 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np @@ -11,6 +11,11 @@ TASK_TYPES_TO_STRING, ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.utils import ( + DatasetCompressionSpec, + default_dataset_compression_arg, + validate_dataset_compression_arg +) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( HoldoutValTypes, @@ -163,6 +168,7 @@ def _get_dataset_input_validator( resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, ) -> Tuple[TabularDataset, TabularInputValidator]: """ Returns an object of `TabularDataset` and an object of @@ -202,6 +208,7 @@ def _get_dataset_input_validator( InputValidator = TabularInputValidator( is_classification=True, logger_port=self._logger_port, + dataset_compression=dataset_compression ) # Fit a input validator to check the provided data @@ -242,6 +249,7 @@ def search( disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. 
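For reference, a minimal sketch of how the new ``dataset_compression`` argument is intended to be passed through the public API. This is illustrative only: the class name, the ``X_train``/``y_train`` data and the remaining ``search()`` arguments are assumed here, and the only accepted keys are those of ``default_dataset_compression_arg`` introduced later in this patch.

.. code-block:: python

    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    # X_train / y_train: an existing tabular dataset (e.g. a pandas DataFrame
    # of features and an array of labels) -- assumed to be defined elsewhere.
    api = TabularClassificationTask()
    api.search(
        X_train=X_train,
        y_train=y_train,
        optimize_metric='accuracy',
        memory_limit=4096,  # MB
        # try to fit the transformed features into 0.1 * memory_limit by
        # reducing floating point precision
        dataset_compression={'memory_allocation': 0.1, 'methods': ['precision']},
    )

Passing ``dataset_compression=True`` (the boolean form handled in the ``search()`` body in the following hunk) selects the same defaults, ``{"memory_allocation": 0.1, "methods": ["precision"]}``.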
@@ -374,6 +382,20 @@ def search( """ + self._dataset_compression: Optional[DatasetCompressionSpec] + + if isinstance(dataset_compression, bool): + if dataset_compression is True: + self._dataset_compression = default_dataset_compression_arg + else: + self._dataset_compression = None + else: + self._dataset_compression = dataset_compression + + if self._dataset_compression is not None: + self._dataset_compression = validate_dataset_compression_arg( + self._dataset_compression, memory_limit=memory_limit) + self.dataset, self.InputValidator = self._get_dataset_input_validator( X_train=X_train, y_train=y_train, @@ -381,7 +403,9 @@ def search( y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - dataset_name=dataset_name) + dataset_name=dataset_name, + dataset_compression=self._dataset_compression) + return self._search( dataset=self.dataset, diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 4bab001c6..8927ff013 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,5 +1,6 @@ import functools -from typing import Dict, List, Optional, Tuple, cast +from logging import Logger +from typing import Any, Dict, List, Mapping, Optional, Tuple, Type, Union, cast import numpy as np @@ -17,6 +18,8 @@ from sklearn.pipeline import make_pipeline from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes +from autoPyTorch.data.utils import DatasetDTypeContainerType, reduce_dataset_size_if_too_large +from autoPyTorch.utils.logging_ import PicklableClientLogger def _create_column_transformer( @@ -92,6 +95,15 @@ class TabularFeatureValidator(BaseFeatureValidator): categorical_columns (List[int]): List of indices of categorical columns """ + def __init__( + self, + logger: Optional[Union[PicklableClientLogger, Logger]] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, + ) -> None: + self._dataset_compression = dataset_compression + self._precision: Optional[DatasetDTypeContainerType] = None + super().__init__(logger) + @staticmethod def _comparator(cmp1: str, cmp2: str) -> int: """Order so that categorical columns come left and numerical columns come right @@ -259,6 +271,17 @@ def transform( if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'): X.sort_indices() + if ( + ( + isinstance(X, np.ndarray) or scipy.sparse.issparse(X) or hasattr(X, 'iloc') + ) + and self._dataset_compression is not None + ): + if self._precision is not None: + X.astype(self._precision) + else: + X, self._precision = reduce_dataset_size_if_too_large(X, **self._dataset_compression) + try: X = sklearn.utils.check_array( X, diff --git a/autoPyTorch/data/tabular_validator.py b/autoPyTorch/data/tabular_validator.py index 677b55d4b..4db415f93 100644 --- a/autoPyTorch/data/tabular_validator.py +++ b/autoPyTorch/data/tabular_validator.py @@ -1,6 +1,6 @@ # -*- encoding: utf-8 -*- import logging -from typing import Optional, Union +from typing import Any, Mapping, Optional, Union from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator @@ -32,9 +32,11 @@ def __init__( self, is_classification: bool = False, logger_port: Optional[int] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, ) -> None: self.is_classification = is_classification self.logger_port = logger_port + self.dataset_compression = 
dataset_compression if self.logger_port is not None: self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( name='Validation', @@ -43,7 +45,9 @@ def __init__( else: self.logger = logging.getLogger('Validation') - self.feature_validator = TabularFeatureValidator(logger=self.logger) + self.feature_validator = TabularFeatureValidator( + dataset_compression=self.dataset_compression, + logger=self.logger) self.target_validator = TabularTargetValidator( is_classification=self.is_classification, logger=self.logger diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py new file mode 100644 index 000000000..fde1df95e --- /dev/null +++ b/autoPyTorch/data/utils.py @@ -0,0 +1,302 @@ +# Implementation used from https://github.com/automl/auto-sklearn/blob/development/autosklearn/util/data.py +from math import floor +import warnings +from typing import ( + Any, + Dict, + Iterator, + List, + Mapping, + Optional, + Sequence, + Tuple, + Type, + Union, + cast +) + +import numpy as np + +import pandas as pd + +from scipy.sparse import spmatrix, issparse + + +# TODO: TypedDict with python 3.8 +# +# When upgrading to python 3.8 as minimum version, this should be a TypedDict +# so that mypy can identify the fields types +DatasetCompressionSpec = Dict[str, Union[int, float, List[str]]] +DatasetDTypeContainerType = Union[Type, Dict[str, Type]] +DatasetCompressionInputType = Union[np.ndarray, spmatrix, pd.DataFrame] + +# Default specification for arg `dataset_compression` +default_dataset_compression_arg: DatasetCompressionSpec = { + "memory_allocation": 0.1, + "methods": ["precision"] +} + + +def validate_dataset_compression_arg( + dataset_compression: Mapping[str, Any], + memory_limit: int +) -> DatasetCompressionSpec: + """Validates and return a correct dataset_compression argument + + The returned value can be safely used with `reduce_dataset_size_if_too_large`. + + Parameters + ---------- + dataset_compression: Mapping[str, Any] + The argumnents to validate + + Returns + ------- + DatasetCompressionSpec + The validated and correct dataset compression spec + """ + if isinstance(dataset_compression, Mapping): + # Fill with defaults if they don't exist + dataset_compression = { + **default_dataset_compression_arg, + **dataset_compression + } + + # Must contain known keys + if set(dataset_compression.keys()) != set(default_dataset_compression_arg.keys()): + raise ValueError( + f"Unknown key in dataset_compression, {list(dataset_compression.keys())}." 
+ f"\nPossible keys are {list(default_dataset_compression_arg.keys())}" + ) + + memory_allocation = dataset_compression["memory_allocation"] + + # "memory_allocation" must be float or int + if not (isinstance(memory_allocation, float) or isinstance(memory_allocation, int)): + raise ValueError( + "key 'memory_allocation' must be an `int` or `float`" + f"\ntype = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "memory_allocation" if absolute, should be > 0 and < memory_limit + if isinstance(memory_allocation, int) and not (0 < memory_allocation < memory_limit): + raise ValueError( + f"key 'memory_allocation' if int must be in (0, memory_limit={memory_limit})" + f"\nmemory_allocation = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "memory_allocation" must be in (0,1) if float + if isinstance(memory_allocation, float): + if not (0.0 < memory_allocation < 1.0): + raise ValueError( + "key 'memory_allocation' if float must be in (0, 1)" + f"\nmemory_allocation = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + # convert to int so we can directly use + dataset_compression["memory_allocation"] = floor(memory_allocation * memory_limit) + + # "methods" must be non-empty sequence + if ( + not isinstance(dataset_compression["methods"], Sequence) + or len(dataset_compression["methods"]) <= 0 + ): + raise ValueError( + "key 'methods' must be a non-empty list" + f"\nmethods = {dataset_compression['methods']}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "methods" must contain known methods + if any( + method not in cast(Sequence, default_dataset_compression_arg["methods"]) # mypy + for method in dataset_compression["methods"] + ): + raise ValueError( + f"key 'methods' can only contain {default_dataset_compression_arg['methods']}" + f"\nmethods = {dataset_compression['methods']}" + f"\ndataset_compression = {dataset_compression}" + ) + + return cast(DatasetCompressionSpec, dataset_compression) + else: + raise ValueError( + f"Unknown type for `dataset_compression` {type(dataset_compression)}" + f"\ndataset_compression = {dataset_compression}" + ) + + +class _DtypeReductionMapping(Mapping): + """ + Unfortuantly, mappings compare by hash(item) and not the __eq__ operator + between the key and the item. + + Hence we wrap the dict in a Mapping class and implement our own __getitem__ + such that we do use __eq__ between keys and query items. + + >>> np.float32 == dtype('float32') # True, they are considered equal + >>> + >>> mydict = { np.float32: 'hello' } + >>> + >>> # Equal by __eq__ but dict operations fail + >>> np.dtype('float32') in mydict # False + >>> mydict[dtype('float32')] # KeyError + + This mapping class fixes that supporting the `in` operator as well as `__getitem__` + + >>> reduction_mapping = _DtypeReductionMapping() + >>> + >>> reduction_mapping[np.dtype('float64')] # np.float32 + >>> np.dtype('float32') in reduction_mapping # True + """ + + # Information about dtype support + _mapping: Dict[type, type] = { + np.float32: np.float32, + np.float64: np.float32, + np.int32: np.int32, + np.int64: np.int32 + } + + # In spite of the names, np.float96 and np.float128 + # provide only as much precision as np.longdouble, + # that is, 80 bits on most x86 machines and 64 bits + # in standard Windows builds. 
+ if hasattr(np, 'float96'): + _mapping[np.float96] = np.float64 + + if hasattr(np, 'float128'): + _mapping[np.float128] = np.float64 + + @classmethod + def __getitem__(cls, item: type) -> type: + for k, v in cls._mapping.items(): + if k == item: + return v + raise KeyError(item) + + @classmethod + def __iter__(cls) -> Iterator[type]: + return iter(cls._mapping.keys()) + + @classmethod + def __len__(cls) -> int: + return len(cls._mapping) + + +reduction_mapping = _DtypeReductionMapping() +supported_precision_reductions = list(reduction_mapping) + + +def reduce_precision( + X: DatasetCompressionInputType +) -> Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]: + """ Reduces the precision of a dataset containing floats or ints + + Parameters + ---------- + X: DatasetCompressionInputType + The data to reduce precision of. + + Returns + ------- + Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType] + Returns the reduced data X along with the dtypes it and the dtypes it was reduced to. + """ + precision: Optional[DatasetDTypeContainerType] = None + if isinstance(X, np.ndarray) or issparse(X): + dtypes = X.dtype + if X.dtype not in supported_precision_reductions: + raise ValueError(f"X.dtype = {X.dtype} not equal to any supported" + f" {supported_precision_reductions}") + precision = reduction_mapping[X.dtype] + X = X.astype(precision) + elif hasattr(X, 'iloc'): + dtypes = {col: X[col].dtype for col in X.columns} + precision = {col: reduction_mapping[dtype] for col, dtype in dtypes.items() + if dtype in supported_precision_reductions} + X = X.astype(precision) + else: + raise ValueError(f"Unrecognised data type of X, expected data type to " + f"be in (ndarray, spmatrix, pd.DataFrame), but got :{type(X)}") + + return X, precision, dtypes + + +def reduce_dataset_size_if_too_large( + X: DatasetCompressionInputType, + memory_allocation: int, + methods: List[str] = ['precision'], +) -> Tuple[DatasetCompressionInputType, Optional[DatasetDTypeContainerType]]: + f""" Reduces the size of the dataset if it's too close to the memory limit. + + Follows the order of the operations passed in and retains the type of its + input. + + Precision reduction will only work on the following data types: + - {supported_precision_reductions} + + Precision reduction will only perform one level of precision reduction. + Technically, you could supply multiple rounds of precision reduction, i.e. + to reduce np.float128 to np.float32 you could use `methods = ['precision'] * 2`. + + However, if that's the use case, it'd be advised to simply use the function + `autoPyTorch.data.utils.reduce_precision`. + + Parameters + ---------- + X: DatasetCompressionInputType + The features of the dataset. + + methods: List[str] = ['precision'] + A list of operations that are permitted to be performed to reduce + the size of the dataset. + + **precision** + + Reduce the precision of float types + + memory_allocation: int + The amount of memory to allocate to the dataset. It should specify an + absolute amount. + + Returns + ------- + DatasetCompressionInputType + The reduced X if reductions were needed + Optional[DatasetDTypeContainerType] + If the precision of the dataset is reduced, + we return the precision dtype container that can be + used for any other dataset in the current experiment. 
+ """ + + def megabytes(arr: DatasetCompressionInputType) -> float: + memory_in_bytes: Optional[int] = None + if isinstance(arr, np.ndarray): + memory_in_bytes = arr.nbytes + elif issparse(arr): + memory_in_bytes = arr.data.nbytes + elif hasattr(arr, 'iloc'): + memory_in_bytes = arr.memory_usage(index=True, deep=True).sum() + else: + return 0 + return memory_in_bytes / (2**20) + + precision: Optional[DatasetDTypeContainerType] = None + for method in methods: + + if method == 'precision': + # If the dataset is too big for the allocated memory, + # we then try to reduce the precision if it's a high precision dataset + if megabytes(X) > memory_allocation: + X, precision, dtypes = reduce_precision(X) + warnings.warn( + f'Dataset too large for allocated memory {memory_allocation}MB, ' + f'reduced the precision from {dtypes} to {precision}', + ) + else: + raise ValueError(f"Unknown operation `{method}`") + + return X, precision From ac98a8d1c4c3061316dc496f103dfe298b29946e Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 22 Feb 2022 15:59:41 +0100 Subject: [PATCH 02/12] add tests and make necessary changes --- autoPyTorch/api/tabular_classification.py | 41 ++++++++-- autoPyTorch/api/tabular_regression.py | 30 ++++++- autoPyTorch/data/tabular_feature_validator.py | 26 +++--- autoPyTorch/data/utils.py | 63 +++++++++------ test/test_data/test_feature_validator.py | 45 +++++++++++ test/test_data/test_utils.py | 81 +++++++++++++++++++ 6 files changed, 238 insertions(+), 48 deletions(-) create mode 100644 test/test_data/test_utils.py diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index ef8a80b79..8bfa446c4 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -12,7 +12,6 @@ ) from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import ( - DatasetCompressionSpec, default_dataset_compression_arg, validate_dataset_compression_arg ) @@ -241,7 +240,7 @@ def search( total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, enable_traditional_pipeline: bool = True, - memory_limit: Optional[int] = 4096, + memory_limit: int = 4096, smac_scenario_args: Optional[Dict[str, Any]] = None, get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, @@ -249,7 +248,7 @@ def search( disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, - dataset_compression: Optional[Mapping[str, Any]] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -318,7 +317,7 @@ def search( feature by turning this flag to False. All machine learning algorithms that are fitted during search() are considered for ensemble building. - memory_limit (Optional[int]: default=4096): + memory_limit (int: default=4096): Memory limit in MB for the machine learning algorithm. Autopytorch will stop fitting the machine learning algorithm if it tries to allocate more than memory_limit MB. If None @@ -376,13 +375,42 @@ def search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_. + dataset_compression: Union[bool, Mapping[str, Any]] = True + We compress datasets so that they fit into some predefined amount of memory. + **NOTE** + + Default configuration when left as ``True``: + .. 
code-block:: python + { + "memory_allocation": 0.1, + "methods": ["precision"] + } + You can also pass your own configuration with the same keys and choosing + from the available ``"methods"``. + The available options are described here: + **memory_allocation** + By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This + float value can be set with ``"memory_allocation": 0.1``. We also allow for + specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``. + The memory used by the dataset is checked after each reduction method is + performed. If the dataset fits into the allocated memory, any further methods + listed in ``"methods"`` will not be performed. + + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. + * ``"precision"`` - We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. Returns: self """ - - self._dataset_compression: Optional[DatasetCompressionSpec] + self._dataset_compression: Optional[Mapping[str, Any]] if isinstance(dataset_compression, bool): if dataset_compression is True: @@ -406,7 +434,6 @@ def search( dataset_name=dataset_name, dataset_compression=self._dataset_compression) - return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 8c0637e39..b4c0c3e1c 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np @@ -11,6 +11,10 @@ TASK_TYPES_TO_STRING ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.utils import ( + default_dataset_compression_arg, + validate_dataset_compression_arg +) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( HoldoutValTypes, @@ -164,6 +168,7 @@ def _get_dataset_input_validator( resampling_strategy: Optional[ResamplingStrategies] = None, resampling_strategy_args: Optional[Dict[str, Any]] = None, dataset_name: Optional[str] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, ) -> Tuple[TabularDataset, TabularInputValidator]: """ Returns an object of `TabularDataset` and an object of @@ -203,6 +208,7 @@ def _get_dataset_input_validator( InputValidator = TabularInputValidator( is_classification=False, logger_port=self._logger_port, + dataset_compression=dataset_compression ) # Fit a input validator to check the provided data @@ -235,7 +241,7 @@ def search( total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, enable_traditional_pipeline: bool = True, - memory_limit: Optional[int] = 4096, + memory_limit: int = 4096, smac_scenario_args: Optional[Dict[str, Any]] = None, get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, @@ -243,6 +249,7 @@ def search( disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, ) -> 
'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -311,7 +318,7 @@ def search( feature by turning this flag to False. All machine learning algorithms that are fitted during search() are considered for ensemble building. - memory_limit (Optional[int]: default=4096): + memory_limit (int: default=4096): Memory limit in MB for the machine learning algorithm. Autopytorch will stop fitting the machine learning algorithm if it tries to allocate more than memory_limit MB. If None @@ -374,6 +381,20 @@ def search( self """ + self._dataset_compression: Optional[Mapping[str, Any]] + + if isinstance(dataset_compression, bool): + if dataset_compression is True: + self._dataset_compression = default_dataset_compression_arg + else: + self._dataset_compression = None + else: + self._dataset_compression = dataset_compression + + if self._dataset_compression is not None: + self._dataset_compression = validate_dataset_compression_arg( + self._dataset_compression, memory_limit=memory_limit) + self.dataset, self.InputValidator = self._get_dataset_input_validator( X_train=X_train, y_train=y_train, @@ -381,7 +402,8 @@ def search( y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - dataset_name=dataset_name) + dataset_name=dataset_name, + dataset_compression=self._dataset_compression) return self._search( dataset=self.dataset, diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 8927ff013..b9f211283 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,6 +1,6 @@ import functools from logging import Logger -from typing import Any, Dict, List, Mapping, Optional, Tuple, Type, Union, cast +from typing import Any, Dict, List, Mapping, Optional, Tuple, Union, cast import numpy as np @@ -271,17 +271,6 @@ def transform( if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'): X.sort_indices() - if ( - ( - isinstance(X, np.ndarray) or scipy.sparse.issparse(X) or hasattr(X, 'iloc') - ) - and self._dataset_compression is not None - ): - if self._precision is not None: - X.astype(self._precision) - else: - X, self._precision = reduce_dataset_size_if_too_large(X, **self._dataset_compression) - try: X = sklearn.utils.check_array( X, @@ -295,6 +284,19 @@ def transform( "Please try to manually cast it to a supported " "numerical or categorical values.") raise e + + if ( + ( + isinstance(X, np.ndarray) or scipy.sparse.issparse(X) or hasattr(X, 'iloc') + ) + and self._dataset_compression is not None + ): + if self._precision is not None: + X = X.astype(self._precision) + else: + X = reduce_dataset_size_if_too_large(X, **self._dataset_compression) + self._precision = dict(X.dtypes) if hasattr(X, 'iloc') else X.dtype + return X def _check_data( diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index fde1df95e..16b0439f6 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -1,6 +1,6 @@ # Implementation used from https://github.com/automl/auto-sklearn/blob/development/autosklearn/util/data.py -from math import floor import warnings +from math import floor from typing import ( Any, Dict, @@ -18,8 +18,9 @@ import numpy as np import pandas as pd +from pandas.api.types import is_float_dtype, is_numeric_dtype -from scipy.sparse import spmatrix, issparse +from scipy.sparse import issparse, spmatrix # TODO: TypedDict with python 3.8 @@ -195,6 +196,9 @@ def 
reduce_precision( ) -> Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]: """ Reduces the precision of a dataset containing floats or ints + Note: + For dataframe, the column's precision is reduced using pd.to_numeric. + Parameters ---------- X: DatasetCompressionInputType @@ -214,22 +218,47 @@ def reduce_precision( precision = reduction_mapping[X.dtype] X = X.astype(precision) elif hasattr(X, 'iloc'): - dtypes = {col: X[col].dtype for col in X.columns} - precision = {col: reduction_mapping[dtype] for col, dtype in dtypes.items() - if dtype in supported_precision_reductions} - X = X.astype(precision) + dtypes = dict(X.dtypes) + + integer_columns = [] + float_columns = [] + + for col, dtype in dtypes.items(): + if is_numeric_dtype(dtype): + if is_float_dtype(dtype): + float_columns.append(col) + else: + integer_columns.append(col) + + if len(integer_columns) > 0: + X[integer_columns] = X[integer_columns].apply(lambda column: pd.to_numeric(column, downcast='integer')) + if len(float_columns) > 0: + X[float_columns] = X[float_columns].apply(lambda column: pd.to_numeric(column, downcast='float')) + precision = dict(X.dtypes) else: raise ValueError(f"Unrecognised data type of X, expected data type to " - f"be in (ndarray, spmatrix, pd.DataFrame), but got :{type(X)}") + f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(X)}") return X, precision, dtypes +def megabytes(arr: DatasetCompressionInputType) -> float: + if isinstance(arr, np.ndarray): + memory_in_bytes = arr.nbytes + elif issparse(arr): + memory_in_bytes = arr.data.nbytes + elif hasattr(arr, 'iloc'): + memory_in_bytes = arr.memory_usage(index=True, deep=True).sum() + else: + return 0 + return float(memory_in_bytes / (2**20)) + + def reduce_dataset_size_if_too_large( X: DatasetCompressionInputType, memory_allocation: int, methods: List[str] = ['precision'], -) -> Tuple[DatasetCompressionInputType, Optional[DatasetDTypeContainerType]]: +) -> DatasetCompressionInputType: f""" Reduces the size of the dataset if it's too close to the memory limit. Follows the order of the operations passed in and retains the type of its @@ -266,24 +295,8 @@ def reduce_dataset_size_if_too_large( ------- DatasetCompressionInputType The reduced X if reductions were needed - Optional[DatasetDTypeContainerType] - If the precision of the dataset is reduced, - we return the precision dtype container that can be - used for any other dataset in the current experiment. """ - def megabytes(arr: DatasetCompressionInputType) -> float: - memory_in_bytes: Optional[int] = None - if isinstance(arr, np.ndarray): - memory_in_bytes = arr.nbytes - elif issparse(arr): - memory_in_bytes = arr.data.nbytes - elif hasattr(arr, 'iloc'): - memory_in_bytes = arr.memory_usage(index=True, deep=True).sum() - else: - return 0 - return memory_in_bytes / (2**20) - precision: Optional[DatasetDTypeContainerType] = None for method in methods: @@ -299,4 +312,4 @@ def megabytes(arr: DatasetCompressionInputType) -> float: else: raise ValueError(f"Unknown operation `{method}`") - return X, precision + return X diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 7f2ff2507..5eb28309d 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -13,6 +13,7 @@ import sklearn.model_selection from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator +from autoPyTorch.data.utils import megabytes # Fixtures to be used in this class. 
By default all elements have 100 datapoints @@ -557,3 +558,47 @@ def test_comparator(): key=functools.cmp_to_key(validator._comparator) ) assert ans == feat_type + + +# Actual checks for the features +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'numpy_numericalonly_nonan', + 'numpy_numericalonly_nan', + 'numpy_mixed_nan', + 'pandas_numericalonly_nan', + 'sparse_bsr_nonan', + 'sparse_bsr_nan', + 'sparse_coo_nonan', + 'sparse_coo_nan', + 'sparse_csc_nonan', + 'sparse_csc_nan', + 'sparse_csr_nonan', + 'sparse_csr_nan', + 'sparse_dia_nonan', + 'sparse_dia_nan', + 'sparse_dok_nonan', + 'sparse_dok_nan', + 'openml_40981', # Australian + ), + indirect=True +) +def test_featurevalidator_reduce_precision(input_data_featuretest): + X_train, X_test = sklearn.model_selection.train_test_split( + input_data_featuretest, test_size=0.1, random_state=1) + validator = TabularFeatureValidator(dataset_compression={'memory_allocation': 0, 'methods': ['precision']}) + validator.fit(X_train=X_train) + transformed_X_train = validator.transform(X_train.copy()) + + assert validator._precision is not None + assert megabytes(transformed_X_train) < megabytes(X_train) + + transformed_X_test = validator.transform(X_test.copy()) + assert megabytes(transformed_X_test) < megabytes(X_test) + if hasattr(transformed_X_train, 'iloc'): + assert all(transformed_X_train.dtypes == transformed_X_test.dtypes) + assert all(transformed_X_train.dtypes == validator._precision) + else: + assert transformed_X_train.dtype == transformed_X_test.dtype + assert transformed_X_test.dtype == validator._precision diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py new file mode 100644 index 000000000..2c7b31419 --- /dev/null +++ b/test/test_data/test_utils.py @@ -0,0 +1,81 @@ +import numpy as np + +from pandas.testing import assert_frame_equal + +import pytest + +from sklearn.datasets import fetch_openml +from sklearn.model_selection import train_test_split + +from autoPyTorch.data.utils import ( + megabytes, + reduce_dataset_size_if_too_large, + reduce_precision, + validate_dataset_compression_arg +) +from autoPyTorch.utils.common import subsampler + + +@pytest.mark.parametrize('openmlid', [2, 40984]) +@pytest.mark.parametrize('as_frame', [True, False]) +def test_data_validation_for_classification(openmlid, as_frame, n_samples): + X, _ = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) + X = subsampler(data=X, x=range(n_samples)) + X_train, X_test = train_test_split( + X, test_size=0.33, random_state=0) + X_converted, precision = reduce_dataset_size_if_too_large(X.copy(), memory_allocation=0) + np.allclose(X, X_converted) if not as_frame else assert_frame_equal(X, X_converted, check_dtype=False) + assert megabytes(X_converted) < megabytes(X) + if as_frame: + assert isinstance(precision, dict) + assert isinstance(list(precision.values())[0], type) + else: + assert isinstance(precision, type) + + +def test_validate_dataset_compression_arg(): + + data_compression_args = validate_dataset_compression_arg({}, 10) + # check whether the function uses default args + # to fill in case args is empty + assert data_compression_args is not None + + # assert memory allocation is an integer after validation + assert isinstance(data_compression_args['memory_allocation'], int) + + # check whether the function raises an error + # in case an unknown key is in args + with pytest.raises(ValueError, match=r'Unknown key in dataset_compression, .*'): + validate_dataset_compression_arg({'not_there': 1}, 1) + + # 
check whether the function raises an error + # in case memory_allocation is not int or float is in args + with pytest.raises(ValueError, match=r"key 'memory_allocation' must be an `int` or `float`.*"): + validate_dataset_compression_arg({'memory_allocation': 'not int'}, 1) + + # check whether the function raises an error + # in case memory_allocation is an int greater than memory limit + with pytest.raises(ValueError, match=r"key 'memory_allocation' if int must be in.*"): + validate_dataset_compression_arg({'memory_allocation': 1}, 0) + + # check whether the function raises an error + # in case memory_allocation is a float greater than 1 + with pytest.raises(ValueError, match=r"key 'memory_allocation' if float must be in.*"): + validate_dataset_compression_arg({'memory_allocation': 1.5}, 0) + + # check whether the function raises an error + # in case an unknown method is passed in args + with pytest.raises(ValueError, match=r"key 'methods' can only contain .*"): + validate_dataset_compression_arg({'methods': 'unknown'}, 1) + + # check whether the function raises an error + # in case an unknown key is in args + with pytest.raises(ValueError, match=r'Unknown type for `dataset_compression` .*'): + validate_dataset_compression_arg(1, 1) + + +def test_error_raised_reduce_precision(): + # check whether the function raises an error + # in case X is not an expected type + with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to .*'): + reduce_precision(X='not expected') From 1eda40d34fa7ec43bdb4230946dcb6704f068c6c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 22 Feb 2022 17:20:51 +0100 Subject: [PATCH 03/12] improve documentation --- autoPyTorch/api/tabular_classification.py | 18 +++---- autoPyTorch/api/tabular_regression.py | 30 ++++++++++++ autoPyTorch/data/utils.py | 60 ++++++++++------------- test/test_data/test_utils.py | 4 +- 4 files changed, 67 insertions(+), 45 deletions(-) diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 8bfa446c4..f37b8b228 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -396,15 +396,15 @@ def search( performed. If the dataset fits into the allocated memory, any further methods listed in ``"methods"`` will not be performed. - **methods** - We currently provide the following methods for reducing the dataset size. - These can be provided in a list and are performed in the order as given. - * ``"precision"`` - We reduce floating point precision as follows: - * ``np.float128 -> np.float64`` - * ``np.float96 -> np.float64`` - * ``np.float64 -> np.float32`` - * pandas dataframes are reduced using the downcast option of `pd.to_numeric` - to the lowest possible precision. + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. + * ``"precision"`` - We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. 
Returns: self diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index b4c0c3e1c..cdbf49339 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -376,6 +376,36 @@ def search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_. + dataset_compression: Union[bool, Mapping[str, Any]] = True + We compress datasets so that they fit into some predefined amount of memory. + **NOTE** + + Default configuration when left as ``True``: + .. code-block:: python + { + "memory_allocation": 0.1, + "methods": ["precision"] + } + You can also pass your own configuration with the same keys and choosing + from the available ``"methods"``. + The available options are described here: + **memory_allocation** + By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This + float value can be set with ``"memory_allocation": 0.1``. We also allow for + specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``. + The memory used by the dataset is checked after each reduction method is + performed. If the dataset fits into the allocated memory, any further methods + listed in ``"methods"`` will not be performed. + + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. + * ``"precision"`` - We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. Returns: self diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index 16b0439f6..f38a64c3f 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -46,15 +46,13 @@ def validate_dataset_compression_arg( The returned value can be safely used with `reduce_dataset_size_if_too_large`. - Parameters - ---------- - dataset_compression: Mapping[str, Any] - The argumnents to validate - - Returns - ------- - DatasetCompressionSpec - The validated and correct dataset compression spec + Args: + dataset_compression: Mapping[str, Any] + The argumnents to validate + + Returns: + DatasetCompressionSpec + The validated and correct dataset compression spec """ if isinstance(dataset_compression, Mapping): # Fill with defaults if they don't exist @@ -199,15 +197,13 @@ def reduce_precision( Note: For dataframe, the column's precision is reduced using pd.to_numeric. - Parameters - ---------- - X: DatasetCompressionInputType - The data to reduce precision of. + Args: + X: DatasetCompressionInputType + The data to reduce precision of. - Returns - ------- - Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType] - Returns the reduced data X along with the dtypes it and the dtypes it was reduced to. + Returns: + Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType] + Returns the reduced data X along with the dtypes it and the dtypes it was reduced to. """ precision: Optional[DatasetDTypeContainerType] = None if isinstance(X, np.ndarray) or issparse(X): @@ -274,27 +270,25 @@ def reduce_dataset_size_if_too_large( However, if that's the use case, it'd be advised to simply use the function `autoPyTorch.data.utils.reduce_precision`. - Parameters - ---------- - X: DatasetCompressionInputType - The features of the dataset. 
+ Args: + X: DatasetCompressionInputType + The features of the dataset. - methods: List[str] = ['precision'] - A list of operations that are permitted to be performed to reduce - the size of the dataset. + methods: List[str] = ['precision'] + A list of operations that are permitted to be performed to reduce + the size of the dataset. - **precision** + **precision** - Reduce the precision of float types + Reduce the precision of float types - memory_allocation: int - The amount of memory to allocate to the dataset. It should specify an - absolute amount. + memory_allocation: int + The amount of memory to allocate to the dataset. It should specify an + absolute amount. - Returns - ------- - DatasetCompressionInputType - The reduced X if reductions were needed + Returns: + DatasetCompressionInputType + The reduced X if reductions were needed """ precision: Optional[DatasetDTypeContainerType] = None diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index 2c7b31419..f599cb604 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -5,7 +5,6 @@ import pytest from sklearn.datasets import fetch_openml -from sklearn.model_selection import train_test_split from autoPyTorch.data.utils import ( megabytes, @@ -21,8 +20,7 @@ def test_data_validation_for_classification(openmlid, as_frame, n_samples): X, _ = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) X = subsampler(data=X, x=range(n_samples)) - X_train, X_test = train_test_split( - X, test_size=0.33, random_state=0) + X_converted, precision = reduce_dataset_size_if_too_large(X.copy(), memory_allocation=0) np.allclose(X, X_converted) if not as_frame else assert_frame_equal(X, X_converted, check_dtype=False) assert megabytes(X_converted) < megabytes(X) From ed53a1fb29f32e8abf958144292fc6d35a2e7411 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 22 Feb 2022 18:34:05 +0100 Subject: [PATCH 04/12] fix tests --- test/test_data/test_utils.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index f599cb604..7f47469ef 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -17,18 +17,13 @@ @pytest.mark.parametrize('openmlid', [2, 40984]) @pytest.mark.parametrize('as_frame', [True, False]) -def test_data_validation_for_classification(openmlid, as_frame, n_samples): +def test_reduce_dataset_if_too_large(openmlid, as_frame, n_samples): X, _ = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) X = subsampler(data=X, x=range(n_samples)) - X_converted, precision = reduce_dataset_size_if_too_large(X.copy(), memory_allocation=0) + X_converted = reduce_dataset_size_if_too_large(X.copy(), memory_allocation=0) np.allclose(X, X_converted) if not as_frame else assert_frame_equal(X, X_converted, check_dtype=False) assert megabytes(X_converted) < megabytes(X) - if as_frame: - assert isinstance(precision, dict) - assert isinstance(list(precision.values())[0], type) - else: - assert isinstance(precision, type) def test_validate_dataset_compression_arg(): From 6a74d9f8661ae1d72ad4b340455246635c62db29 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Thu, 24 Feb 2022 11:34:35 +0100 Subject: [PATCH 05/12] Apply suggestions from code review Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/data/utils.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git 
a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index f38a64c3f..3915c2009 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -42,7 +42,7 @@ def validate_dataset_compression_arg( dataset_compression: Mapping[str, Any], memory_limit: int ) -> DatasetCompressionSpec: - """Validates and return a correct dataset_compression argument + """Validate and return a correct dataset_compression argument The returned value can be safely used with `reduce_dataset_size_if_too_large`. @@ -163,18 +163,11 @@ class _DtypeReductionMapping(Mapping): # provide only as much precision as np.longdouble, # that is, 80 bits on most x86 machines and 64 bits # in standard Windows builds. - if hasattr(np, 'float96'): - _mapping[np.float96] = np.float64 - - if hasattr(np, 'float128'): - _mapping[np.float128] = np.float64 + _mapping.update({getattr(np, s): np.float64 for s in ['float96', 'float128'] if hasattr(np, s)}) @classmethod def __getitem__(cls, item: type) -> type: - for k, v in cls._mapping.items(): - if k == item: - return v - raise KeyError(item) + return cls._mapping[item] @classmethod def __iter__(cls) -> Iterator[type]: @@ -192,7 +185,7 @@ def __len__(cls) -> int: def reduce_precision( X: DatasetCompressionInputType ) -> Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]: - """ Reduces the precision of a dataset containing floats or ints + """ Reduce the precision of a dataset containing floats or ints Note: For dataframe, the column's precision is reduced using pd.to_numeric. From 8353e221c78c488eeb7199d7daf164dcb07a9e6a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 24 Feb 2022 12:28:54 +0100 Subject: [PATCH 06/12] undo change in as it causes tests to fail --- autoPyTorch/data/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index 3915c2009..ce69e9d70 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -167,7 +167,10 @@ class _DtypeReductionMapping(Mapping): @classmethod def __getitem__(cls, item: type) -> type: - return cls._mapping[item] + for k, v in cls._mapping.items(): + if k == item: + return v + raise KeyError(item) @classmethod def __iter__(cls) -> Iterator[type]: From e61b9cb90cee55adba1757fe8dbea0207d4af3fb Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 24 Feb 2022 16:14:59 +0100 Subject: [PATCH 07/12] change name from InputValidator to input_validator --- autoPyTorch/api/base_task.py | 2 +- autoPyTorch/api/tabular_classification.py | 22 +++++++++++----------- autoPyTorch/api/tabular_regression.py | 16 ++++++++-------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 905d795fd..a048e2054 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -243,7 +243,7 @@ def __init__( if self.n_jobs == 1: self._multiprocessing_context = 'fork' - self.InputValidator: Optional[BaseInputValidator] = None + self.input_validator: Optional[BaseInputValidator] = None self.search_space_updates = search_space_updates if search_space_updates is not None: diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index f37b8b228..61dc68151 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -204,7 +204,7 @@ def _get_dataset_input_validator( # Create a validator object to make sure that the data provided by # the user matches the 
autopytorch requirements - InputValidator = TabularInputValidator( + input_validator = TabularInputValidator( is_classification=True, logger_port=self._logger_port, dataset_compression=dataset_compression @@ -213,18 +213,18 @@ def _get_dataset_input_validator( # Fit a input validator to check the provided data # Also, an encoder is fit to both train and test data, # to prevent unseen categories during inference - InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) dataset = TabularDataset( X=X_train, Y=y_train, X_test=X_test, Y_test=y_test, - validator=InputValidator, + validator=input_validator, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, dataset_name=dataset_name ) - return dataset, InputValidator + return dataset, input_validator def search( self, @@ -424,7 +424,7 @@ def search( self._dataset_compression = validate_dataset_compression_arg( self._dataset_compression, memory_limit=memory_limit) - self.dataset, self.InputValidator = self._get_dataset_input_validator( + self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, y_train=y_train, X_test=X_test, @@ -469,28 +469,28 @@ def predict( Returns: Array with estimator predictions. """ - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. Kindly call first " "the estimator search() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + X_test = self.input_validator.feature_validator.transform(X_test) predicted_probabilities = super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) - if self.InputValidator.target_validator.is_single_column_target(): + if self.input_validator.target_validator.is_single_column_target(): predicted_indexes = np.argmax(predicted_probabilities, axis=1) else: predicted_indexes = (predicted_probabilities > 0.5).astype(int) # Allow to predict in the original domain -- that is, the user is not interested # in our encoded values - return self.InputValidator.target_validator.inverse_transform(predicted_indexes) + return self.input_validator.target_validator.inverse_transform(predicted_indexes) def predict_proba(self, X_test: Union[np.ndarray, pd.DataFrame, List], batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. 
Kindly call first " "the estimator search() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + X_test = self.input_validator.feature_validator.transform(X_test) return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index cdbf49339..ec0ebdcad 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -205,7 +205,7 @@ def _get_dataset_input_validator( # Create a validator object to make sure that the data provided by # the user matches the autopytorch requirements - InputValidator = TabularInputValidator( + input_validator = TabularInputValidator( is_classification=False, logger_port=self._logger_port, dataset_compression=dataset_compression @@ -214,18 +214,18 @@ def _get_dataset_input_validator( # Fit a input validator to check the provided data # Also, an encoder is fit to both train and test data, # to prevent unseen categories during inference - InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) dataset = TabularDataset( X=X_train, Y=y_train, X_test=X_test, Y_test=y_test, - validator=InputValidator, + validator=input_validator, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, dataset_name=dataset_name ) - return dataset, InputValidator + return dataset, input_validator def search( self, @@ -425,7 +425,7 @@ def search( self._dataset_compression = validate_dataset_compression_arg( self._dataset_compression, memory_limit=memory_limit) - self.dataset, self.InputValidator = self._get_dataset_input_validator( + self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, y_train=y_train, X_test=X_test, @@ -460,14 +460,14 @@ def predict( batch_size: Optional[int] = None, n_jobs: int = 1 ) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. 
Kindly call first " "the estimator search() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + X_test = self.input_validator.feature_validator.transform(X_test) predicted_values = super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) # Allow to predict in the original domain -- that is, the user is not interested # in our encoded values - return self.InputValidator.target_validator.inverse_transform(predicted_values) + return self.input_validator.target_validator.inverse_transform(predicted_values) From 95f1c85856568fed7d1dec7d75cd724e4df9e3c0 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 24 Feb 2022 17:06:23 +0100 Subject: [PATCH 08/12] extract statements to methods --- autoPyTorch/api/tabular_classification.py | 52 ++++++++++++---- autoPyTorch/api/tabular_regression.py | 51 ++++++++++++---- autoPyTorch/data/tabular_feature_validator.py | 60 +++++++++++++------ autoPyTorch/data/utils.py | 22 +++---- test/test_data/test_feature_validator.py | 4 +- 5 files changed, 133 insertions(+), 56 deletions(-) diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 61dc68151..a69471da4 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -12,6 +12,7 @@ ) from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import ( + DatasetCompressionSpec, default_dataset_compression_arg, validate_dataset_compression_arg ) @@ -410,19 +411,7 @@ def search( self """ - self._dataset_compression: Optional[Mapping[str, Any]] - - if isinstance(dataset_compression, bool): - if dataset_compression is True: - self._dataset_compression = default_dataset_compression_arg - else: - self._dataset_compression = None - else: - self._dataset_compression = dataset_compression - - if self._dataset_compression is not None: - self._dataset_compression = validate_dataset_compression_arg( - self._dataset_compression, memory_limit=memory_limit) + self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression) self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, @@ -453,6 +442,43 @@ def search( portfolio_selection=portfolio_selection, ) + def _get_dataset_compression_mapping( + self, + memory_limit: int, + dataset_compression: Union[bool, Mapping[str, Any]] + ) -> Optional[DatasetCompressionSpec]: + """ + Internal function to get value for `self._dataset_compression` + based on the value of `dataset_compression` passed. + + If True, it returns the default_dataset_compression_arg. In case + of a mapping, it is validated and returned as a `DatasetCompressionSpec`. + + If False, it returns None. + + Args: + memory_limit (int): + memory limit of the current search. + dataset_compression (Union[bool, Mapping[str, Any]]): + mapping passed to the `search` function. + + Returns: + Optional[DatasetCompressionSpec]: + Validated data compression spec or None. 
+ """ + dataset_compression_mapping: Optional[Mapping[str, Any]] = None + + if not isinstance(dataset_compression, bool): + dataset_compression_mapping = dataset_compression + elif dataset_compression: + dataset_compression_mapping = default_dataset_compression_arg + + if dataset_compression_mapping is not None: + dataset_compression_mapping = validate_dataset_compression_arg( + dataset_compression_mapping, memory_limit=memory_limit) + + return dataset_compression_mapping + def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index ec0ebdcad..6ea2def0d 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -12,6 +12,7 @@ ) from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import ( + DatasetCompressionSpec, default_dataset_compression_arg, validate_dataset_compression_arg ) @@ -411,19 +412,8 @@ def search( self """ - self._dataset_compression: Optional[Mapping[str, Any]] - if isinstance(dataset_compression, bool): - if dataset_compression is True: - self._dataset_compression = default_dataset_compression_arg - else: - self._dataset_compression = None - else: - self._dataset_compression = dataset_compression - - if self._dataset_compression is not None: - self._dataset_compression = validate_dataset_compression_arg( - self._dataset_compression, memory_limit=memory_limit) + self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression) self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, @@ -454,6 +444,43 @@ def search( portfolio_selection=portfolio_selection, ) + def _get_dataset_compression_mapping( + self, + memory_limit: int, + dataset_compression: Union[bool, Mapping[str, Any]] + ) -> Optional[DatasetCompressionSpec]: + """ + Internal function to get value for `self._dataset_compression` + based on the value of `dataset_compression` passed. + + If True, it returns the default_dataset_compression_arg. In case + of a mapping, it is validated and returned as a `DatasetCompressionSpec`. + + If False, it returns None. + + Args: + memory_limit (int): + memory limit of the current search. + dataset_compression (Union[bool, Mapping[str, Any]]): + mapping passed to the `search` function. + + Returns: + Optional[DatasetCompressionSpec]: + Validated data compression spec or None. 
+ """ + dataset_compression_mapping: Optional[Mapping[str, Any]] = None + + if not isinstance(dataset_compression, bool): + dataset_compression_mapping = dataset_compression + elif dataset_compression: + dataset_compression_mapping = default_dataset_compression_arg + + if dataset_compression_mapping is not None: + dataset_compression_mapping = validate_dataset_compression_arg( + dataset_compression_mapping, memory_limit=memory_limit) + + return dataset_compression_mapping + def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index b9f211283..7da2bd8ed 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -7,7 +7,7 @@ import pandas as pd from pandas.api.types import is_numeric_dtype -import scipy.sparse +from scipy.sparse import issparse, spmatrix import sklearn.utils from sklearn import preprocessing @@ -18,7 +18,11 @@ from sklearn.pipeline import make_pipeline from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes -from autoPyTorch.data.utils import DatasetDTypeContainerType, reduce_dataset_size_if_too_large +from autoPyTorch.data.utils import ( + DatasetCompressionInputType, + DatasetDTypeContainerType, + reduce_dataset_size_if_too_large +) from autoPyTorch.utils.logging_ import PicklableClientLogger @@ -101,7 +105,7 @@ def __init__( dataset_compression: Optional[Mapping[str, Any]] = None, ) -> None: self._dataset_compression = dataset_compression - self._precision: Optional[DatasetDTypeContainerType] = None + self._reduced_dtype: Optional[DatasetDTypeContainerType] = None super().__init__(logger) @staticmethod @@ -151,7 +155,7 @@ def _fit( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if hasattr(X, "iloc") and not scipy.sparse.issparse(X): + if hasattr(X, "iloc") and not issparse(X): X = cast(pd.DataFrame, X) # Treat a column with all instances a NaN as numerical # This will prevent doing encoding to a categorical column made completely @@ -217,7 +221,7 @@ def _fit( def transform( self, X: SupportedFeatTypes, - ) -> np.ndarray: + ) -> Union[np.ndarray, spmatrix, pd.DataFrame]: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. @@ -241,7 +245,7 @@ def transform( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if hasattr(X, "iloc") and not scipy.sparse.issparse(X): + if hasattr(X, "iloc") and not issparse(X): if np.any(pd.isnull(X)): for column in X.columns: if X[column].isna().all(): @@ -268,7 +272,7 @@ def transform( # Sparse related transformations # Not all sparse format support index sorting - if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'): + if issparse(X) and hasattr(X, 'sort_indices'): X.sort_indices() try: @@ -285,20 +289,38 @@ def transform( "numerical or categorical values.") raise e - if ( - ( - isinstance(X, np.ndarray) or scipy.sparse.issparse(X) or hasattr(X, 'iloc') - ) - and self._dataset_compression is not None - ): - if self._precision is not None: - X = X.astype(self._precision) - else: - X = reduce_dataset_size_if_too_large(X, **self._dataset_compression) - self._precision = dict(X.dtypes) if hasattr(X, 'iloc') else X.dtype + X = self._compress_dataset(X) return X + # TODO: modify once we have added subsampling as well. 
+    def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressionInputType:
+        """
+        Compress the dataset. This function ensures that
+        the testing data is converted to the same dtype as
+        the training data.
+
+
+        Args:
+            X (DatasetCompressionInputType):
+                Dataset
+
+        Returns:
+            DatasetCompressionInputType:
+                Compressed dataset.
+        """
+        is_dataframe = hasattr(X, 'iloc')
+        is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
+        if not is_reducible_type or self._dataset_compression is None:
+            return X
+        elif self._reduced_dtype is not None:
+            X = X.astype(self._reduced_dtype)
+            return X
+        else:
+            X = reduce_dataset_size_if_too_large(X, **self._dataset_compression)
+            self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
+            return X
+
     def _check_data(
         self,
         X: SupportedFeatTypes,
@@ -312,7 +334,7 @@ def _check_data(
                 checks) and an encoder fitted in the case the data needs encoding
         """

-        if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X):
+        if not isinstance(X, (np.ndarray, pd.DataFrame)) and not issparse(X):
             raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
                              " scipy sparse and Python Lists, yet, the provided input is"
                              " of type {}".format(type(X))
diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py
index ce69e9d70..a46d55f61 100644
--- a/autoPyTorch/data/utils.py
+++ b/autoPyTorch/data/utils.py
@@ -194,21 +194,21 @@ def reduce_precision(
     For dataframe, the column's precision is reduced using pd.to_numeric.

     Args:
-        X: DatasetCompressionInputType
+        X (DatasetCompressionInputType):
             The data to reduce precision of.

     Returns:
         Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]
            Returns the reduced data X along with the dtypes it had and the dtypes it was reduced to.
""" - precision: Optional[DatasetDTypeContainerType] = None + reduced_dtypes: Optional[DatasetDTypeContainerType] = None if isinstance(X, np.ndarray) or issparse(X): dtypes = X.dtype if X.dtype not in supported_precision_reductions: raise ValueError(f"X.dtype = {X.dtype} not equal to any supported" f" {supported_precision_reductions}") - precision = reduction_mapping[X.dtype] - X = X.astype(precision) + reduced_dtypes = reduction_mapping[X.dtype] + X = X.astype(reduced_dtypes) elif hasattr(X, 'iloc'): dtypes = dict(X.dtypes) @@ -226,15 +226,16 @@ def reduce_precision( X[integer_columns] = X[integer_columns].apply(lambda column: pd.to_numeric(column, downcast='integer')) if len(float_columns) > 0: X[float_columns] = X[float_columns].apply(lambda column: pd.to_numeric(column, downcast='float')) - precision = dict(X.dtypes) + reduced_dtypes = dict(X.dtypes) else: raise ValueError(f"Unrecognised data type of X, expected data type to " f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(X)}") - return X, precision, dtypes + return X, reduced_dtypes, dtypes def megabytes(arr: DatasetCompressionInputType) -> float: + if isinstance(arr, np.ndarray): memory_in_bytes = arr.nbytes elif issparse(arr): @@ -242,7 +243,9 @@ def megabytes(arr: DatasetCompressionInputType) -> float: elif hasattr(arr, 'iloc'): memory_in_bytes = arr.memory_usage(index=True, deep=True).sum() else: - return 0 + raise ValueError(f"Unrecognised data type of X, expected data type to " + f"be in (np.ndarray, spmatrix, pd.DataFrame) but got :{type(arr)}") + return float(memory_in_bytes / (2**20)) @@ -287,17 +290,16 @@ def reduce_dataset_size_if_too_large( The reduced X if reductions were needed """ - precision: Optional[DatasetDTypeContainerType] = None for method in methods: if method == 'precision': # If the dataset is too big for the allocated memory, # we then try to reduce the precision if it's a high precision dataset if megabytes(X) > memory_allocation: - X, precision, dtypes = reduce_precision(X) + X, reduced_dtypes, dtypes = reduce_precision(X) warnings.warn( f'Dataset too large for allocated memory {memory_allocation}MB, ' - f'reduced the precision from {dtypes} to {precision}', + f'reduced the precision from {dtypes} to {reduced_dtypes}', ) else: raise ValueError(f"Unknown operation `{method}`") diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 5eb28309d..3d352d765 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -591,7 +591,7 @@ def test_featurevalidator_reduce_precision(input_data_featuretest): validator.fit(X_train=X_train) transformed_X_train = validator.transform(X_train.copy()) - assert validator._precision is not None + assert validator._reduced_dtype is not None assert megabytes(transformed_X_train) < megabytes(X_train) transformed_X_test = validator.transform(X_test.copy()) @@ -601,4 +601,4 @@ def test_featurevalidator_reduce_precision(input_data_featuretest): assert all(transformed_X_train.dtypes == validator._precision) else: assert transformed_X_train.dtype == transformed_X_test.dtype - assert transformed_X_test.dtype == validator._precision + assert transformed_X_test.dtype == validator._reduced_dtype From a0c9f71a96b5ebb5103ef46ad0d91f8c28114d0f Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 25 Feb 2022 12:48:55 +0100 Subject: [PATCH 09/12] refactor code --- autoPyTorch/api/tabular_classification.py | 43 +----- autoPyTorch/api/tabular_regression.py | 43 +----- autoPyTorch/data/utils.py 
| 155 ++++++++++++++-------- test/test_data/test_utils.py | 33 +++++ 4 files changed, 133 insertions(+), 141 deletions(-) diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index a69471da4..684c22a7b 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -12,9 +12,7 @@ ) from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import ( - DatasetCompressionSpec, - default_dataset_compression_arg, - validate_dataset_compression_arg + get_dataset_compression_mapping ) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( @@ -411,7 +409,7 @@ def search( self """ - self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression) + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, @@ -442,43 +440,6 @@ def search( portfolio_selection=portfolio_selection, ) - def _get_dataset_compression_mapping( - self, - memory_limit: int, - dataset_compression: Union[bool, Mapping[str, Any]] - ) -> Optional[DatasetCompressionSpec]: - """ - Internal function to get value for `self._dataset_compression` - based on the value of `dataset_compression` passed. - - If True, it returns the default_dataset_compression_arg. In case - of a mapping, it is validated and returned as a `DatasetCompressionSpec`. - - If False, it returns None. - - Args: - memory_limit (int): - memory limit of the current search. - dataset_compression (Union[bool, Mapping[str, Any]]): - mapping passed to the `search` function. - - Returns: - Optional[DatasetCompressionSpec]: - Validated data compression spec or None. 
- """ - dataset_compression_mapping: Optional[Mapping[str, Any]] = None - - if not isinstance(dataset_compression, bool): - dataset_compression_mapping = dataset_compression - elif dataset_compression: - dataset_compression_mapping = default_dataset_compression_arg - - if dataset_compression_mapping is not None: - dataset_compression_mapping = validate_dataset_compression_arg( - dataset_compression_mapping, memory_limit=memory_limit) - - return dataset_compression_mapping - def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 6ea2def0d..d766bad68 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -12,9 +12,7 @@ ) from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.data.utils import ( - DatasetCompressionSpec, - default_dataset_compression_arg, - validate_dataset_compression_arg + get_dataset_compression_mapping ) from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( @@ -413,7 +411,7 @@ def search( """ - self._dataset_compression = self._get_dataset_compression_mapping(memory_limit, dataset_compression) + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) self.dataset, self.input_validator = self._get_dataset_input_validator( X_train=X_train, @@ -444,43 +442,6 @@ def search( portfolio_selection=portfolio_selection, ) - def _get_dataset_compression_mapping( - self, - memory_limit: int, - dataset_compression: Union[bool, Mapping[str, Any]] - ) -> Optional[DatasetCompressionSpec]: - """ - Internal function to get value for `self._dataset_compression` - based on the value of `dataset_compression` passed. - - If True, it returns the default_dataset_compression_arg. In case - of a mapping, it is validated and returned as a `DatasetCompressionSpec`. - - If False, it returns None. - - Args: - memory_limit (int): - memory limit of the current search. - dataset_compression (Union[bool, Mapping[str, Any]]): - mapping passed to the `search` function. - - Returns: - Optional[DatasetCompressionSpec]: - Validated data compression spec or None. - """ - dataset_compression_mapping: Optional[Mapping[str, Any]] = None - - if not isinstance(dataset_compression, bool): - dataset_compression_mapping = dataset_compression - elif dataset_compression: - dataset_compression_mapping = default_dataset_compression_arg - - if dataset_compression_mapping is not None: - dataset_compression_mapping = validate_dataset_compression_arg( - dataset_compression_mapping, memory_limit=memory_limit) - - return dataset_compression_mapping - def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index a46d55f61..a2d12e85c 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -38,6 +38,43 @@ } +def get_dataset_compression_mapping( + memory_limit: int, + dataset_compression: Union[bool, Mapping[str, Any]] +) -> Optional[DatasetCompressionSpec]: + """ + Internal function to get value for `BaseTask._dataset_compression` + based on the value of `dataset_compression` passed. + + If True, it returns the default_dataset_compression_arg. In case + of a mapping, it is validated and returned as a `DatasetCompressionSpec`. + + If False, it returns None. + + Args: + memory_limit (int): + memory limit of the current search. 
+ dataset_compression (Union[bool, Mapping[str, Any]]): + mapping passed to the `search` function. + + Returns: + Optional[DatasetCompressionSpec]: + Validated data compression spec or None. + """ + dataset_compression_mapping: Optional[Mapping[str, Any]] = None + + if not isinstance(dataset_compression, bool): + dataset_compression_mapping = dataset_compression + elif dataset_compression: + dataset_compression_mapping = default_dataset_compression_arg + + if dataset_compression_mapping is not None: + dataset_compression_mapping = validate_dataset_compression_arg( + dataset_compression_mapping, memory_limit=memory_limit) + + return dataset_compression_mapping + + def validate_dataset_compression_arg( dataset_compression: Mapping[str, Any], memory_limit: int @@ -54,78 +91,78 @@ def validate_dataset_compression_arg( DatasetCompressionSpec The validated and correct dataset compression spec """ - if isinstance(dataset_compression, Mapping): - # Fill with defaults if they don't exist - dataset_compression = { - **default_dataset_compression_arg, - **dataset_compression - } - - # Must contain known keys - if set(dataset_compression.keys()) != set(default_dataset_compression_arg.keys()): - raise ValueError( - f"Unknown key in dataset_compression, {list(dataset_compression.keys())}." - f"\nPossible keys are {list(default_dataset_compression_arg.keys())}" - ) + if not isinstance(dataset_compression, Mapping): + raise ValueError( + f"Unknown type for `dataset_compression` {type(dataset_compression)}" + f"\ndataset_compression = {dataset_compression}" + ) - memory_allocation = dataset_compression["memory_allocation"] + # Fill with defaults if they don't exist + dataset_compression = { + **default_dataset_compression_arg, + **dataset_compression + } - # "memory_allocation" must be float or int - if not (isinstance(memory_allocation, float) or isinstance(memory_allocation, int)): - raise ValueError( - "key 'memory_allocation' must be an `int` or `float`" - f"\ntype = {memory_allocation}" - f"\ndataset_compression = {dataset_compression}" - ) + # Must contain known keys + if set(dataset_compression.keys()) != set(default_dataset_compression_arg.keys()): + raise ValueError( + f"Unknown key in dataset_compression, {list(dataset_compression.keys())}." 
+ f"\nPossible keys are {list(default_dataset_compression_arg.keys())}" + ) - # "memory_allocation" if absolute, should be > 0 and < memory_limit - if isinstance(memory_allocation, int) and not (0 < memory_allocation < memory_limit): - raise ValueError( - f"key 'memory_allocation' if int must be in (0, memory_limit={memory_limit})" - f"\nmemory_allocation = {memory_allocation}" - f"\ndataset_compression = {dataset_compression}" - ) + memory_allocation = dataset_compression["memory_allocation"] - # "memory_allocation" must be in (0,1) if float - if isinstance(memory_allocation, float): - if not (0.0 < memory_allocation < 1.0): - raise ValueError( - "key 'memory_allocation' if float must be in (0, 1)" - f"\nmemory_allocation = {memory_allocation}" - f"\ndataset_compression = {dataset_compression}" - ) - # convert to int so we can directly use - dataset_compression["memory_allocation"] = floor(memory_allocation * memory_limit) - - # "methods" must be non-empty sequence - if ( - not isinstance(dataset_compression["methods"], Sequence) - or len(dataset_compression["methods"]) <= 0 - ): - raise ValueError( - "key 'methods' must be a non-empty list" - f"\nmethods = {dataset_compression['methods']}" - f"\ndataset_compression = {dataset_compression}" - ) + # "memory_allocation" must be float or int + if not (isinstance(memory_allocation, float) or isinstance(memory_allocation, int)): + raise ValueError( + "key 'memory_allocation' must be an `int` or `float`" + f"\ntype = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) - # "methods" must contain known methods - if any( - method not in cast(Sequence, default_dataset_compression_arg["methods"]) # mypy - for method in dataset_compression["methods"] - ): + # "memory_allocation" if absolute, should be > 0 and < memory_limit + if isinstance(memory_allocation, int) and not (0 < memory_allocation < memory_limit): + raise ValueError( + f"key 'memory_allocation' if int must be in (0, memory_limit={memory_limit})" + f"\nmemory_allocation = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "memory_allocation" must be in (0,1) if float + if isinstance(memory_allocation, float): + if not (0.0 < memory_allocation < 1.0): raise ValueError( - f"key 'methods' can only contain {default_dataset_compression_arg['methods']}" - f"\nmethods = {dataset_compression['methods']}" + "key 'memory_allocation' if float must be in (0, 1)" + f"\nmemory_allocation = {memory_allocation}" f"\ndataset_compression = {dataset_compression}" ) + # convert to int so we can directly use + dataset_compression["memory_allocation"] = floor(memory_allocation * memory_limit) + + # "methods" must be non-empty sequence + if ( + not isinstance(dataset_compression["methods"], Sequence) + or len(dataset_compression["methods"]) <= 0 + ): + raise ValueError( + "key 'methods' must be a non-empty list" + f"\nmethods = {dataset_compression['methods']}" + f"\ndataset_compression = {dataset_compression}" + ) - return cast(DatasetCompressionSpec, dataset_compression) - else: + # "methods" must contain known methods + if any( + method not in cast(Sequence, default_dataset_compression_arg["methods"]) # mypy + for method in dataset_compression["methods"] + ): raise ValueError( - f"Unknown type for `dataset_compression` {type(dataset_compression)}" + f"key 'methods' can only contain {default_dataset_compression_arg['methods']}" + f"\nmethods = {dataset_compression['methods']}" f"\ndataset_compression = {dataset_compression}" ) + return 
cast(DatasetCompressionSpec, dataset_compression) + class _DtypeReductionMapping(Mapping): """ diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index 7f47469ef..a598b139f 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -1,3 +1,5 @@ +from tkinter.tix import Tree +from typing import Mapping import numpy as np from pandas.testing import assert_frame_equal @@ -7,6 +9,8 @@ from sklearn.datasets import fetch_openml from autoPyTorch.data.utils import ( + DatasetCompressionSpec, + get_dataset_compression_mapping, megabytes, reduce_dataset_size_if_too_large, reduce_precision, @@ -72,3 +76,32 @@ def test_error_raised_reduce_precision(): # in case X is not an expected type with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to .*'): reduce_precision(X='not expected') + + +def _verify_dataset_compression_mapping(mapping): + assert isinstance(mapping, Mapping) + assert 'methods' in mapping + assert 'memory_allocation' in mapping + + +@pytest.mark.parametrize('memory_limit', [2048]) +def test_get_dataset_compression_mapping(memory_limit): + """ + Tests the functionalities of `get_dataset_compression_mapping` + """ + dataset_compression_mapping = get_dataset_compression_mapping( + dataset_compression=True, + memory_limit=memory_limit) + _verify_dataset_compression_mapping(dataset_compression_mapping) + + dataset_compression_mapping = get_dataset_compression_mapping( + dataset_compression={'memory_allocation': 0.01, 'methods': ['precision']}, + memory_limit=memory_limit + ) + _verify_dataset_compression_mapping(dataset_compression_mapping) + + dataset_compression_mapping = get_dataset_compression_mapping( + dataset_compression=False, + memory_limit=memory_limit + ) + assert dataset_compression_mapping is None From a67ac2ae230194dad83b3d0ae6dea3cafdf1c4c0 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 25 Feb 2022 13:25:33 +0100 Subject: [PATCH 10/12] check if mapping is the same as expected --- test/test_data/test_utils.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index a598b139f..c26ad9fb3 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -1,5 +1,5 @@ -from tkinter.tix import Tree from typing import Mapping + import numpy as np from pandas.testing import assert_frame_equal @@ -9,7 +9,7 @@ from sklearn.datasets import fetch_openml from autoPyTorch.data.utils import ( - DatasetCompressionSpec, + default_dataset_compression_arg, get_dataset_compression_mapping, megabytes, reduce_dataset_size_if_too_large, @@ -78,10 +78,11 @@ def test_error_raised_reduce_precision(): reduce_precision(X='not expected') -def _verify_dataset_compression_mapping(mapping): +def _verify_dataset_compression_mapping(mapping, expected_mapping): assert isinstance(mapping, Mapping) assert 'methods' in mapping assert 'memory_allocation' in mapping + assert mapping == expected_mapping @pytest.mark.parametrize('memory_limit', [2048]) @@ -92,13 +93,17 @@ def test_get_dataset_compression_mapping(memory_limit): dataset_compression_mapping = get_dataset_compression_mapping( dataset_compression=True, memory_limit=memory_limit) - _verify_dataset_compression_mapping(dataset_compression_mapping) + # validation converts the memory allocation from float to integer based on the memory limit + expected_mapping = validate_dataset_compression_arg(default_dataset_compression_arg, memory_limit) + 
_verify_dataset_compression_mapping(dataset_compression_mapping, expected_mapping) + mapping = {'memory_allocation': 0.01, 'methods': ['precision']} dataset_compression_mapping = get_dataset_compression_mapping( - dataset_compression={'memory_allocation': 0.01, 'methods': ['precision']}, + dataset_compression=mapping, memory_limit=memory_limit ) - _verify_dataset_compression_mapping(dataset_compression_mapping) + expected_mapping = validate_dataset_compression_arg(mapping, memory_limit) + _verify_dataset_compression_mapping(dataset_compression_mapping, expected_mapping) dataset_compression_mapping = get_dataset_compression_mapping( dataset_compression=False, From 687a74a3e2176a8bc90e17435132c273765caf5a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 25 Feb 2022 14:42:24 +0100 Subject: [PATCH 11/12] update precision reduction for dataframes and tests --- autoPyTorch/data/utils.py | 19 ++++++------------- test/test_data/test_utils.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index a2d12e85c..43dacf543 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -18,7 +18,6 @@ import numpy as np import pandas as pd -from pandas.api.types import is_float_dtype, is_numeric_dtype from scipy.sparse import issparse, spmatrix @@ -246,23 +245,17 @@ def reduce_precision( f" {supported_precision_reductions}") reduced_dtypes = reduction_mapping[X.dtype] X = X.astype(reduced_dtypes) + elif hasattr(X, 'iloc'): dtypes = dict(X.dtypes) - integer_columns = [] - float_columns = [] + col_names = X.dtypes.index - for col, dtype in dtypes.items(): - if is_numeric_dtype(dtype): - if is_float_dtype(dtype): - float_columns.append(col) - else: - integer_columns.append(col) + float_cols = col_names[[dt.name.startswith("float") for dt in X.dtypes.values]] + int_cols = col_names[[dt.name.startswith("int") for dt in X.dtypes.values]] + X[int_cols] = X[int_cols].apply(lambda column: pd.to_numeric(column, downcast='integer')) + X[float_cols] = X[float_cols].apply(lambda column: pd.to_numeric(column, downcast='float')) - if len(integer_columns) > 0: - X[integer_columns] = X[integer_columns].apply(lambda column: pd.to_numeric(column, downcast='integer')) - if len(float_columns) > 0: - X[float_columns] = X[float_columns].apply(lambda column: pd.to_numeric(column, downcast='float')) reduced_dtypes = dict(X.dtypes) else: raise ValueError(f"Unrecognised data type of X, expected data type to " diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index c26ad9fb3..ce584197b 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -110,3 +110,18 @@ def test_get_dataset_compression_mapping(memory_limit): memory_limit=memory_limit ) assert dataset_compression_mapping is None + + +def test_unsupported_errors(): + """ + Checks if errors are raised when unsupported data is passed to reduce + """ + X = np.array([ + ['a', 'b', 'c', 'a', 'b', 'c'], + ['a', 'b', 'd', 'r', 'b', 'c']]) + with pytest.raises(ValueError, match=r'X.dtype = .*'): + reduce_dataset_size_if_too_large(X, 0) + + X = [[1, 2], [2, 3]] + with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'): + reduce_dataset_size_if_too_large(X, 0) \ No newline at end of file From b10465142f0c1aa82d9737cbe4725954f1d57dfe Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 25 Feb 2022 14:52:20 +0100 Subject: [PATCH 12/12] fix flake --- test/test_data/test_utils.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index ce584197b..505860a94 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -124,4 +124,4 @@ def test_unsupported_errors(): X = [[1, 2], [2, 3]] with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'): - reduce_dataset_size_if_too_large(X, 0) \ No newline at end of file + reduce_dataset_size_if_too_large(X, 0)
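
For reference, a minimal sketch of how the utilities introduced in autoPyTorch/data/utils.py fit together; it is not taken from the patches above, and the memory limit, array shape, and the float64-to-float32 downcast target are illustrative assumptions rather than values read from the diff:

import numpy as np

from autoPyTorch.data.utils import (
    get_dataset_compression_mapping,
    megabytes,
    reduce_dataset_size_if_too_large,
)

# Resolve the user-facing `dataset_compression` argument into a validated spec.
# A float 'memory_allocation' is converted into an absolute budget in MB,
# here floor(0.05 * 2048) = 102 MB; 'methods' may only contain known methods.
spec = get_dataset_compression_mapping(
    memory_limit=2048,
    dataset_compression={'memory_allocation': 0.05, 'methods': ['precision']},
)

# ~152 MB of float64 data exceeds the 102 MB budget, so the 'precision'
# method reduces it (float64 -> float32 is the assumed reduction target)
# and a warning describing the reduction is emitted.
X = np.ones((200_000, 100), dtype=np.float64)
X_reduced = reduce_dataset_size_if_too_large(X, **spec)

print(X_reduced.dtype)       # float32, under the assumed mapping
print(megabytes(X_reduced))  # roughly half of megabytes(X)

The same resolved spec is what search(..., dataset_compression=True) obtains via get_dataset_compression_mapping and forwards to TabularInputValidator, so that TabularFeatureValidator._compress_dataset can re-apply the fitted dtypes when transforming the test split.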