From 295a307387a0ab34c52177c024151c47a5365244 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 22 Feb 2022 20:29:22 +0900 Subject: [PATCH 1/4] [fix] Fix the task inference issue mentioned in #352 Since sklearn task inference regards targets with integers as a classification task, I modified target_validator so that we always cast targets for regression to float. This workaround is mentioned in the reference below: https://github.com/scikit-learn/scikit-learn/issues/8952 --- autoPyTorch/data/base_feature_validator.py | 18 ++--- autoPyTorch/data/base_target_validator.py | 24 +++--- autoPyTorch/data/base_validator.py | 28 +++---- autoPyTorch/data/tabular_feature_validator.py | 22 ++--- autoPyTorch/data/tabular_target_validator.py | 80 ++++++++++--------- 5 files changed, 88 insertions(+), 84 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 6ef7cae6b..2d0ecf988 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -12,7 +12,7 @@ from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_FEAT_TYPES = Union[ +SupportedFeatTypes = Union[ List, pd.DataFrame, np.ndarray, @@ -68,8 +68,8 @@ def __init__( def fit( self, - X_train: SUPPORTED_FEAT_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + X_train: SupportedFeatTypes, + X_test: Optional[SupportedFeatTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features. @@ -77,10 +77,10 @@ def fit( CSR sparse data types are also supported Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of data used for checking """ @@ -109,11 +109,11 @@ def fit( def _fit( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> BaseEstimator: """ Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding Returns: @@ -124,11 +124,11 @@ def _fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> np.ndarray: """ Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features, whose categorical features are going to be transformed diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index 393f3d85b..1b8ce124a 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -12,7 +12,7 @@ from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_TARGET_TYPES = Union[ +SupportedTargetTypes = Union[ List, pd.Series, pd.DataFrame, @@ -69,17 +69,17 @@ def __init__(self, def fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the targets The supported data types are List, numpy arrays and pandas DataFrames. 
Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) A set of targets set aside for training - y_test (Union[SUPPORTED_TARGET_TYPES]) + y_test (Union[SupportedTargetTypes]) A hold out set of data used of the targets. It is also used to fit the categories of the encoder. """ @@ -128,26 +128,26 @@ def fit( def _fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) The labels of the current task. They are going to be encoded in case of classification - y_test (Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SupportedTargetTypes]) A holdout set of labels """ raise NotImplementedError() def transform( self, - y: Union[SUPPORTED_TARGET_TYPES], + y: Union[SupportedTargetTypes], ) -> np.ndarray: """ Args: - y (SUPPORTED_TARGET_TYPES) + y (SupportedTargetTypes) A set of targets that are going to be encoded if the current task is classification Returns: @@ -158,7 +158,7 @@ def transform( def inverse_transform( self, - y: SUPPORTED_TARGET_TYPES, + y: SupportedTargetTypes, ) -> np.ndarray: """ Revert any encoding transformation done on a target array diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py index 13bb421c7..bebddff49 100644 --- a/autoPyTorch/data/base_validator.py +++ b/autoPyTorch/data/base_validator.py @@ -7,8 +7,8 @@ from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES -from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES +from autoPyTorch.data.base_feature_validator import SupportedFeatTypes +from autoPyTorch.data.base_target_validator import SupportedTargetTypes class BaseInputValidator(BaseEstimator): @@ -40,10 +40,10 @@ def __init__( def fit( self, - X_train: SUPPORTED_FEAT_TYPES, - y_train: SUPPORTED_TARGET_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + X_train: SupportedFeatTypes, + y_train: SupportedTargetTypes, + X_test: Optional[SupportedFeatTypes] = None, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features, and @@ -59,15 +59,15 @@ def fit( + If performing a classification task, the data is going to be encoded Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks). If this data contains categorical columns, an encoder is going to be instantiated and trained with this data. - y_train (SUPPORTED_TARGET_TYPES): + y_train (SupportedTargetTypes): A set of targets that are going to be encoded if the task is for classification - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of features used for checking - y_test (SUPPORTED_TARGET_TYPES): + y_test (SupportedTargetTypes): A hold out set of targets used for checking. Additionally, if the current task is a classification task, this y_test categories are also going to be used to fit a pre-processing encoding (to prevent errors on unseen classes). 
@@ -96,16 +96,16 @@ def fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, - y: Optional[SUPPORTED_TARGET_TYPES] = None, + X: SupportedFeatTypes, + y: Optional[SupportedTargetTypes] = None, ) -> Tuple[np.ndarray, Optional[np.ndarray]]: """ Transform the given target or features to a numpy array Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features to transform - y (Optional[SUPPORTED_TARGET_TYPES]): + y (Optional[SupportedTargetTypes]): A set of targets to transform Returns: diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 27ed18cfc..4bab001c6 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -16,7 +16,7 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import make_pipeline -from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES +from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes def _create_column_transformer( @@ -117,7 +117,7 @@ def _comparator(cmp1: str, cmp2: str) -> int: def _fit( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> BaseEstimator: """ In case input data is a pandas DataFrame, this utility encodes the user provided @@ -125,7 +125,7 @@ def _fit( will be able to use Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and an encoder fitted in the case the data needs encoding @@ -204,14 +204,14 @@ def _fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> np.ndarray: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features, whose categorical features are going to be transformed @@ -276,13 +276,13 @@ def transform( def _check_data( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> None: """ Feature dimensionality and data type checks Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and an encoder fitted in the case the data needs encoding """ @@ -429,8 +429,8 @@ def _get_columns_to_encode( def list_to_dataframe( self, - X_train: SUPPORTED_FEAT_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + X_train: SupportedFeatTypes, + X_test: Optional[SupportedFeatTypes] = None, ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: """ Converts a list to a pandas DataFrame. In this process, column types are inferred. 
@@ -438,10 +438,10 @@ def list_to_dataframe( If test data is provided, we proactively match it to train data Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of data used for checking Returns: diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index c37dc81c3..a60c45831 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union, cast +from typing import List, Optional, cast import numpy as np @@ -13,14 +13,19 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.multiclass import type_of_target -from autoPyTorch.data.base_target_validator import BaseTargetValidator, SUPPORTED_TARGET_TYPES +from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes + + +def _check_and_to_numpy(y: SupportedTargetTypes) -> np.ndarray: + """ sklearn check array will make sure we have the correct numerical features for the array """ + return sklearn.utils.check_array(y, force_all_finite=True, accept_sparse='csr', ensure_2d=False) class TabularTargetValidator(BaseTargetValidator): def _fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ If dealing with classification, this utility encodes the targets. @@ -29,10 +34,10 @@ def _fit( errors Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) The labels of the current task. They are going to be encoded in case of classification - y_test (Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SupportedTargetTypes]) A holdout set of labels """ if not self.is_classification or self.type_of_target == 'multilabel-indicator': @@ -94,16 +99,34 @@ def _fit( return self + def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: + if self.encoder is None: + return _check_and_to_numpy(y) + + # remove ravel warning from pandas Series + shape = np.shape(y) + if len(shape) > 1: + y = self.encoder.transform(y) + elif hasattr(y, 'iloc'): + # The Ordinal encoder expects a 2 dimensional input. + # The targets are 1 dimensional, so reshape to match the expected shape + y = cast(pd.DataFrame, y) + y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1) + else: + y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) + + return _check_and_to_numpy(y) + def transform( self, - y: Union[SUPPORTED_TARGET_TYPES], + y: SupportedTargetTypes, ) -> np.ndarray: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. Args: - y (SUPPORTED_TARGET_TYPES) + y (SupportedTargetTypes) A set of targets that are going to be encoded if the current task is classification @@ -116,47 +139,28 @@ def transform( # Check the data here so we catch problems on new test data self._check_data(y) + y = self._transform_by_encoder(y) - if self.encoder is not None: - # remove ravel warning from pandas Series - shape = np.shape(y) - if len(shape) > 1: - y = self.encoder.transform(y) - else: - # The Ordinal encoder expects a 2 dimensional input. 
-                # The targets are 1 dimensional, so reshape to match the expected shape
-                if hasattr(y, 'iloc'):
-                    y = cast(pd.DataFrame, y)
-                    y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1)
-                else:
-                    y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1)
-
-        # sklearn check array will make sure we have the
-        # correct numerical features for the array
-        # Also, a numpy array will be created
-        y = sklearn.utils.check_array(
-            y,
-            force_all_finite=True,
-            accept_sparse='csr',
-            ensure_2d=False,
-        )
-
-        # When translating a dataframe to numpy, make sure we
-        # honor the ravel requirement
+        # When translating a dataframe to numpy, make sure we honor the ravel requirement
         if y.ndim == 2 and y.shape[1] == 1:
             y = np.ravel(y)
 
+        if not self.is_classification:
+            # Regression targets must be cast to float
+            # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952
+            y = y.astype(dtype=np.float64)
+
         return y
 
     def inverse_transform(
         self,
-        y: SUPPORTED_TARGET_TYPES,
+        y: SupportedTargetTypes,
     ) -> np.ndarray:
         """
         Revert any encoding transformation done on a target array
 
         Args:
-            y (Union[np.ndarray, pd.DataFrame, pd.Series]):
+            y (SupportedTargetTypes):
                 Target array to be transformed back to original form before encoding
         Returns:
             np.ndarray:
@@ -187,13 +191,13 @@ def inverse_transform(
 
     def _check_data(
         self,
-        y: SUPPORTED_TARGET_TYPES,
+        y: SupportedTargetTypes,
     ) -> None:
         """
         Perform dimensionality and data type checks on the targets
 
         Args:
-            y (Union[np.ndarray, pd.DataFrame, pd.Series]):
+            y (SupportedTargetTypes):
                 A set of features whose dimensionality and data type is going
                 to be checked
         """

From c926036257f2edfab492f5d83348063c822f1ea5 Mon Sep 17 00:00:00 2001
From: nabenabe0928
Date: Wed, 23 Feb 2022 04:12:57 +0900
Subject: [PATCH 2/4] [fix] [test] Add a small number to regression labels and add tests

Since regression targets are required to be float and sklearn only
infers a continuous task when the values have a fractional part, I
added a workaround that adds the smallest feasible fraction to the
target array so that sklearn does not mis-infer the task type. I also
added tests that check we get the expected results for extreme cases.

---
 autoPyTorch/data/tabular_target_validator.py | 14 +++++--
 autoPyTorch/datasets/base_dataset.py         | 40 +++++++++++++++-----
 test/test_api/test_api.py                    | 28 ++++++++++++++
 3 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py
index a60c45831..81835fccd 100644
--- a/autoPyTorch/data/tabular_target_validator.py
+++ b/autoPyTorch/data/tabular_target_validator.py
@@ -145,10 +145,18 @@ def transform(
         if y.ndim == 2 and y.shape[1] == 1:
             y = np.ravel(y)
 
-        if not self.is_classification:
-            # Regression targets must be cast to float
+        if not self.is_classification and "continuous" not in type_of_target(y):
+            # Regression targets must have numbers after a decimal point.
             # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952
-            y = y.astype(dtype=np.float64)
+            y_min = np.abs(y).min()
+            offset = y_min * 1e-16  # Sufficiently small number
+            if y_min > 1e15:
+                raise ValueError(
+                    "The minimum value for the target labels of regression tasks must be smaller than "
+                    f"1e15 to avoid errors caused by an overflow, but got {y_min}"
+                )
+
+            y = y.astype(dtype=np.float64) + offset  # Since it is all integer, we can just add a random small number
 
         return y
 
diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
index 0f37e7938..d08a4f5b4 100644
--- a/autoPyTorch/datasets/base_dataset.py
+++ b/autoPyTorch/datasets/base_dataset.py
@@ -49,6 +49,36 @@ def type_check(train_tensors: BaseDatasetInputType,
             check_valid_data(val_tensors[i])
 
 
+def _get_output_properties(train_tensors: BaseDatasetInputType) -> Tuple[int, str]:
+    """
+    Return the output dimension and the output type given the training data.
+
+    Args:
+        train_tensors (BaseDatasetInputType):
+            Training data.
+
+    Returns:
+        output_dim (int):
+            The dimension of outputs.
+        output_type (str):
+            The output type according to sklearn specification.
+    """
+    if isinstance(train_tensors, Dataset):
+        target_labels = np.array([sample[-1] for sample in train_tensors])
+    else:
+        target_labels = np.array(train_tensors[1])
+
+    output_type: str = type_of_target(target_labels)
+    if STRING_TO_OUTPUT_TYPES.get(output_type, None) in CLASSIFICATION_OUTPUTS:
+        output_dim = len(np.unique(target_labels))
+    elif target_labels.ndim > 1:
+        output_dim = target_labels.shape[-1]
+    else:
+        output_dim = 1
+
+    return output_dim, output_type
+
+
 class TransformSubset(Subset):
     """Wrapper of BaseDataset for splitted datasets
 
@@ -132,15 +162,7 @@ def __init__(
         self.issparse: bool = issparse(self.train_tensors[0])
         self.input_shape: Tuple[int] = self.train_tensors[0].shape[1:]
         if len(self.train_tensors) == 2 and self.train_tensors[1] is not None:
-            self.output_type: str = type_of_target(self.train_tensors[1])
-
-            if (
-                self.output_type in STRING_TO_OUTPUT_TYPES
-                and STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS
-            ):
-                self.output_shape = len(np.unique(self.train_tensors[1]))
-            else:
-                self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1
+            self.output_shape, self.output_type = _get_output_properties(self.train_tensors)
 
         # TODO: Look for a criteria to define small enough to preprocess
         self.is_small_preprocess = True
diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py
index e3603f668..a538115fc 100644
--- a/test/test_api/test_api.py
+++ b/test/test_api/test_api.py
@@ -904,3 +904,31 @@ def test_tabular_classification_test_evaluator(openml_id, backend, n_samples):
     assert 'opt_loss' in incumbent_results, "run history: {}, successful_num_run: {}".format(estimator.run_history.data,
                                                                                              successful_num_run)
     assert 'train_loss' in incumbent_results
+
+
+@pytest.mark.parametrize("ans,task_class", (
+    ("continuous", TabularRegressionTask),
+    ("multiclass", TabularClassificationTask))
+)
+def test_task_inference(ans, task_class, backend):
+    # Get the data and check that contents of data-manager make sense
+    X = np.random.random((5, 1))
+    y = np.array([0, 1, 2, 3, 4]) + 10 ** 15
+
+    X_train, _, y_train, _ = sklearn.model_selection.train_test_split(X, y, random_state=42)
+
+    estimator = task_class(
+        backend=backend,
+        resampling_strategy=HoldoutValTypes.holdout_validation,
+        resampling_strategy_args=None,
+        seed=42,
+    )
+    dataset = 
estimator.get_dataset(X_train, y_train) + assert dataset.output_type == ans + + y_train += 1 + if ans == 'continuous': + with pytest.raises(ValueError): # ValueError due to `Too large value` + estimator.get_dataset(X_train, y_train) + else: + estimator.get_dataset(X_train, y_train) From d0bd582c5f9c2f65d82c71b31cb377768244a09f Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Wed, 23 Feb 2022 21:10:26 +0900 Subject: [PATCH 3/4] [fix] [test] Adapt the modification of targets to scipy.sparse.xxx_matrix --- autoPyTorch/data/base_feature_validator.py | 16 +----- autoPyTorch/data/base_target_validator.py | 17 +----- autoPyTorch/data/tabular_target_validator.py | 59 +++++++++++--------- autoPyTorch/utils/common.py | 9 +++ test/test_data/test_target_validator.py | 6 +- test/test_datasets/test_base_dataset.py | 19 +++++++ 6 files changed, 67 insertions(+), 59 deletions(-) create mode 100644 test/test_datasets/test_base_dataset.py diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 2d0ecf988..2c4ce4de9 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -5,25 +5,13 @@ import pandas as pd -import scipy.sparse - from sklearn.base import BaseEstimator +from autoPyTorch.utils.common import SparseMatrixType from autoPyTorch.utils.logging_ import PicklableClientLogger -SupportedFeatTypes = Union[ - List, - pd.DataFrame, - np.ndarray, - scipy.sparse.bsr_matrix, - scipy.sparse.coo_matrix, - scipy.sparse.csc_matrix, - scipy.sparse.csr_matrix, - scipy.sparse.dia_matrix, - scipy.sparse.dok_matrix, - scipy.sparse.lil_matrix, -] +SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, SparseMatrixType] class BaseFeatureValidator(BaseEstimator): diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index 1b8ce124a..ddbe384cb 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -5,26 +5,13 @@ import pandas as pd -import scipy.sparse - from sklearn.base import BaseEstimator +from autoPyTorch.utils.common import SparseMatrixType from autoPyTorch.utils.logging_ import PicklableClientLogger -SupportedTargetTypes = Union[ - List, - pd.Series, - pd.DataFrame, - np.ndarray, - scipy.sparse.bsr_matrix, - scipy.sparse.coo_matrix, - scipy.sparse.csc_matrix, - scipy.sparse.csr_matrix, - scipy.sparse.dia_matrix, - scipy.sparse.dok_matrix, - scipy.sparse.lil_matrix, -] +SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, SparseMatrixType] class BaseTargetValidator(BaseEstimator): diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index 81835fccd..9ff7828f5 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -1,4 +1,4 @@ -from typing import List, Optional, cast +from typing import List, Optional, Union, cast import numpy as np @@ -14,13 +14,37 @@ from sklearn.utils.multiclass import type_of_target from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes +from autoPyTorch.utils.common import SparseMatrixType -def _check_and_to_numpy(y: SupportedTargetTypes) -> np.ndarray: +ArrayType = Union[np.ndarray, SparseMatrixType] + + +def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType: """ sklearn check array will make sure we have the correct numerical features for the array """ return sklearn.utils.check_array(y, force_all_finite=True, 
accept_sparse='csr', ensure_2d=False) +def _modify_regression_target(y: ArrayType) -> ArrayType: + # Regression targets must have numbers after a decimal point. + # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 + y_min = np.abs(y).min() + offset = y_min * 1e-16 # Sufficiently small number + if y_min > 1e15: + raise ValueError( + "The minimum value for the target labels of regression tasks must be smaller than " + f"1e15 to avoid errors caused by an overflow, but got {y_min}" + ) + + # Since it is all integer, we can just add a random small number + if isinstance(y, np.ndarray): + y = y.astype(dtype=np.float64) + offset + else: + y.data = y.data.astype(dtype=np.float64) + offset + + return y + + class TabularTargetValidator(BaseTargetValidator): def _fit( self, @@ -101,7 +125,7 @@ def _fit( def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: if self.encoder is None: - return _check_and_to_numpy(y) + return _check_and_to_array(y) # remove ravel warning from pandas Series shape = np.shape(y) @@ -115,12 +139,9 @@ def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: else: y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) - return _check_and_to_numpy(y) + return _check_and_to_array(y) - def transform( - self, - y: SupportedTargetTypes, - ) -> np.ndarray: + def transform(self, y: SupportedTargetTypes) -> np.ndarray: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. @@ -146,24 +167,11 @@ def transform( y = np.ravel(y) if not self.is_classification and "continuous" not in type_of_target(y): - # Regression targets must have numbers after a decimal point. - # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 - y_min = np.abs(y).min() - offset = y_min * 1e-16 # Sufficiently small number - if y_min > 1e15: - raise ValueError( - "The minimum value for the target labels of regression tasks must be smaller than " - f"1e15 to avoid errors caused by an overflow, but got {y_min}" - ) - - y = y.astype(dtype=np.float64) + offset # Since it is all integer, we can just add a random small number + y = _modify_regression_target(y) return y - def inverse_transform( - self, - y: SupportedTargetTypes, - ) -> np.ndarray: + def inverse_transform(self, y: SupportedTargetTypes) -> np.ndarray: """ Revert any encoding transformation done on a target array @@ -197,10 +205,7 @@ def inverse_transform( y = y.astype(self.dtype) return y - def _check_data( - self, - y: SupportedTargetTypes, - ) -> None: + def _check_data(self, y: SupportedTargetTypes) -> None: """ Perform dimensionality and data type checks on the targets diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index 1488d5fcd..b0620a7db 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -20,6 +20,15 @@ from torch.utils.data.dataloader import default_collate HyperparameterValueType = Union[int, str, float] +SparseMatrixType = Union[ + scipy.sparse.bsr_matrix, + scipy.sparse.coo_matrix, + scipy.sparse.csc_matrix, + scipy.sparse.csr_matrix, + scipy.sparse.dia_matrix, + scipy.sparse.dok_matrix, + scipy.sparse.lil_matrix, +] class FitRequirement(NamedTuple): diff --git a/test/test_data/test_target_validator.py b/test/test_data/test_target_validator.py index aadc73416..8fd4527d9 100644 --- a/test/test_data/test_target_validator.py +++ b/test/test_data/test_target_validator.py @@ -150,17 +150,17 @@ def 
test_targetvalidator_supported_types_noclassification(input_data_targettest) assert validator.encoder is None if hasattr(input_data_targettest, "iloc"): - np.testing.assert_array_equal( + assert np.allclose( np.ravel(input_data_targettest.to_numpy()), np.ravel(transformed_y) ) elif sparse.issparse(input_data_targettest): - np.testing.assert_array_equal( + assert np.allclose( np.ravel(input_data_targettest.todense()), np.ravel(transformed_y.todense()) ) else: - np.testing.assert_array_equal( + assert np.allclose( np.ravel(np.array(input_data_targettest)), np.ravel(transformed_y) ) diff --git a/test/test_datasets/test_base_dataset.py b/test/test_datasets/test_base_dataset.py new file mode 100644 index 000000000..52b2fa9a5 --- /dev/null +++ b/test/test_datasets/test_base_dataset.py @@ -0,0 +1,19 @@ +import numpy as np + +import pytest + +from autoPyTorch.datasets.base_dataset import _get_output_properties + + +@pytest.mark.parametrize( + "target_labels,dim,task_type", ( + (np.arange(5), 5, "multiclass"), + (np.linspace(0, 1, 3), 1, "continuous"), + (np.linspace(0, 1, 3)[:, np.newaxis], 1, "continuous") + ) +) +def test_get_output_properties(target_labels, dim, task_type): + train_tensors = np.array([np.empty_like(target_labels), target_labels]) + output_dim, output_type = _get_output_properties(train_tensors) + assert output_dim == dim + assert output_type == task_type From 0d9344b20b8cc27fe9178486dfefd96c856a4caa Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Thu, 24 Feb 2022 01:38:40 +0900 Subject: [PATCH 4/4] [fix] Address Ravin's comments and loosen the small number choice --- autoPyTorch/data/tabular_target_validator.py | 6 +++--- autoPyTorch/datasets/base_dataset.py | 2 +- test/test_api/test_api.py | 14 ++++++-------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index 9ff7828f5..67b6001f8 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -29,11 +29,11 @@ def _modify_regression_target(y: ArrayType) -> ArrayType: # Regression targets must have numbers after a decimal point. 
# Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 y_min = np.abs(y).min() - offset = y_min * 1e-16 # Sufficiently small number - if y_min > 1e15: + offset = max(y_min, 1e-13) * 1e-13 # Sufficiently small number + if y_min > 1e12: raise ValueError( "The minimum value for the target labels of regression tasks must be smaller than " - f"1e15 to avoid errors caused by an overflow, but got {y_min}" + f"1e12 to avoid errors caused by an overflow, but got {y_min}" ) # Since it is all integer, we can just add a random small number diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index d08a4f5b4..be17b945d 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -69,7 +69,7 @@ def _get_output_properties(train_tensors: BaseDatasetInputType) -> Tuple[int, st target_labels = np.array(train_tensors[1]) output_type: str = type_of_target(target_labels) - if STRING_TO_OUTPUT_TYPES.get(output_type, None) in CLASSIFICATION_OUTPUTS: + if STRING_TO_OUTPUT_TYPES[output_type] in CLASSIFICATION_OUTPUTS: output_dim = len(np.unique(target_labels)) elif target_labels.ndim > 1: output_dim = target_labels.shape[-1] diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index a538115fc..4346ff2b6 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -912,10 +912,8 @@ def test_tabular_classification_test_evaluator(openml_id, backend, n_samples): ) def test_task_inference(ans, task_class, backend): # Get the data and check that contents of data-manager make sense - X = np.random.random((5, 1)) - y = np.array([0, 1, 2, 3, 4]) + 10 ** 15 - - X_train, _, y_train, _ = sklearn.model_selection.train_test_split(X, y, random_state=42) + X = np.random.random((6, 1)) + y = np.array([-10 ** 12, 0, 1, 2, 3, 4], dtype=np.int64) + 10 ** 12 estimator = task_class( backend=backend, @@ -923,12 +921,12 @@ def test_task_inference(ans, task_class, backend): resampling_strategy_args=None, seed=42, ) - dataset = estimator.get_dataset(X_train, y_train) + dataset = estimator.get_dataset(X, y) assert dataset.output_type == ans - y_train += 1 + y += 10 ** 12 + 10 # Check if the function catches overflow possibilities if ans == 'continuous': with pytest.raises(ValueError): # ValueError due to `Too large value` - estimator.get_dataset(X_train, y_train) + estimator.get_dataset(X, y) else: - estimator.get_dataset(X_train, y_train) + estimator.get_dataset(X, y)
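
Reviewer note (not part of the patch series): below is a minimal standalone
sketch of the sklearn behaviour these patches work around and of the adopted
fix. It assumes only numpy and scikit-learn; the constants mirror
_modify_regression_target in autoPyTorch/data/tabular_target_validator.py as
it stands after PATCH 4/4, while the error message here is shortened for
brevity.

    import numpy as np
    from sklearn.utils.multiclass import type_of_target

    # Integer-valued targets are inferred as a classification-like type by
    # sklearn, even when the task is regression (scikit-learn issue #8952).
    y = np.array([10, 11, 12, 13, 14])
    assert type_of_target(y) == "multiclass"

    # The adopted workaround: cast to float and add a tiny offset so that
    # at least one value gains a fractional part.
    y_min = np.abs(y).min()
    if y_min > 1e12:
        # Guard mirrored from the patch, which rejects very large targets
        # "to avoid errors caused by an overflow".
        raise ValueError("target labels too large for the offset workaround")
    offset = max(y_min, 1e-13) * 1e-13  # sufficiently small perturbation
    y_float = y.astype(np.float64) + offset

    assert type_of_target(y_float) == "continuous"

For sparse targets the same offset is added to y.data (PATCH 3/4), which is
why test_target_validator.py compares with np.allclose instead of exact array
equality.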