
Commit 8440695

Merge pull request #105 from nabenabe0928/refactoring-base-dataset
Refactoring base dataset
2 parents b5d1c8b + eac426d commit 8440695

File tree

2 files changed: +43 -55 lines changed


autoPyTorch/datasets/base_dataset.py

Lines changed: 43 additions & 54 deletions
@@ -24,16 +24,17 @@
 )
 from autoPyTorch.utils.common import FitRequirement, hash_array_or_matrix
 
-BASE_DATASET_INPUT = Union[Tuple[np.ndarray, np.ndarray], Dataset]
+BaseDatasetType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
 
 
 def check_valid_data(data: Any) -> None:
-    if not (hasattr(data, '__getitem__') and hasattr(data, '__len__')):
+    if not all(hasattr(data, attr) for attr in ['__getitem__', '__len__']):
         raise ValueError(
-            'The specified Data for Dataset does either not have a __getitem__ or a __len__ attribute.')
+            'The specified Data for Dataset must have both __getitem__ and __len__ attribute.')
 
 
-def type_check(train_tensors: BASE_DATASET_INPUT, val_tensors: Optional[BASE_DATASET_INPUT] = None) -> None:
+def type_check(train_tensors: BaseDatasetType, val_tensors: Optional[BaseDatasetType] = None) -> None:
+    """To avoid unexpected behavior, we use loops over indices."""
     for i in range(len(train_tensors)):
         check_valid_data(train_tensors[i])
     if val_tensors is not None:
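
As a side note, the new generator-based check is equivalent to the old boolean expression; a minimal standalone sketch (independent of autoPyTorch) of what such a check accepts and rejects:

    import numpy as np

    def check_valid_data(data):
        # Reject anything that cannot be indexed and measured with len().
        if not all(hasattr(data, attr) for attr in ['__getitem__', '__len__']):
            raise ValueError('Data must have both __getitem__ and __len__ attributes.')

    check_valid_data(np.arange(10))  # ok: ndarrays support indexing and len()
    check_valid_data([1, 2, 3])      # ok: lists do too
    try:
        check_valid_data(iter([1, 2, 3]))  # a plain iterator has neither attribute
    except ValueError as err:
        print(err)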
@@ -42,12 +43,20 @@ def type_check(train_tensors: BASE_DATASET_INPUT, val_tensors: Optional[BASE_DAT
 
 
 class TransformSubset(Subset):
-    """
-    Because the BaseDataset contains all the data (train/val/test), the transformations
-    have to be applied with some directions. That is, if yielding train data,
-    we expect to apply train transformation (which have augmentations exclusively).
+    """Wrapper of BaseDataset for splitted datasets
+
+    Since the BaseDataset contains all the data points (train/val/test),
+    we require different transformation for each data point.
+    This class helps to take the subset of the dataset
+    with either training or validation transformation.
 
     We achieve so by adding a train flag to the pytorch subset
+
+    Attributes:
+        dataset (BaseDataset/Dataset): Dataset to sample the subset
+        indices names (Sequence[int]): Indices to sample from the dataset
+        train (bool): If we apply train or validation transformation
+
     """
 
     def __init__(self, dataset: Dataset, indices: Sequence[int], train: bool) -> None:
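
The docstring above describes the intent of TransformSubset; purely as an illustration (not the actual autoPyTorch implementation), a Subset that forwards a train flag to its parent dataset could look roughly like this, assuming the parent's __getitem__ accepts a train argument as BaseDataset.__getitem__ does:

    from typing import Sequence
    from torch.utils.data import Dataset, Subset

    class FlaggedSubset(Subset):  # hypothetical name, for illustration only
        """Subset that tells the parent dataset whether to apply train or val transforms."""

        def __init__(self, dataset: Dataset, indices: Sequence[int], train: bool) -> None:
            super().__init__(dataset, indices)
            self.train = train

        def __getitem__(self, idx: int):
            # Delegate to the parent dataset, passing the flag so it can pick
            # the matching transformation pipeline.
            return self.dataset.__getitem__(self.indices[idx], self.train)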
@@ -62,10 +71,10 @@ def __getitem__(self, idx: int) -> np.ndarray:
 class BaseDataset(Dataset, metaclass=ABCMeta):
     def __init__(
         self,
-        train_tensors: BASE_DATASET_INPUT,
+        train_tensors: BaseDatasetType,
         dataset_name: Optional[str] = None,
-        val_tensors: Optional[BASE_DATASET_INPUT] = None,
-        test_tensors: Optional[BASE_DATASET_INPUT] = None,
+        val_tensors: Optional[BaseDatasetType] = None,
+        test_tensors: Optional[BaseDatasetType] = None,
         resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         shuffle: Optional[bool] = True,
@@ -97,18 +106,15 @@ def __init__(
             val_transforms (Optional[torchvision.transforms.Compose]):
                 Additional Transforms to be applied to the validation/test data
         """
-        if dataset_name is not None:
-            self.dataset_name = dataset_name
-        else:
-            self.dataset_name = hash_array_or_matrix(train_tensors[0])
+        self.dataset_name = dataset_name if dataset_name is not None \
+            else hash_array_or_matrix(train_tensors[0])
+
         if not hasattr(train_tensors[0], 'shape'):
             type_check(train_tensors, val_tensors)
-        self.train_tensors = train_tensors
-        self.val_tensors = val_tensors
-        self.test_tensors = test_tensors
+        self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors
         self.cross_validators: Dict[str, CROSS_VAL_FN] = {}
         self.holdout_validators: Dict[str, HOLDOUT_FN] = {}
-        self.rand = np.random.RandomState(seed=seed)
+        self.rng = np.random.RandomState(seed=seed)
         self.shuffle = shuffle
         self.resampling_strategy = resampling_strategy
         self.resampling_strategy_args = resampling_strategy_args
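
Independent of the rand -> rng rename, keeping a seeded np.random.RandomState on the dataset is what makes the later shuffling reproducible; a small standalone illustration:

    import numpy as np

    rng_a = np.random.RandomState(seed=42)
    rng_b = np.random.RandomState(seed=42)

    # Two generators built from the same seed produce identical permutations,
    # so splits derived from them are reproducible across runs.
    assert np.array_equal(rng_a.permutation(10), rng_b.permutation(10))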
@@ -128,16 +134,8 @@ def __init__(
             self.is_small_preprocess = True
 
         # Make sure cross validation splits are created once
-        self.cross_validators = get_cross_validators(
-            CrossValTypes.stratified_k_fold_cross_validation,
-            CrossValTypes.k_fold_cross_validation,
-            CrossValTypes.shuffle_split_cross_validation,
-            CrossValTypes.stratified_shuffle_split_cross_validation
-        )
-        self.holdout_validators = get_holdout_validators(
-            HoldoutValTypes.holdout_validation,
-            HoldoutValTypes.stratified_holdout_validation
-        )
+        self.cross_validators = get_cross_validators(*CrossValTypes)
+        self.holdout_validators = get_holdout_validators(*HoldoutValTypes)
         self.splits = self.get_splits_from_resampling_strategy()
 
         # We also need to be able to transform the data, be it for pre-processing
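
The *CrossValTypes and *HoldoutValTypes calls rely on the fact that iterating (and therefore star-unpacking) an Enum class yields all of its members, which is what the usage of these types suggests they are; a self-contained illustration with a made-up enum:

    from enum import Enum

    class Color(Enum):
        RED = 1
        GREEN = 2
        BLUE = 3

    def register(*members):
        return [m.name for m in members]

    # Star-unpacking the class passes every member, equivalent to listing them by hand.
    assert register(*Color) == ['RED', 'GREEN', 'BLUE']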
@@ -146,19 +144,19 @@ def __init__(
         self.val_transform = val_transforms
 
     def update_transform(self, transform: Optional[torchvision.transforms.Compose],
-                         train: bool = True,
-                         ) -> 'BaseDataset':
+                         train: bool = True) -> 'BaseDataset':
         """
         During the pipeline execution, the pipeline object might propose transformations
         as a product of the current pipeline configuration being tested.
 
-        This utility allows to return a self with the updated transformation, so that
+        This utility allows to return self with the updated transformation, so that
         a dataloader can yield this dataset with the desired transformations
 
         Args:
-            transform (torchvision.transforms.Compose): The transformations proposed
-                by the current pipeline
-            train (bool): Whether to update the train or validation transform
+            transform (torchvision.transforms.Compose):
+                The transformations proposed by the current pipeline
+            train (bool):
+                Whether to update the train or validation transform
 
         Returns:
             self: A copy of the update pipeline
@@ -171,9 +169,9 @@ def update_transform(self, transform: Optional[torchvision.transforms.Compose],
 
     def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]:
         """
-        The base dataset uses a Subset of the data. Nevertheless, the base dataset expect
-        both validation and test data to be present in the same dataset, which motivated the
-        need to dynamically give train/test data with the __getitem__ command.
+        The base dataset uses a Subset of the data. Nevertheless, the base dataset expects
+        both validation and test data to be present in the same dataset, which motivates
+        the need to dynamically give train/test data with the __getitem__ command.
 
         This method yields a datapoint of the whole data (after a Subset has selected a given
         item, based on the resampling strategy) and applies a train/testing transformation, if any.
@@ -186,34 +184,24 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]:
             A transformed single point prediction
         """
 
-        if hasattr(self.train_tensors[0], 'loc'):
-            X = self.train_tensors[0].iloc[[index]]
-        else:
-            X = self.train_tensors[0][index]
+        X = self.train_tensors[0].iloc[[index]] if hasattr(self.train_tensors[0], 'loc') \
+            else self.train_tensors[0][index]
 
         if self.train_transform is not None and train:
            X = self.train_transform(X)
         elif self.val_transform is not None and not train:
            X = self.val_transform(X)
 
         # In case of prediction, the targets are not provided
-        Y = self.train_tensors[1]
-        if Y is not None:
-            Y = Y[index]
-        else:
-            Y = None
+        Y = self.train_tensors[1][index] if self.train_tensors[1] is not None else None
 
         return X, Y
 
     def __len__(self) -> int:
         return self.train_tensors[0].shape[0]
 
     def _get_indices(self) -> np.ndarray:
-        if self.shuffle:
-            indices = self.rand.permutation(len(self))
-        else:
-            indices = np.arange(len(self))
-        return indices
+        return self.rng.permutation(len(self)) if self.shuffle else np.arange(len(self))
 
     def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]:
         """
@@ -333,7 +321,7 @@ def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]:
         return (TransformSubset(self, self.splits[split_id][0], train=True),
                 TransformSubset(self, self.splits[split_id][1], train=False))
 
-    def replace_data(self, X_train: BASE_DATASET_INPUT, X_test: Optional[BASE_DATASET_INPUT]) -> 'BaseDataset':
+    def replace_data(self, X_train: BaseDatasetType, X_test: Optional[BaseDatasetType]) -> 'BaseDataset':
         """
         To speed up the training of small dataset, early pre-processing of the data
         can be made on the fly by the pipeline.
@@ -361,7 +349,8 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) ->
             contain.
 
         Returns:
-
+            dataset_properties (Dict[str, Any]):
+                Dict of the dataset properties.
         """
         dataset_properties = dict()
         for dataset_requirement in dataset_requirements:

test/conftest.py

Lines changed: 0 additions & 1 deletion
@@ -276,7 +276,6 @@ def get_fit_dictionary(X, y, validator, backend):
     info = datamanager.get_required_dataset_info()
 
     dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info))
-
     fit_dictionary = {
         'X_train': datamanager.train_tensors[0],
        'y_train': datamanager.train_tensors[1],

0 commit comments
