From 9278b53e6e4bc288cabd9a7ef51eb30925a18d50 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 21 May 2021 15:49:14 +0200 Subject: [PATCH 1/2] Fix bugs in cutout training --- .../training/trainer/RowCutMixTrainer.py | 3 ++- .../training/trainer/RowCutOutTrainer.py | 27 +++++++++++++------ .../training/trainer/base_trainer.py | 6 ++++- .../training/trainer/base_trainer_choice.py | 4 ++- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index f0d8536f9..e0431ebe5 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -39,7 +39,8 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, # It is unlikely that the batch size is lower than the number of features, but # be safe size = min(X.shape[0], X.shape[1]) - indices = torch.tensor(self.random_state.choice(range(1, size), max(1, np.int(size * lam)))) + indices = torch.tensor(self.random_state.choice(range(1, size), max(1, np.int32(size * lam)), + replace=False)) X[:, indices] = X[index, :][:, indices] diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py index a7936c4f8..97f0caa18 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py @@ -9,6 +9,8 @@ class RowCutOutTrainer(CutOut, BaseTrainerComponent): + NUMERICAL_VALUE = 0 + CATEGORICAL_VALUE = -1 def data_preparation(self, X: np.ndarray, y: np.ndarray, ) -> typing.Tuple[np.ndarray, typing.Dict[str, np.ndarray]]: @@ -34,17 +36,26 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, lam = 1 return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam} - # The mixup component mixes up also on the batch dimension - # 
It is unlikely that the batch size is lower than the number of features, but - # be safe - size = min(X.shape[0], X.shape[1]) - indices = torch.tensor(self.random_state.choice(range(1, size), max(1, np.int(size * self.patch_ratio)))) + size = X.shape[1] + indices = self.random_state.choice(range(1, size), max(1, np.int32(size * self.patch_ratio)), + replace=False) - # We use an ordinal encoder on the tabular data + if not isinstance(self.numerical_columns, typing.Iterable): + raise ValueError("{} requires numerical columns information of {}" + "to prepare data got {}.".format(self.__class__.__name__, + typing.Iterable, + self.numerical_columns)) + numerical_indices = torch.tensor(self.numerical_columns) + categorical_indices = torch.tensor([index for index in indices if index not in self.numerical_columns]) + + # We use an ordinal encoder on the categorical columns of tabular data # -1 is the conceptual equivalent to 0 in a image, that does not # have color as a feature and hence the network has to learn to deal - # without this data - X[:, indices.long()] = -1 + # without this data. For numerical columns we use 0 to cut out the features, + # similar to the effect of setting 0 as a pixel value in an image. 
+ X[:, categorical_indices.long()] = self.CATEGORICAL_VALUE + X[:, numerical_indices.long()] = self.NUMERICAL_VALUE + lam = 1 y_a = y y_b = y diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index a67c4c967..cb6f8ee3d 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -233,7 +233,8 @@ def prepare( metrics_during_training: bool, scheduler: _LRScheduler, task_type: int, - labels: Union[np.ndarray, torch.Tensor, pd.DataFrame] + labels: Union[np.ndarray, torch.Tensor, pd.DataFrame], + numerical_columns: Optional[List[int]] = None ) -> None: # Save the device to be used @@ -289,6 +290,9 @@ def prepare( # task type (used for calculating metrics) self.task_type = task_type + # for cutout trainer, we need the list of numerical columns + self.numerical_columns = numerical_columns + def on_epoch_start(self, X: Dict[str, Any], epoch: int) -> None: """ Optional place holder for AutoPytorch Extensions. 
diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py index a650d2179..0075d69a7 100755 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py @@ -336,7 +336,9 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic metrics_during_training=X['metrics_during_training'], scheduler=X['lr_scheduler'], task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']], - labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]] + labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]], + numerical_columns=X['dataset_properties']['numerical_columns'] if 'numerical_columns' in X[ + 'dataset_properties'] else None ) total_parameter_count, trainable_parameter_count = self.count_parameters(X['network']) self.run_summary = RunSummary( From 8b71ee2ad3c00277928ac462c0d364151d69aca0 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 21 May 2021 16:46:27 +0200 Subject: [PATCH 2/2] Address comments from arlind --- .../pipeline/components/training/trainer/RowCutMixTrainer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index e0431ebe5..20d02c793 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -35,10 +35,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, if beta <= 0 or r > self.alpha: return X, {'y_a': y, 'y_b': y[index], 'lam': 1} - # The mixup component mixes up also on the batch dimension - # It is unlikely that the batch size is lower than the number of features, but - # be safe - size = min(X.shape[0], X.shape[1]) + 
size = X.shape[1] indices = torch.tensor(self.random_state.choice(range(1, size), max(1, np.int32(size * lam)), replace=False))