From 676a3bad7f13fa0308e13976d78ebcdcf8e66ada Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Fri, 7 Oct 2022 14:38:54 +0200 Subject: [PATCH 01/32] ADD: Empty file mixup.py for dummy PR --- torchvision/prototype/transforms/_mixup.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 torchvision/prototype/transforms/_mixup.py diff --git a/torchvision/prototype/transforms/_mixup.py b/torchvision/prototype/transforms/_mixup.py new file mode 100644 index 00000000000..a691282bc09 --- /dev/null +++ b/torchvision/prototype/transforms/_mixup.py @@ -0,0 +1,3 @@ +# TODO: Consturct a Mixup transform + +class \ No newline at end of file From 60cdf3b729bc008519d700f922647323c9e1e91c Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Fri, 7 Oct 2022 14:41:09 +0200 Subject: [PATCH 02/32] ADD: Empty transform class --- torchvision/prototype/transforms/_mixup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchvision/prototype/transforms/_mixup.py b/torchvision/prototype/transforms/_mixup.py index a691282bc09..04f2a365f25 100644 --- a/torchvision/prototype/transforms/_mixup.py +++ b/torchvision/prototype/transforms/_mixup.py @@ -1,3 +1,4 @@ # TODO: Consturct a Mixup transform -class \ No newline at end of file +class _Mixup: + pass From 728c7cacac0a7a300feb9903d1ebb0bb4b393f9b Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Fri, 28 Oct 2022 11:40:57 +0200 Subject: [PATCH 03/32] WIP: Random Mixup for detection --- torchvision/prototype/transforms/_mixup.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/torchvision/prototype/transforms/_mixup.py b/torchvision/prototype/transforms/_mixup.py index 04f2a365f25..f504ca3ba83 100644 --- a/torchvision/prototype/transforms/_mixup.py +++ b/torchvision/prototype/transforms/_mixup.py @@ -1,4 +1,18 @@ # TODO: Consturct a Mixup transform +import random + +from torchvision.prototype import features +from torchvision.prototype.transforms import AutoAugmentPolicy, functional as F, InterpolationMode, Transform + class _Mixup: - pass + def __init__( + self, + *, + alpha: float = 0.9, + beta: float = 0.1, + ) -> None: + super().__init__() + + # Fetch a random image from the dataset + random_index = random.randint(0, ) \ No newline at end of file From f1b70b97cec8e4d6502c4a3c6e9f0fbc3d7c1b87 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sat, 5 Nov 2022 21:39:41 +0100 Subject: [PATCH 04/32] First draft: Mixup detections --- test/test_prototype_transforms.py | 77 +++++++++ torchvision/prototype/transforms/__init__.py | 1 + torchvision/prototype/transforms/_mixup.py | 18 --- .../prototype/transforms/_mixup_detection.py | 148 ++++++++++++++++++ 4 files changed, 226 insertions(+), 18 deletions(-) delete mode 100644 torchvision/prototype/transforms/_mixup.py create mode 100644 torchvision/prototype/transforms/_mixup_detection.py diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index fab4cc0ddd6..31145ea9b96 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1918,3 +1918,80 @@ def test__transform(self, inpt): assert type(output) is type(inpt) assert output.shape[-4] == num_samples assert output.dtype == inpt.dtype + + +class TestMixupDetection: + def create_fake_image(self, mocker, image_type): + if image_type == PIL.Image.Image: + return PIL.Image.new("RGB", (32, 32), 123) + return mocker.MagicMock(spec=image_type) + + def test__extract_image_targets_assertion(self, mocker): + transform = transforms.MixupDetection() + + flat_sample = [ + # 
images, batch size = 2 + self.create_fake_image(mocker, features.Image), + # labels, bboxes, masks + mocker.MagicMock(spec=features.Label), + mocker.MagicMock(spec=features.BoundingBox), + # labels, bboxes, masks + mocker.MagicMock(spec=features.BoundingBox), + ] + + with pytest.raises(TypeError, match="requires input sample to contain equal-sized list of Images"): + transform._extract_image_targets(flat_sample) + + @pytest.mark.parametrize("image_type", [features.Image, PIL.Image.Image, torch.Tensor]) + def test__extract_image_targets(self, image_type, mocker): + transform = transforms.MixupDetection() + + flat_sample = [ + # images, batch size = 2 + self.create_fake_image(mocker, image_type), + self.create_fake_image(mocker, image_type), + # labels, bboxes, masks + mocker.MagicMock(spec=features.Label), + mocker.MagicMock(spec=features.BoundingBox), + # labels, bboxes, masks + mocker.MagicMock(spec=features.Label), + mocker.MagicMock(spec=features.BoundingBox), + ] + + images, targets = transform._extract_image_targets(flat_sample) + + assert len(images) == len(targets) == 2 + if image_type == PIL.Image.Image: + torch.testing.assert_close(images[0], pil_to_tensor(flat_sample[0])) + torch.testing.assert_close(images[1], pil_to_tensor(flat_sample[1])) + else: + assert images[0] == flat_sample[0] + assert images[1] == flat_sample[1] + + def test__mixup(self): + image1 = 2*torch.ones(3, 32, 64) + target_1 = { + "boxes": features.BoundingBox( + torch.tensor([[0.0, 0.0, 10.0, 10.0], [20.0, 20.0, 30.0, 30.0]]), + format = "XYXY", + spatial_size = (32, 64), + ), + "labels": features.Label(torch.tensor([1, 2])), + } + + image2 = 10*torch.ones(3, 64, 32) + target_2 = { + "boxes": features.BoundingBox( + torch.tensor([[10.0, 0.0, 20.0, 20.0], [10.0, 20.0, 30.0, 30.0]]), + format = "XYXY", + spatial_size = (64, 32), + ), + "labels": features.Label(torch.tensor([2, 3])), + } + + transform = transforms.MixupDetection() + output_image, output_target = transform._mixup(image1, target_1, image2, target_2) + assert output_image.shape == (3, 64, 64) + assert output_target["boxes"].spatial_size == (64, 64) + assert len(output_target["boxes"]) == 4 + assert len(output_target["labels"]) == 4 diff --git a/torchvision/prototype/transforms/__init__.py b/torchvision/prototype/transforms/__init__.py index 2f9bd76d4ed..38e53c6a5ae 100644 --- a/torchvision/prototype/transforms/__init__.py +++ b/torchvision/prototype/transforms/__init__.py @@ -51,6 +51,7 @@ ToDtype, TransposeDimensions, ) +from ._mixup_detection import MixupDetection from ._temporal import UniformTemporalSubsample from ._type_conversion import DecodeImage, LabelToOneHot, PILToTensor, ToImagePIL, ToImageTensor, ToPILImage diff --git a/torchvision/prototype/transforms/_mixup.py b/torchvision/prototype/transforms/_mixup.py deleted file mode 100644 index f504ca3ba83..00000000000 --- a/torchvision/prototype/transforms/_mixup.py +++ /dev/null @@ -1,18 +0,0 @@ -# TODO: Consturct a Mixup transform -import random - -from torchvision.prototype import features -from torchvision.prototype.transforms import AutoAugmentPolicy, functional as F, InterpolationMode, Transform - - -class _Mixup: - def __init__( - self, - *, - alpha: float = 0.9, - beta: float = 0.1, - ) -> None: - super().__init__() - - # Fetch a random image from the dataset - random_index = random.randint(0, ) \ No newline at end of file diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py new file mode 100644 index 
00000000000..5720174485a --- /dev/null +++ b/torchvision/prototype/transforms/_mixup_detection.py @@ -0,0 +1,148 @@ +""" +This script is used to apply the mixup transform for Object detection to the dataset. +The code is inspired from the paper: https://arxiv.org/abs/1902.0410 + +In a nutshell, mixup is a data augmentation technique that linearly interpolates between +two randomly chosen images and their labels. The linear interpolation is parameterized by +a weight, beta, which is drawn from a beta distribution. The value of beta is sampled +uniformly from the range [0, 1]. The technique is described in the paper: https://arxiv.org/abs/1902.0410 +""" +import random +from typing import Any, Dict, List, Optional, Tuple +import PIL.Image + +import torch +from torch.utils._pytree import tree_flatten, tree_unflatten +from torchvision.prototype import features +from torchvision.prototype.transforms import functional as F, Transform, SimpleCopyPaste +from torchvision.prototype.transforms._utils import query_chw +from torchvision.prototype.features._feature import is_simple_tensor +from torchvision.transforms.functional import pil_to_tensor + + +class MixupDetection(Transform): + _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image) + + def __init__( + self, + *, + alpha: float = 1.5, + ) -> None: + super().__init__() + self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) + + + def _get_params(): + pass + + def _extract_image_targets(self, flat_sample: List[Any]) -> Tuple[List[Any], List[Dict[str, Any]]]: + # fetch all images, bboxes and labels from unstructured input + # with List[image], List[BoundingBox], List[Label] + images, bboxes, labels = [], [], [] + for obj in flat_sample: + if isinstance(obj, features.Image) or is_simple_tensor(obj): + images.append(obj) + elif isinstance(obj, PIL.Image.Image): + images.append(pil_to_tensor(obj)) + elif isinstance(obj, features.BoundingBox): + bboxes.append(obj) + elif isinstance(obj, (features.Label, features.OneHotLabel)): + labels.append(obj) + + if not (len(images) == len(bboxes) == len(labels)): + raise TypeError( + f"{type(self).__name__}() requires input sample to contain equal-sized list of Images, " + "BoundingBoxes, and Labels or OneHotLabels." 
+ ) + + targets = [] + for bbox, label in zip(bboxes, labels): + targets.append({"boxes": bbox, "labels": label}) + + return images, targets + + def _check_inputs(self, flat_inputs: List[Any]) -> None: + return super()._check_inputs(flat_inputs) + + def _insert_outputs( + self, flat_sample: List[Any], output_images: List[Any], output_targets: List[Dict[str, Any]] + ) -> None: + c0, c1, c2 = 0, 0, 0 + for i, obj in enumerate(flat_sample): + if isinstance(obj, features.Image): + flat_sample[i] = features.Image.wrap_like(obj, output_images[c0]) + c0 += 1 + elif isinstance(obj, PIL.Image.Image): + flat_sample[i] = F.to_image_pil(output_images[c0]) + c0 += 1 + elif is_simple_tensor(obj): + flat_sample[i] = output_images[c0] + c0 += 1 + elif isinstance(obj, features.BoundingBox): + flat_sample[i] = features.BoundingBox.wrap_like(obj, output_targets[c1]["boxes"]) + c1 += 1 + elif isinstance(obj, (features.Label, features.OneHotLabel)): + flat_sample[i] = obj.wrap_like(obj, output_targets[c2]["labels"]) # type: ignore[arg-type] + c2 += 1 + + def forward(self, *inputs: Any) -> Any: + flat_sample, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) + + images, targets = self._extract_image_targets(flat_sample) + + # images = [t1, t2, ..., tN] + # Let's define paste_images as shifted list of input images + # paste_images = [tN, t1, ..., tN-1,] + images_rolled = images[-1:] + images[:-1] + targets_rolled = targets[-1:] + targets[:-1] + + output_images, output_targets = [], [] + for image_1, target_1, image_2, target_2 in zip(images, targets, images_rolled, targets_rolled): + output_image, output_target = self._mixup( + image_1, + target_1, + image_2, + target_2, + ) + output_images.append(output_image) + output_targets.append(output_target) + + # Insert updated images and targets into input flat_sample + self._insert_outputs(flat_sample, output_images, output_targets) + return tree_unflatten(flat_sample, spec) + + def _mixup( + self, + image_1: features.TensorImageType, + target_1: Dict[str, Any], + image_2: features.TensorImageType, + target_2: Dict[str, Any], + ): + """ + Performs mixup on the given images and targets. + """ + lambd = self._dist.sample().item() + c_1, h_1, w_1 = image_1.shape + c_2, h_2, w_2 = image_2.shape + h_mixup = max(h_1, h_2) + w_mixup = max(w_1, w_2) + + # mixup images + mix_img = torch.zeros(c_1, h_mixup, w_mixup, dtype=torch.float32) + mix_img[:, :image_1.shape[1], :image_1.shape[2]] = image_1 * lambd + mix_img[:, :image_2.shape[1], :image_2.shape[2]] += image_2 * (1. 
- lambd) + # mixup targets + mix_target = {**target_1, **target_2} + box_format = target_1["boxes"].format + mixed_boxes = { + "boxes": features.BoundingBox( + torch.vstack((target_1["boxes"], target_2["boxes"])), + format=box_format, + spatial_size=(h_mixup, w_mixup), + ) + } + mix_labels = {"labels": torch.cat((target_1["labels"], target_2["labels"]))} + mix_target.update(mixed_boxes) + mix_target.update(mix_labels) + + return mix_img, mix_target From cdda41b9b1c0bb4c9b8f48b0382ec60dda525030 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sat, 5 Nov 2022 21:47:43 +0100 Subject: [PATCH 05/32] Fix: precommit issues --- test/test_prototype_transforms.py | 16 +++++----- .../prototype/transforms/_mixup_detection.py | 31 +++++++++---------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 31145ea9b96..0cfe82f4037 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1969,23 +1969,23 @@ def test__extract_image_targets(self, image_type, mocker): assert images[1] == flat_sample[1] def test__mixup(self): - image1 = 2*torch.ones(3, 32, 64) + image1 = 2 * torch.ones(3, 32, 64) target_1 = { "boxes": features.BoundingBox( torch.tensor([[0.0, 0.0, 10.0, 10.0], [20.0, 20.0, 30.0, 30.0]]), - format = "XYXY", - spatial_size = (32, 64), - ), + format="XYXY", + spatial_size=(32, 64), + ), "labels": features.Label(torch.tensor([1, 2])), } - image2 = 10*torch.ones(3, 64, 32) + image2 = 10 * torch.ones(3, 64, 32) target_2 = { "boxes": features.BoundingBox( torch.tensor([[10.0, 0.0, 20.0, 20.0], [10.0, 20.0, 30.0, 30.0]]), - format = "XYXY", - spatial_size = (64, 32), - ), + format="XYXY", + spatial_size=(64, 32), + ), "labels": features.Label(torch.tensor([2, 3])), } diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py index 5720174485a..26b6d8d7c94 100644 --- a/torchvision/prototype/transforms/_mixup_detection.py +++ b/torchvision/prototype/transforms/_mixup_detection.py @@ -2,21 +2,19 @@ This script is used to apply the mixup transform for Object detection to the dataset. The code is inspired from the paper: https://arxiv.org/abs/1902.0410 -In a nutshell, mixup is a data augmentation technique that linearly interpolates between -two randomly chosen images and their labels. The linear interpolation is parameterized by -a weight, beta, which is drawn from a beta distribution. The value of beta is sampled -uniformly from the range [0, 1]. The technique is described in the paper: https://arxiv.org/abs/1902.0410 +In a nutshell, mixup is a data augmentation technique that combines two images in the ratio of +beta to (1-beta) and this value of beta is sampled from a beta distribution. This makes our model +robust to the object being present in the image or not. Plus, it is kind of like a free lunch. 
""" -import random -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Tuple + import PIL.Image import torch from torch.utils._pytree import tree_flatten, tree_unflatten from torchvision.prototype import features -from torchvision.prototype.transforms import functional as F, Transform, SimpleCopyPaste -from torchvision.prototype.transforms._utils import query_chw from torchvision.prototype.features._feature import is_simple_tensor +from torchvision.prototype.transforms import functional as F, Transform from torchvision.transforms.functional import pil_to_tensor @@ -31,7 +29,6 @@ def __init__( super().__init__() self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) - def _get_params(): pass @@ -99,10 +96,10 @@ def forward(self, *inputs: Any) -> Any: output_images, output_targets = [], [] for image_1, target_1, image_2, target_2 in zip(images, targets, images_rolled, targets_rolled): output_image, output_target = self._mixup( - image_1, - target_1, - image_2, - target_2, + image_1, + target_1, + image_2, + target_2, ) output_images.append(output_image) output_targets.append(output_target) @@ -117,7 +114,7 @@ def _mixup( target_1: Dict[str, Any], image_2: features.TensorImageType, target_2: Dict[str, Any], - ): + ): """ Performs mixup on the given images and targets. """ @@ -129,8 +126,8 @@ def _mixup( # mixup images mix_img = torch.zeros(c_1, h_mixup, w_mixup, dtype=torch.float32) - mix_img[:, :image_1.shape[1], :image_1.shape[2]] = image_1 * lambd - mix_img[:, :image_2.shape[1], :image_2.shape[2]] += image_2 * (1. - lambd) + mix_img[:, : image_1.shape[1], : image_1.shape[2]] = image_1 * lambd + mix_img[:, : image_2.shape[1], : image_2.shape[2]] += image_2 * (1.0 - lambd) # mixup targets mix_target = {**target_1, **target_2} box_format = target_1["boxes"].format @@ -139,7 +136,7 @@ def _mixup( torch.vstack((target_1["boxes"], target_2["boxes"])), format=box_format, spatial_size=(h_mixup, w_mixup), - ) + ) } mix_labels = {"labels": torch.cat((target_1["labels"], target_2["labels"]))} mix_target.update(mixed_boxes) From 2d0765c55fd68bd70a78571ec25a78ffa47b1a89 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sun, 6 Nov 2022 10:10:28 +0100 Subject: [PATCH 06/32] Fix: failing CI issues --- .../prototype/transforms/_mixup_detection.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py index 26b6d8d7c94..9c5e3d8c106 100644 --- a/torchvision/prototype/transforms/_mixup_detection.py +++ b/torchvision/prototype/transforms/_mixup_detection.py @@ -29,7 +29,8 @@ def __init__( super().__init__() self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) - def _get_params(): + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + # TODO: Retrieve the params from the input sample pass def _extract_image_targets(self, flat_sample: List[Any]) -> Tuple[List[Any], List[Dict[str, Any]]]: @@ -114,20 +115,23 @@ def _mixup( target_1: Dict[str, Any], image_2: features.TensorImageType, target_2: Dict[str, Any], - ): + ) -> Tuple[features.TensorImageType, Dict[str, Any]]: """ Performs mixup on the given images and targets. 
""" - lambd = self._dist.sample().item() + mixup_ratio = self._dist.sample().item() c_1, h_1, w_1 = image_1.shape c_2, h_2, w_2 = image_2.shape h_mixup = max(h_1, h_2) w_mixup = max(w_1, w_2) + if mixup_ratio >= 1: + return image_1, target_1 + # mixup images mix_img = torch.zeros(c_1, h_mixup, w_mixup, dtype=torch.float32) - mix_img[:, : image_1.shape[1], : image_1.shape[2]] = image_1 * lambd - mix_img[:, : image_2.shape[1], : image_2.shape[2]] += image_2 * (1.0 - lambd) + mix_img[:, : image_1.shape[1], : image_1.shape[2]] = image_1 * mixup_ratio + mix_img[:, : image_2.shape[1], : image_2.shape[2]] += image_2 * (1.0 - mixup_ratio) # mixup targets mix_target = {**target_1, **target_2} box_format = target_1["boxes"].format From 7e82ff28474e49ae2f774287575de13501f7fa3a Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sun, 6 Nov 2022 16:18:20 +0100 Subject: [PATCH 07/32] Fix: Tests and ADD: get_params and check_inputs functions --- test/test_prototype_transforms.py | 10 +++---- .../prototype/transforms/_mixup_detection.py | 28 ++++++++++++++----- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 0cfe82f4037..cf06c53efa4 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -24,7 +24,7 @@ from torchvision.ops.boxes import box_iou from torchvision.prototype import features, transforms from torchvision.prototype.transforms._utils import _isinstance -from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image +from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image, to_tensor BATCH_EXTRA_DIMS = [extra_dims for extra_dims in DEFAULT_EXTRA_DIMS if extra_dims] @@ -1950,10 +1950,10 @@ def test__extract_image_targets(self, image_type, mocker): # images, batch size = 2 self.create_fake_image(mocker, image_type), self.create_fake_image(mocker, image_type), - # labels, bboxes, masks + # labels, bboxes mocker.MagicMock(spec=features.Label), mocker.MagicMock(spec=features.BoundingBox), - # labels, bboxes, masks + # labels, bboxes mocker.MagicMock(spec=features.Label), mocker.MagicMock(spec=features.BoundingBox), ] @@ -1962,8 +1962,8 @@ def test__extract_image_targets(self, image_type, mocker): assert len(images) == len(targets) == 2 if image_type == PIL.Image.Image: - torch.testing.assert_close(images[0], pil_to_tensor(flat_sample[0])) - torch.testing.assert_close(images[1], pil_to_tensor(flat_sample[1])) + torch.testing.assert_close(images[0], to_tensor(flat_sample[0])) + torch.testing.assert_close(images[1], to_tensor(flat_sample[1])) else: assert images[0] == flat_sample[0] assert images[1] == flat_sample[1] diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py index 9c5e3d8c106..7e8295eac98 100644 --- a/torchvision/prototype/transforms/_mixup_detection.py +++ b/torchvision/prototype/transforms/_mixup_detection.py @@ -15,7 +15,7 @@ from torchvision.prototype import features from torchvision.prototype.features._feature import is_simple_tensor from torchvision.prototype.transforms import functional as F, Transform -from torchvision.transforms.functional import pil_to_tensor +from torchvision.prototype.transforms._utils import has_any class MixupDetection(Transform): @@ -29,9 +29,10 @@ def __init__( super().__init__() self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) - def _get_params(self, flat_inputs: List[Any]) 
-> Dict[str, Any]: + def _get_params(self) -> Dict[str, Any]: # TODO: Retrieve the params from the input sample - pass + mixup_ratio = self._dist.sample().item() + return {"mixup_ratio": mixup_ratio} def _extract_image_targets(self, flat_sample: List[Any]) -> Tuple[List[Any], List[Dict[str, Any]]]: # fetch all images, bboxes and labels from unstructured input @@ -41,7 +42,7 @@ def _extract_image_targets(self, flat_sample: List[Any]) -> Tuple[List[Any], Lis if isinstance(obj, features.Image) or is_simple_tensor(obj): images.append(obj) elif isinstance(obj, PIL.Image.Image): - images.append(pil_to_tensor(obj)) + images.append(F.to_tensor(obj)) elif isinstance(obj, features.BoundingBox): bboxes.append(obj) elif isinstance(obj, (features.Label, features.OneHotLabel)): @@ -60,7 +61,19 @@ def _extract_image_targets(self, flat_sample: List[Any]) -> Tuple[List[Any], Lis return images, targets def _check_inputs(self, flat_inputs: List[Any]) -> None: - return super()._check_inputs(flat_inputs) + if has_any(flat_inputs, features.Mask): + raise TypeError(f"Masks are not supported by {type(self).__name__}()") + + if not has_any(flat_inputs, PIL.Image.Image, features.Image, features.is_simple_tensor): + raise TypeError( + f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." + ) + + if not ( + has_any(flat_inputs, features.Image, PIL.Image.Image, features.is_simple_tensor) + and has_any(flat_inputs, features.BoundingBox) + ): + raise TypeError(f"{type(self).__name__}() is only defined for tensor images/videos and bounding boxes.") def _insert_outputs( self, flat_sample: List[Any], output_images: List[Any], output_targets: List[Dict[str, Any]] @@ -85,6 +98,7 @@ def _insert_outputs( def forward(self, *inputs: Any) -> Any: flat_sample, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) + self._check_inputs(flat_sample) images, targets = self._extract_image_targets(flat_sample) @@ -119,7 +133,7 @@ def _mixup( """ Performs mixup on the given images and targets. 
""" - mixup_ratio = self._dist.sample().item() + mixup_ratio = self._get_params().get("mixup_ratio") c_1, h_1, w_1 = image_1.shape c_2, h_2, w_2 = image_2.shape h_mixup = max(h_1, h_2) @@ -128,7 +142,7 @@ def _mixup( if mixup_ratio >= 1: return image_1, target_1 - # mixup images + # mixup images and prevent the object aspect ratio from changing mix_img = torch.zeros(c_1, h_mixup, w_mixup, dtype=torch.float32) mix_img[:, : image_1.shape[1], : image_1.shape[2]] = image_1 * mixup_ratio mix_img[:, : image_2.shape[1], : image_2.shape[2]] += image_2 * (1.0 - mixup_ratio) From b83aedf84a1bcce310955cfc576a2615291e1e8c Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sun, 6 Nov 2022 16:37:52 +0100 Subject: [PATCH 08/32] Fix: Remove usage of soon to be deprecated to_tensor function --- torchvision/prototype/transforms/_mixup_detection.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py index 7e8295eac98..c7fbf180b34 100644 --- a/torchvision/prototype/transforms/_mixup_detection.py +++ b/torchvision/prototype/transforms/_mixup_detection.py @@ -42,7 +42,9 @@ def _extract_image_targets(self, flat_sample: List[Any]) -> Tuple[List[Any], Lis if isinstance(obj, features.Image) or is_simple_tensor(obj): images.append(obj) elif isinstance(obj, PIL.Image.Image): - images.append(F.to_tensor(obj)) + obj = F.to_image_tensor(obj) + obj = F.convert_image_dtype(obj, dtype=torch.float32) + images.append(obj) elif isinstance(obj, features.BoundingBox): bboxes.append(obj) elif isinstance(obj, (features.Label, features.OneHotLabel)): From 90799b8cd3aec583d6feb266e22d9345fdc4e519 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sun, 6 Nov 2022 16:59:53 +0100 Subject: [PATCH 09/32] Remove: get params for mixup --- torchvision/prototype/transforms/_mixup_detection.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py index c7fbf180b34..ea7293878cb 100644 --- a/torchvision/prototype/transforms/_mixup_detection.py +++ b/torchvision/prototype/transforms/_mixup_detection.py @@ -29,11 +29,6 @@ def __init__( super().__init__() self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) - def _get_params(self) -> Dict[str, Any]: - # TODO: Retrieve the params from the input sample - mixup_ratio = self._dist.sample().item() - return {"mixup_ratio": mixup_ratio} - def _extract_image_targets(self, flat_sample: List[Any]) -> Tuple[List[Any], List[Dict[str, Any]]]: # fetch all images, bboxes and labels from unstructured input # with List[image], List[BoundingBox], List[Label] @@ -135,13 +130,15 @@ def _mixup( """ Performs mixup on the given images and targets. 
""" - mixup_ratio = self._get_params().get("mixup_ratio") + mixup_ratio = self._dist.sample().item() + print(mixup_ratio) + c_1, h_1, w_1 = image_1.shape c_2, h_2, w_2 = image_2.shape h_mixup = max(h_1, h_2) w_mixup = max(w_1, w_2) - if mixup_ratio >= 1: + if mixup_ratio >= 1.0: return image_1, target_1 # mixup images and prevent the object aspect ratio from changing From 248737d98be4fba224eb60d090485ae74262f5a7 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Mon, 7 Nov 2022 10:47:28 +0100 Subject: [PATCH 10/32] Update _mixup_detection.py --- torchvision/prototype/transforms/_mixup_detection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py index ea7293878cb..5e6d34cf87c 100644 --- a/torchvision/prototype/transforms/_mixup_detection.py +++ b/torchvision/prototype/transforms/_mixup_detection.py @@ -42,13 +42,13 @@ def _extract_image_targets(self, flat_sample: List[Any]) -> Tuple[List[Any], Lis images.append(obj) elif isinstance(obj, features.BoundingBox): bboxes.append(obj) - elif isinstance(obj, (features.Label, features.OneHotLabel)): + elif isinstance(obj, (features.Label)): labels.append(obj) if not (len(images) == len(bboxes) == len(labels)): raise TypeError( f"{type(self).__name__}() requires input sample to contain equal-sized list of Images, " - "BoundingBoxes, and Labels or OneHotLabels." + "BoundingBoxes and Labels." ) targets = [] @@ -70,7 +70,7 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: has_any(flat_inputs, features.Image, PIL.Image.Image, features.is_simple_tensor) and has_any(flat_inputs, features.BoundingBox) ): - raise TypeError(f"{type(self).__name__}() is only defined for tensor images/videos and bounding boxes.") + raise TypeError(f"{type(self).__name__}() is only defined for tensor images and bounding boxes.") def _insert_outputs( self, flat_sample: List[Any], output_images: List[Any], output_targets: List[Dict[str, Any]] @@ -89,7 +89,7 @@ def _insert_outputs( elif isinstance(obj, features.BoundingBox): flat_sample[i] = features.BoundingBox.wrap_like(obj, output_targets[c1]["boxes"]) c1 += 1 - elif isinstance(obj, (features.Label, features.OneHotLabel)): + elif isinstance(obj, (features.Label)): flat_sample[i] = obj.wrap_like(obj, output_targets[c2]["labels"]) # type: ignore[arg-type] c2 += 1 From 26316a4c4d09e4f7b17df8caf1812f6b1872fec8 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Mon, 7 Nov 2022 11:12:17 +0100 Subject: [PATCH 11/32] Remove unused type: ignore due to failing CI test --- torchvision/prototype/transforms/_mixup_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py index 5e6d34cf87c..0a3e7028b31 100644 --- a/torchvision/prototype/transforms/_mixup_detection.py +++ b/torchvision/prototype/transforms/_mixup_detection.py @@ -90,7 +90,7 @@ def _insert_outputs( flat_sample[i] = features.BoundingBox.wrap_like(obj, output_targets[c1]["boxes"]) c1 += 1 elif isinstance(obj, (features.Label)): - flat_sample[i] = obj.wrap_like(obj, output_targets[c2]["labels"]) # type: ignore[arg-type] + flat_sample[i] = obj.wrap_like(obj, output_targets[c2]["labels"]) c2 += 1 def forward(self, *inputs: Any) -> Any: From 04c80d76b0bc7cc58c7f5537f6e857d1cdd49587 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 8 Nov 2022 13:08:53 +0100 Subject: [PATCH 12/32] add batch detection helpers --- 
torchvision/prototype/transforms/_augment.py | 142 ++++++++++--------- 1 file changed, 72 insertions(+), 70 deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index cf861c46d24..e1425d81101 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -1,17 +1,17 @@ import math import numbers import warnings -from typing import Any, cast, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type, Union import PIL.Image import torch -from torch.utils._pytree import tree_flatten, tree_unflatten +from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec from torchvision.ops import masks_to_boxes from torchvision.prototype import features from torchvision.prototype.transforms import functional as F, InterpolationMode from ._transform import _RandomApplyTransform -from ._utils import has_any, query_chw, query_spatial_size +from ._utils import _isinstance, has_any, query_chw, query_spatial_size class RandomErasing(_RandomApplyTransform): @@ -190,6 +190,40 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: return inpt +def flatten_and_extract( + inputs: Any, **types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...] +) -> Tuple[Tuple[List[Any], TreeSpec, Dict[str, List[int]]], Dict[str, List[Any]]]: + flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) + + idcs: Dict[str, List[int]] = {key: [] for key in types_or_checks.keys()} + inputs: Dict[str, List[Any]] = {key: [] for key in types_or_checks.keys()} + for idx, inpt in enumerate(flat_inputs): + for key, types_or_checks_ in types_or_checks.items(): + if _isinstance(inpt, types_or_checks_): + inputs[key].append(inpt) + idcs[key].append(idx) + break + + num_inputs = [len(inputs_) for inputs_ in inputs.values()] + if not all(num_inputs_ == num_inputs[0] for num_inputs_ in num_inputs[1:]): + raise TypeError("FIXME") + + return (flat_inputs, spec, idcs), inputs + + +def unflatten_and_insert( + flat_inputs_with_spec: Tuple[List[Any], TreeSpec, Dict[str, List[int]]], + outputs: Dict[str, List[Any]], +) -> Any: + flat_inputs, spec, idcs = flat_inputs_with_spec + + for key, idcs_ in idcs.items(): + for idx, output in zip(idcs_, outputs[key]): + flat_inputs[idx] = output + + return tree_unflatten(flat_inputs, spec) + + class SimpleCopyPaste(_RandomApplyTransform): def __init__( self, @@ -205,15 +239,23 @@ def __init__( def _copy_paste( self, - image: features.TensorImageType, + image: features.ImageType, target: Dict[str, Any], - paste_image: features.TensorImageType, + paste_image: features.ImageType, paste_target: Dict[str, Any], random_selection: torch.Tensor, blending: bool, resize_interpolation: F.InterpolationMode, antialias: Optional[bool], - ) -> Tuple[features.TensorImageType, Dict[str, Any]]: + ) -> Tuple[features.ImageType, Dict[str, Any]]: + if isinstance(image, features.Image): + out_image = image.as_subclass(torch.Tensor) + paste_image = paste_image.as_subclass(torch.Tensor) + elif isinstance(image, PIL.Image.Image): + out_image = F.pil_to_tensor(image) + paste_image = F.pil_to_tensor(paste_image) + else: # features.is_simple_tensor(image) + out_image = image paste_masks = paste_target["masks"].wrap_like(paste_target["masks"], paste_target["masks"][random_selection]) paste_boxes = paste_target["boxes"].wrap_like(paste_target["boxes"], paste_target["boxes"][random_selection]) @@ -227,7 +269,7 @@ def _copy_paste( # 
This is something different to TF implementation we introduced here as # originally the algorithm works on equal-sized data # (for example, coming from LSJ data augmentations) - size1 = cast(List[int], image.shape[-2:]) + size1 = cast(List[int], out_image.shape[-2:]) size2 = paste_image.shape[-2:] if size1 != size2: paste_image = F.resize(paste_image, size=size1, interpolation=resize_interpolation, antialias=antialias) @@ -241,7 +283,7 @@ def _copy_paste( inverse_paste_alpha_mask = paste_alpha_mask.logical_not() # Copy-paste images: - image = image.mul(inverse_paste_alpha_mask).add_(paste_image.mul(paste_alpha_mask)) + out_image = out_image.mul(inverse_paste_alpha_mask).add_(paste_image.mul(paste_alpha_mask)) # Copy-paste masks: masks = masks * inverse_paste_alpha_mask @@ -281,69 +323,28 @@ def _copy_paste( out_target["masks"] = out_target["masks"][valid_targets] out_target["labels"] = out_target["labels"][valid_targets] - return image, out_target - - def _extract_image_targets( - self, flat_sample: List[Any] - ) -> Tuple[List[features.TensorImageType], List[Dict[str, Any]]]: - # fetch all images, bboxes, masks and labels from unstructured input - # with List[image], List[BoundingBox], List[Mask], List[Label] - images, bboxes, masks, labels = [], [], [], [] - for obj in flat_sample: - if isinstance(obj, features.Image) or features.is_simple_tensor(obj): - images.append(obj) - elif isinstance(obj, PIL.Image.Image): - images.append(F.to_image_tensor(obj)) - elif isinstance(obj, features.BoundingBox): - bboxes.append(obj) - elif isinstance(obj, features.Mask): - masks.append(obj) - elif isinstance(obj, (features.Label, features.OneHotLabel)): - labels.append(obj) - - if not (len(images) == len(bboxes) == len(masks) == len(labels)): - raise TypeError( - f"{type(self).__name__}() requires input sample to contain equal sized list of Images, " - "BoundingBoxes, Masks and Labels or OneHotLabels." 
- ) + if isinstance(image, features.Image): + out_image = features.Image.wrap_like(image, out_image) + elif isinstance(image, PIL.Image.Image): + out_image = F.to_image_pil(out_image) - targets = [] - for bbox, mask, label in zip(bboxes, masks, labels): - targets.append({"boxes": bbox, "masks": mask, "labels": label}) + out_target["boxes"] = features.BoundingBox.wrap_like(target["boxes"], out_target["boxes"]) + out_target["masks"] = features.Mask.wrap_like(target["masks"], out_target["masks"]) + out_target["labels"] = features.Label.wrap_like(target["labels"], out_target["labels"]) - return images, targets - - def _insert_outputs( - self, - flat_sample: List[Any], - output_images: List[features.TensorImageType], - output_targets: List[Dict[str, Any]], - ) -> None: - c0, c1, c2, c3 = 0, 0, 0, 0 - for i, obj in enumerate(flat_sample): - if isinstance(obj, features.Image): - flat_sample[i] = features.Image.wrap_like(obj, output_images[c0]) - c0 += 1 - elif isinstance(obj, PIL.Image.Image): - flat_sample[i] = F.to_image_pil(output_images[c0]) - c0 += 1 - elif features.is_simple_tensor(obj): - flat_sample[i] = output_images[c0] - c0 += 1 - elif isinstance(obj, features.BoundingBox): - flat_sample[i] = features.BoundingBox.wrap_like(obj, output_targets[c1]["boxes"]) - c1 += 1 - elif isinstance(obj, features.Mask): - flat_sample[i] = features.Mask.wrap_like(obj, output_targets[c2]["masks"]) - c2 += 1 - elif isinstance(obj, (features.Label, features.OneHotLabel)): - flat_sample[i] = obj.wrap_like(obj, output_targets[c3]["labels"]) # type: ignore[arg-type] - c3 += 1 + return out_image, out_target def forward(self, *inputs: Any) -> Any: - flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) + flat_inputs_with_spec, inputs = flatten_and_extract( + inputs, + images=(features.Image, PIL.Image.Image, features.is_simple_tensor), + boxes=(features.BoundingBox,), + masks=(features.Mask,), + labels=(features.Label, features.OneHotLabel), + ) - images, targets = self._extract_image_targets(flat_inputs) + images = inputs.pop("images") + targets = [dict(zip(inputs.keys(), target)) for target in zip(*inputs.values())] # images = [t1, t2, ..., tN] # Let's define paste_images as shifted list of input images @@ -380,7 +381,8 @@ def forward(self, *inputs: Any) -> Any: output_images.append(output_image) output_targets.append(output_target) - # Insert updated images and targets into input flat_sample - self._insert_outputs(flat_inputs, output_images, output_targets) - - return tree_unflatten(flat_inputs, spec) + outputs = dict( + dict(zip(output_targets[0].keys(), zip(*(list(target.values()) for target in output_targets)))), + images=images, + ) + return unflatten_and_insert(flat_inputs_with_spec, outputs) From 5667c91984e17cb2c13f4fd226e8f9d3b3bf012e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 8 Nov 2022 13:22:50 +0100 Subject: [PATCH 13/32] use helpers in detection mixup --- .../prototype/transforms/_mixup_detection.py | 105 ++++++++---------- 1 file changed, 44 insertions(+), 61 deletions(-) diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py index 0a3e7028b31..0cd68c90d98 100644 --- a/torchvision/prototype/transforms/_mixup_detection.py +++ b/torchvision/prototype/transforms/_mixup_detection.py @@ -11,12 +11,12 @@ import PIL.Image import torch -from torch.utils._pytree import tree_flatten, tree_unflatten from torchvision.prototype import features -from torchvision.prototype.features._feature import 
is_simple_tensor from torchvision.prototype.transforms import functional as F, Transform from torchvision.prototype.transforms._utils import has_any +from ._augment import flatten_and_extract, unflatten_and_insert + class MixupDetection(Transform): _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image) @@ -29,34 +29,6 @@ def __init__( super().__init__() self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) - def _extract_image_targets(self, flat_sample: List[Any]) -> Tuple[List[Any], List[Dict[str, Any]]]: - # fetch all images, bboxes and labels from unstructured input - # with List[image], List[BoundingBox], List[Label] - images, bboxes, labels = [], [], [] - for obj in flat_sample: - if isinstance(obj, features.Image) or is_simple_tensor(obj): - images.append(obj) - elif isinstance(obj, PIL.Image.Image): - obj = F.to_image_tensor(obj) - obj = F.convert_image_dtype(obj, dtype=torch.float32) - images.append(obj) - elif isinstance(obj, features.BoundingBox): - bboxes.append(obj) - elif isinstance(obj, (features.Label)): - labels.append(obj) - - if not (len(images) == len(bboxes) == len(labels)): - raise TypeError( - f"{type(self).__name__}() requires input sample to contain equal-sized list of Images, " - "BoundingBoxes and Labels." - ) - - targets = [] - for bbox, label in zip(bboxes, labels): - targets.append({"boxes": bbox, "labels": label}) - - return images, targets - def _check_inputs(self, flat_inputs: List[Any]) -> None: if has_any(flat_inputs, features.Mask): raise TypeError(f"Masks are not supported by {type(self).__name__}()") @@ -72,32 +44,20 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: ): raise TypeError(f"{type(self).__name__}() is only defined for tensor images and bounding boxes.") - def _insert_outputs( - self, flat_sample: List[Any], output_images: List[Any], output_targets: List[Dict[str, Any]] - ) -> None: - c0, c1, c2 = 0, 0, 0 - for i, obj in enumerate(flat_sample): - if isinstance(obj, features.Image): - flat_sample[i] = features.Image.wrap_like(obj, output_images[c0]) - c0 += 1 - elif isinstance(obj, PIL.Image.Image): - flat_sample[i] = F.to_image_pil(output_images[c0]) - c0 += 1 - elif is_simple_tensor(obj): - flat_sample[i] = output_images[c0] - c0 += 1 - elif isinstance(obj, features.BoundingBox): - flat_sample[i] = features.BoundingBox.wrap_like(obj, output_targets[c1]["boxes"]) - c1 += 1 - elif isinstance(obj, (features.Label)): - flat_sample[i] = obj.wrap_like(obj, output_targets[c2]["labels"]) - c2 += 1 - def forward(self, *inputs: Any) -> Any: - flat_sample, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) - self._check_inputs(flat_sample) - - images, targets = self._extract_image_targets(flat_sample) + flat_inputs_with_spec, inputs = flatten_and_extract( + inputs, + images=(features.Image, PIL.Image.Image, features.is_simple_tensor), + boxes=(features.BoundingBox,), + labels=(features.Label, features.OneHotLabel), + ) + # TODO: refactor this since we have already extracted the images and boxes + self._check_inputs(flat_inputs_with_spec[0]) + + # TODO: this is copying the structure from `SimpleCopyPaste`. 
We should + # investigate if we want that or a different structure might be beneficial here + images = inputs.pop("images") + targets = [dict(zip(inputs.keys(), target)) for target in zip(*inputs.values())] # images = [t1, t2, ..., tN] # Let's define paste_images as shifted list of input images @@ -116,20 +76,34 @@ def forward(self, *inputs: Any) -> Any: output_images.append(output_image) output_targets.append(output_target) - # Insert updated images and targets into input flat_sample - self._insert_outputs(flat_sample, output_images, output_targets) - return tree_unflatten(flat_sample, spec) + # TODO: same as above + outputs = dict( + dict(zip(output_targets[0].keys(), zip(*(list(target.values()) for target in output_targets)))), + images=images, + ) + return unflatten_and_insert(flat_inputs_with_spec, outputs) def _mixup( self, - image_1: features.TensorImageType, + image_1: features.ImageType, target_1: Dict[str, Any], - image_2: features.TensorImageType, + image_2: features.ImageType, target_2: Dict[str, Any], - ) -> Tuple[features.TensorImageType, Dict[str, Any]]: + ) -> Tuple[features.ImageType, Dict[str, Any]]: """ Performs mixup on the given images and targets. """ + if isinstance(image_1, features.Image): + ref = image_1 + image_1 = image_1.as_subclass(torch.Tensor) + image_2 = image_2.as_subclass(torch.Tensor) + elif isinstance(image_1, PIL.Image.Image): + ref = None + image_1 = F.pil_to_tensor(image_1) + image_2 = F.pil_to_tensor(image_2) + else: # features.is_simple_tensor(image) + ref = None + mixup_ratio = self._dist.sample().item() print(mixup_ratio) @@ -159,4 +133,13 @@ def _mixup( mix_target.update(mixed_boxes) mix_target.update(mix_labels) + if isinstance(image_1, features.Image): + mix_img = features.Image.wrap_like(ref, mix_img) # type: ignore[arg-type] + elif isinstance(image_1, PIL.Image.Image): + mix_img = F.to_image_pil(mix_img) + + mix_target["boxes"] = features.BoundingBox.wrap_like(target_1["boxes"], mix_target["boxes"]) + mix_target["masks"] = features.Mask.wrap_like(target_1["masks"], mix_target["masks"]) + mix_target["labels"] = features.Label.wrap_like(target_1["labels"], mix_target["labels"]) + return mix_img, mix_target From e0724a31624d88a3d2d1a1e857ab9af08efa90c3 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 8 Nov 2022 17:40:12 +0100 Subject: [PATCH 14/32] refactor helpers --- .../prototype/datasets/_builtin/coco.py | 4 +- torchvision/prototype/transforms/_augment.py | 138 ++++++++++-------- .../prototype/transforms/_mixup_detection.py | 47 +----- 3 files changed, 91 insertions(+), 98 deletions(-) diff --git a/torchvision/prototype/datasets/_builtin/coco.py b/torchvision/prototype/datasets/_builtin/coco.py index 72d76f48783..2852c086786 100644 --- a/torchvision/prototype/datasets/_builtin/coco.py +++ b/torchvision/prototype/datasets/_builtin/coco.py @@ -24,7 +24,7 @@ path_accessor, read_categories_file, ) -from torchvision.prototype.features import _Feature, BoundingBox, EncodedImage, Label +from torchvision.prototype.features import _Feature, BoundingBox, EncodedImage, Label, Mask from .._api import register_dataset, register_info @@ -114,7 +114,7 @@ def _decode_instances_anns(self, anns: List[Dict[str, Any]], image_meta: Dict[st labels = [ann["category_id"] for ann in anns] return dict( # TODO: create a segmentation feature - segmentations=_Feature( + segmentations=Mask( torch.stack( [ self._segmentation_to_mask( diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 
e1425d81101..02464fad737 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -190,38 +190,88 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: return inpt -def flatten_and_extract( - inputs: Any, **types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...] -) -> Tuple[Tuple[List[Any], TreeSpec, Dict[str, List[int]]], Dict[str, List[Any]]]: - flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0]) - - idcs: Dict[str, List[int]] = {key: [] for key in types_or_checks.keys()} - inputs: Dict[str, List[Any]] = {key: [] for key in types_or_checks.keys()} - for idx, inpt in enumerate(flat_inputs): - for key, types_or_checks_ in types_or_checks.items(): - if _isinstance(inpt, types_or_checks_): - inputs[key].append(inpt) - idcs[key].append(idx) +def flatten_and_extract_data( + inputs: Any, **target_types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...] +) -> Tuple[Tuple[List[Any], TreeSpec, List[Dict[str, int]]], List[features.TensorImageType], List[Dict[str, Any]]]: + # Images are special in the sense that they will always be extracted and returned + # separately. Internally however, they behave just as the other features. + types_or_checks: Dict[str, Tuple[Union[Type, Callable[[Any], bool]], ...]] = { + "images": (features.Image, PIL.Image.Image, features.is_simple_tensor), + **target_types_or_checks, + } + + batch = inputs if len(inputs) > 1 else inputs[0] + flat_batch = [] + sample_specs = [] + + offset = 0 + batch_idcs = [] + batch_data = [] + for sample_idx, sample in enumerate(batch): + flat_sample, sample_spec = tree_flatten(sample) + flat_batch.extend(flat_sample) + sample_specs.append(sample_spec) + + sample_types_or_checks = types_or_checks.copy() + sample_idcs = {} + sample_data = {} + for flat_idx, item in enumerate(flat_sample, offset): + if not sample_types_or_checks: break - num_inputs = [len(inputs_) for inputs_ in inputs.values()] - if not all(num_inputs_ == num_inputs[0] for num_inputs_ in num_inputs[1:]): - raise TypeError("FIXME") + for key, types_or_checks_ in sample_types_or_checks.items(): + if _isinstance(item, types_or_checks_): + break + else: + continue + + del sample_types_or_checks[key] + sample_idcs[key] = flat_idx + sample_data[key] = item + + if sample_types_or_checks: + # TODO: improve message + raise TypeError(f"Sample at index {sample_idx} in the batch is missing {sample_types_or_checks.keys()}`") + + batch_idcs.append(sample_idcs) + batch_data.append(sample_data) + offset += len(flat_sample) - return (flat_inputs, spec, idcs), inputs + batch_spec = TreeSpec(list, context=None, children_specs=sample_specs) + targets = batch_data + batch_data = [] + for target in targets: + image = target.pop("images") + if isinstance(image, features.Image): + image = image.as_subclass(torch.Tensor) + elif isinstance(image, PIL.Image.Image): + image = F.pil_to_tensor(image) + batch_data.append(image) + + return (flat_batch, batch_spec, batch_idcs), batch_data, targets -def unflatten_and_insert( - flat_inputs_with_spec: Tuple[List[Any], TreeSpec, Dict[str, List[int]]], - outputs: Dict[str, List[Any]], + +def unflatten_and_insert_data( + flat_batch_with_spec: Tuple[List[Any], TreeSpec, List[Dict[str, int]]], + images: List[features.TensorImageType], + targets: List[Dict[str, Any]], ) -> Any: - flat_inputs, spec, idcs = flat_inputs_with_spec + flat_batch, batch_spec, batch_idcs = flat_batch_with_spec + + for sample_idx, sample_idcs in enumerate(batch_idcs): + for key, 
flat_idx in sample_idcs.items(): + item = images[sample_idx] if key == "images" else targets[sample_idx][key] - for key, idcs_ in idcs.items(): - for idx, output in zip(idcs_, outputs[key]): - flat_inputs[idx] = output + inpt = flat_batch[flat_idx] + if isinstance(inpt, features._Feature): + item = type(inpt).wrap_like(inpt, item) + elif isinstance(inpt, PIL.Image.Image): + item = F.to_image_pil(item) - return tree_unflatten(flat_inputs, spec) + flat_batch[flat_idx] = item + + return tree_unflatten(flat_batch, batch_spec) class SimpleCopyPaste(_RandomApplyTransform): @@ -239,24 +289,15 @@ def __init__( def _copy_paste( self, - image: features.ImageType, + image: features.TensorImageType, target: Dict[str, Any], - paste_image: features.ImageType, + paste_image: features.TensorImageType, paste_target: Dict[str, Any], random_selection: torch.Tensor, blending: bool, resize_interpolation: F.InterpolationMode, antialias: Optional[bool], - ) -> Tuple[features.ImageType, Dict[str, Any]]: - if isinstance(image, features.Image): - out_image = image.as_subclass(torch.Tensor) - paste_image = paste_image.as_subclass(torch.Tensor) - elif isinstance(image, PIL.Image.Image): - out_image = F.pil_to_tensor(image) - paste_image = F.pil_to_tensor(paste_image) - else: # features.is_simple_tensor(image) - out_image = image - + ) -> Tuple[features.TensorImageType, Dict[str, Any]]: paste_masks = paste_target["masks"].wrap_like(paste_target["masks"], paste_target["masks"][random_selection]) paste_boxes = paste_target["boxes"].wrap_like(paste_target["boxes"], paste_target["boxes"][random_selection]) paste_labels = paste_target["labels"].wrap_like( @@ -269,7 +310,7 @@ def _copy_paste( # This is something different to TF implementation we introduced here as # originally the algorithm works on equal-sized data # (for example, coming from LSJ data augmentations) - size1 = cast(List[int], out_image.shape[-2:]) + size1 = cast(List[int], image.shape[-2:]) size2 = paste_image.shape[-2:] if size1 != size2: paste_image = F.resize(paste_image, size=size1, interpolation=resize_interpolation, antialias=antialias) @@ -283,7 +324,7 @@ def _copy_paste( inverse_paste_alpha_mask = paste_alpha_mask.logical_not() # Copy-paste images: - out_image = out_image.mul(inverse_paste_alpha_mask).add_(paste_image.mul(paste_alpha_mask)) + out_image = image.mul(inverse_paste_alpha_mask).add_(paste_image.mul(paste_alpha_mask)) # Copy-paste masks: masks = masks * inverse_paste_alpha_mask @@ -323,29 +364,16 @@ def _copy_paste( out_target["masks"] = out_target["masks"][valid_targets] out_target["labels"] = out_target["labels"][valid_targets] - if isinstance(image, features.Image): - out_image = features.Image.wrap_like(image, out_image) - elif isinstance(image, PIL.Image.Image): - out_image = F.to_image_pil(out_image) - - out_target["boxes"] = features.BoundingBox.wrap_like(target["boxes"], out_target["boxes"]) - out_target["masks"] = features.Mask.wrap_like(target["masks"], out_target["masks"]) - out_target["labels"] = features.Label.wrap_like(target["labels"], out_target["labels"]) - return out_image, out_target def forward(self, *inputs: Any) -> Any: - flat_inputs_with_spec, inputs = flatten_and_extract( + flat_batch_with_spec, images, targets = flatten_and_extract_data( inputs, - images=(features.Image, PIL.Image.Image, features.is_simple_tensor), boxes=(features.BoundingBox,), masks=(features.Mask,), labels=(features.Label, features.OneHotLabel), ) - images = inputs.pop("images") - targets = [dict(zip(inputs.keys(), target)) for target in 
zip(*inputs.values())] - # images = [t1, t2, ..., tN] # Let's define paste_images as shifted list of input images # paste_images = [t2, t3, ..., tN, t1] @@ -381,8 +409,4 @@ def forward(self, *inputs: Any) -> Any: output_images.append(output_image) output_targets.append(output_target) - outputs = dict( - dict(zip(output_targets[0].keys(), zip(*(list(target.values()) for target in output_targets)))), - images=images, - ) - return unflatten_and_insert(flat_inputs_with_spec, outputs) + return unflatten_and_insert_data(flat_batch_with_spec, output_images, output_targets) diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py index 0cd68c90d98..d63f1f6177e 100644 --- a/torchvision/prototype/transforms/_mixup_detection.py +++ b/torchvision/prototype/transforms/_mixup_detection.py @@ -12,10 +12,10 @@ import torch from torchvision.prototype import features -from torchvision.prototype.transforms import functional as F, Transform +from torchvision.prototype.transforms import Transform from torchvision.prototype.transforms._utils import has_any -from ._augment import flatten_and_extract, unflatten_and_insert +from ._augment import flatten_and_extract_data, unflatten_and_insert_data class MixupDetection(Transform): @@ -45,19 +45,13 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: raise TypeError(f"{type(self).__name__}() is only defined for tensor images and bounding boxes.") def forward(self, *inputs: Any) -> Any: - flat_inputs_with_spec, inputs = flatten_and_extract( + flat_batch_with_spec, images, targets = flatten_and_extract_data( inputs, - images=(features.Image, PIL.Image.Image, features.is_simple_tensor), boxes=(features.BoundingBox,), labels=(features.Label, features.OneHotLabel), ) # TODO: refactor this since we have already extracted the images and boxes - self._check_inputs(flat_inputs_with_spec[0]) - - # TODO: this is copying the structure from `SimpleCopyPaste`. We should - # investigate if we want that or a different structure might be beneficial here - images = inputs.pop("images") - targets = [dict(zip(inputs.keys(), target)) for target in zip(*inputs.values())] + self._check_inputs(flat_batch_with_spec[0]) # images = [t1, t2, ..., tN] # Let's define paste_images as shifted list of input images @@ -76,34 +70,18 @@ def forward(self, *inputs: Any) -> Any: output_images.append(output_image) output_targets.append(output_target) - # TODO: same as above - outputs = dict( - dict(zip(output_targets[0].keys(), zip(*(list(target.values()) for target in output_targets)))), - images=images, - ) - return unflatten_and_insert(flat_inputs_with_spec, outputs) + return unflatten_and_insert_data(flat_batch_with_spec, output_images, output_targets) def _mixup( self, - image_1: features.ImageType, + image_1: features.TensorImageType, target_1: Dict[str, Any], - image_2: features.ImageType, + image_2: features.TensorImageType, target_2: Dict[str, Any], - ) -> Tuple[features.ImageType, Dict[str, Any]]: + ) -> Tuple[features.TensorImageType, Dict[str, Any]]: """ Performs mixup on the given images and targets. 
""" - if isinstance(image_1, features.Image): - ref = image_1 - image_1 = image_1.as_subclass(torch.Tensor) - image_2 = image_2.as_subclass(torch.Tensor) - elif isinstance(image_1, PIL.Image.Image): - ref = None - image_1 = F.pil_to_tensor(image_1) - image_2 = F.pil_to_tensor(image_2) - else: # features.is_simple_tensor(image) - ref = None - mixup_ratio = self._dist.sample().item() print(mixup_ratio) @@ -133,13 +111,4 @@ def _mixup( mix_target.update(mixed_boxes) mix_target.update(mix_labels) - if isinstance(image_1, features.Image): - mix_img = features.Image.wrap_like(ref, mix_img) # type: ignore[arg-type] - elif isinstance(image_1, PIL.Image.Image): - mix_img = F.to_image_pil(mix_img) - - mix_target["boxes"] = features.BoundingBox.wrap_like(target_1["boxes"], mix_target["boxes"]) - mix_target["masks"] = features.Mask.wrap_like(target_1["masks"], mix_target["masks"]) - mix_target["labels"] = features.Label.wrap_like(target_1["labels"], mix_target["labels"]) - return mix_img, mix_target From 61770576e84afc33371508219f67dc530f60c75d Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 1 Dec 2022 08:03:26 +0100 Subject: [PATCH 15/32] revert accidental COCO change --- torchvision/prototype/datasets/_builtin/coco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/datasets/_builtin/coco.py b/torchvision/prototype/datasets/_builtin/coco.py index fb0f66df5aa..4ec4580e780 100644 --- a/torchvision/prototype/datasets/_builtin/coco.py +++ b/torchvision/prototype/datasets/_builtin/coco.py @@ -114,7 +114,7 @@ def _decode_instances_anns(self, anns: List[Dict[str, Any]], image_meta: Dict[st labels = [ann["category_id"] for ann in anns] return dict( # TODO: create a segmentation feature - segmentations=Mask( + segmentations=_Feature( torch.stack( [ self._segmentation_to_mask( From 2b67017b4a041955c3d2e2c8e0742a7a58732589 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sun, 4 Dec 2022 16:12:41 +0100 Subject: [PATCH 16/32] Move: mixup detection to _augment.py --- torchvision/prototype/transforms/__init__.py | 3 +- torchvision/prototype/transforms/_augment.py | 93 +++++++++++++- .../prototype/transforms/_mixup_detection.py | 114 ------------------ 3 files changed, 93 insertions(+), 117 deletions(-) delete mode 100644 torchvision/prototype/transforms/_mixup_detection.py diff --git a/torchvision/prototype/transforms/__init__.py b/torchvision/prototype/transforms/__init__.py index f8052b721e6..13eb216813e 100644 --- a/torchvision/prototype/transforms/__init__.py +++ b/torchvision/prototype/transforms/__init__.py @@ -5,7 +5,7 @@ from ._transform import Transform # usort: skip from ._presets import StereoMatching # usort: skip -from ._augment import RandomCutmix, RandomErasing, RandomMixup, SimpleCopyPaste +from ._augment import MixupDetection, RandomCutmix, RandomErasing, RandomMixup, SimpleCopyPaste from ._auto_augment import AugMix, AutoAugment, RandAugment, TrivialAugmentWide from ._color import ( ColorJitter, @@ -51,7 +51,6 @@ ToDtype, TransposeDimensions, ) -from ._mixup_detection import MixupDetection from ._temporal import UniformTemporalSubsample from ._type_conversion import LabelToOneHot, PILToTensor, ToImagePIL, ToImageTensor, ToPILImage diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index b39680cbb68..6940b7fb50c 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -8,7 +8,7 @@ from torch.utils._pytree import tree_flatten, tree_unflatten, 
TreeSpec from torchvision.ops import masks_to_boxes from torchvision.prototype import features -from torchvision.prototype.transforms import functional as F, InterpolationMode +from torchvision.prototype.transforms import functional as F, InterpolationMode, Transform from ._transform import _RandomApplyTransform from .utils import check_type, has_any, query_chw, query_spatial_size @@ -410,3 +410,94 @@ def forward(self, *inputs: Any) -> Any: output_targets.append(output_target) return unflatten_and_insert_data(flat_batch_with_spec, output_images, output_targets) + + +class MixupDetection(Transform): + _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image) + + def __init__( + self, + *, + alpha: float = 1.5, + ) -> None: + super().__init__() + self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) + + def _check_inputs(self, flat_inputs: List[Any]) -> None: + if has_any(flat_inputs, features.Mask, features.Video): + raise TypeError(f"{type(self).__name__}() is only supported for images and bounding boxes.") + + if not ( + has_any(flat_inputs, features.Image, PIL.Image.Image, features.is_simple_tensor) + and has_any(flat_inputs, features.BoundingBox) + ): + raise TypeError(f"{type(self).__name__}() is only defined for tensor images and bounding boxes.") + + def forward(self, *inputs: Any) -> Any: + flat_batch_with_spec, images, targets = flatten_and_extract_data( + inputs, + boxes=(features.BoundingBox,), + labels=(features.Label, features.OneHotLabel), + ) + # TODO: refactor this since we have already extracted the images and boxes + self._check_inputs(flat_batch_with_spec[0]) + + # images = [t1, t2, ..., tN] + # Let's define paste_images as shifted list of input images + # paste_images = [tN, t1, ..., tN-1,] + images_rolled = images[-1:] + images[:-1] + targets_rolled = targets[-1:] + targets[:-1] + + output_images, output_targets = [], [] + for image_1, target_1, image_2, target_2 in zip(images, targets, images_rolled, targets_rolled): + output_image, output_target = self._mixup( + image_1, + target_1, + image_2, + target_2, + ) + output_images.append(output_image) + output_targets.append(output_target) + + return unflatten_and_insert_data(flat_batch_with_spec, output_images, output_targets) + + def _mixup( + self, + image_1: features.TensorImageType, + target_1: Dict[str, Any], + image_2: features.TensorImageType, + target_2: Dict[str, Any], + ) -> Tuple[features.TensorImageType, Dict[str, Any]]: + """ + Performs mixup on the given images and targets. 
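Note: the mixing ratio is drawn from a symmetric Beta(alpha, alpha) distribution, with alpha=1.5 as the constructor default above. A quick standalone check of what that produces (values strictly inside (0, 1), centred around 0.5):

    import torch

    alpha = 1.5  # default used above
    dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha]))

    ratios = dist.sample((10_000,))
    print(float(ratios.mean()))                      # ~0.5
    print(float(ratios.min()), float(ratios.max()))  # both inside (0, 1)

Larger alpha concentrates the ratio around 0.5 so both images contribute roughly equally, while alpha below 1 pushes the samples towards 0 or 1, i.e. towards keeping mostly one of the two images.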
+ """ + mixup_ratio = self._dist.sample().item() + print(mixup_ratio) + + c_1, h_1, w_1 = image_1.shape + c_2, h_2, w_2 = image_2.shape + h_mixup = max(h_1, h_2) + w_mixup = max(w_1, w_2) + + if mixup_ratio >= 1.0: + return image_1, target_1 + + # mixup images and prevent the object aspect ratio from changing + mix_img = torch.zeros(c_1, h_mixup, w_mixup, dtype=torch.float32) + mix_img[:, : image_1.shape[1], : image_1.shape[2]] = image_1 * mixup_ratio + mix_img[:, : image_2.shape[1], : image_2.shape[2]] += image_2 * (1.0 - mixup_ratio) + # mixup targets + mix_target = {**target_1, **target_2} + box_format = target_1["boxes"].format + mixed_boxes = { + "boxes": features.BoundingBox( + torch.vstack((target_1["boxes"], target_2["boxes"])), + format=box_format, + spatial_size=(h_mixup, w_mixup), + ) + } + mix_labels = {"labels": torch.cat((target_1["labels"], target_2["labels"]))} + mix_target.update(mixed_boxes) + mix_target.update(mix_labels) + + return mix_img, mix_target diff --git a/torchvision/prototype/transforms/_mixup_detection.py b/torchvision/prototype/transforms/_mixup_detection.py deleted file mode 100644 index d63f1f6177e..00000000000 --- a/torchvision/prototype/transforms/_mixup_detection.py +++ /dev/null @@ -1,114 +0,0 @@ -""" -This script is used to apply the mixup transform for Object detection to the dataset. -The code is inspired from the paper: https://arxiv.org/abs/1902.0410 - -In a nutshell, mixup is a data augmentation technique that combines two images in the ratio of -beta to (1-beta) and this value of beta is sampled from a beta distribution. This makes our model -robust to the object being present in the image or not. Plus, it is kind of like a free lunch. -""" -from typing import Any, Dict, List, Tuple - -import PIL.Image - -import torch -from torchvision.prototype import features -from torchvision.prototype.transforms import Transform -from torchvision.prototype.transforms._utils import has_any - -from ._augment import flatten_and_extract_data, unflatten_and_insert_data - - -class MixupDetection(Transform): - _transformed_types = (features.is_simple_tensor, features.Image, PIL.Image) - - def __init__( - self, - *, - alpha: float = 1.5, - ) -> None: - super().__init__() - self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) - - def _check_inputs(self, flat_inputs: List[Any]) -> None: - if has_any(flat_inputs, features.Mask): - raise TypeError(f"Masks are not supported by {type(self).__name__}()") - - if not has_any(flat_inputs, PIL.Image.Image, features.Image, features.is_simple_tensor): - raise TypeError( - f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video." 
- ) - - if not ( - has_any(flat_inputs, features.Image, PIL.Image.Image, features.is_simple_tensor) - and has_any(flat_inputs, features.BoundingBox) - ): - raise TypeError(f"{type(self).__name__}() is only defined for tensor images and bounding boxes.") - - def forward(self, *inputs: Any) -> Any: - flat_batch_with_spec, images, targets = flatten_and_extract_data( - inputs, - boxes=(features.BoundingBox,), - labels=(features.Label, features.OneHotLabel), - ) - # TODO: refactor this since we have already extracted the images and boxes - self._check_inputs(flat_batch_with_spec[0]) - - # images = [t1, t2, ..., tN] - # Let's define paste_images as shifted list of input images - # paste_images = [tN, t1, ..., tN-1,] - images_rolled = images[-1:] + images[:-1] - targets_rolled = targets[-1:] + targets[:-1] - - output_images, output_targets = [], [] - for image_1, target_1, image_2, target_2 in zip(images, targets, images_rolled, targets_rolled): - output_image, output_target = self._mixup( - image_1, - target_1, - image_2, - target_2, - ) - output_images.append(output_image) - output_targets.append(output_target) - - return unflatten_and_insert_data(flat_batch_with_spec, output_images, output_targets) - - def _mixup( - self, - image_1: features.TensorImageType, - target_1: Dict[str, Any], - image_2: features.TensorImageType, - target_2: Dict[str, Any], - ) -> Tuple[features.TensorImageType, Dict[str, Any]]: - """ - Performs mixup on the given images and targets. - """ - mixup_ratio = self._dist.sample().item() - print(mixup_ratio) - - c_1, h_1, w_1 = image_1.shape - c_2, h_2, w_2 = image_2.shape - h_mixup = max(h_1, h_2) - w_mixup = max(w_1, w_2) - - if mixup_ratio >= 1.0: - return image_1, target_1 - - # mixup images and prevent the object aspect ratio from changing - mix_img = torch.zeros(c_1, h_mixup, w_mixup, dtype=torch.float32) - mix_img[:, : image_1.shape[1], : image_1.shape[2]] = image_1 * mixup_ratio - mix_img[:, : image_2.shape[1], : image_2.shape[2]] += image_2 * (1.0 - mixup_ratio) - # mixup targets - mix_target = {**target_1, **target_2} - box_format = target_1["boxes"].format - mixed_boxes = { - "boxes": features.BoundingBox( - torch.vstack((target_1["boxes"], target_2["boxes"])), - format=box_format, - spatial_size=(h_mixup, w_mixup), - ) - } - mix_labels = {"labels": torch.cat((target_1["labels"], target_2["labels"]))} - mix_target.update(mixed_boxes) - mix_target.update(mix_labels) - - return mix_img, mix_target From ae9908b037f5c64334ddba5f61d378bcc699a458 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 6 Dec 2022 13:51:24 +0100 Subject: [PATCH 17/32] refactor extraction and insertion --- torchvision/prototype/transforms/_augment.py | 127 +++++++------------ 1 file changed, 47 insertions(+), 80 deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index ed52b19851a..cf84dfb9d8c 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -191,15 +191,8 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: def flatten_and_extract_data( - inputs: Any, **target_types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...] -) -> Tuple[Tuple[List[Any], TreeSpec, List[Dict[str, int]]], List[datapoints.TensorImageType], List[Dict[str, Any]]]: - # Images are special in the sense that they will always be extracted and returned - # separately. Internally however, they behave just as the other datapoints. 
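Note: the extraction helper refactored here is built on `torch.utils._pytree`, which turns an arbitrarily nested sample into a flat list of leaves plus a spec describing the structure, so individual leaves can be replaced by index and the original nesting restored afterwards. A small round-trip example, assuming a simple dict-style detection sample:

    import torch
    from torch.utils._pytree import tree_flatten, tree_unflatten

    sample = {
        "image": torch.rand(3, 4, 4),
        "target": {"boxes": torch.rand(2, 4), "labels": torch.tensor([1, 2])},
    }
    flat, spec = tree_flatten(sample)   # [image, boxes, labels], TreeSpec

    flat[0] = flat[0].flip(-1)          # edit a leaf in place by index
    rebuilt = tree_unflatten(flat, spec)
    print(rebuilt["target"]["labels"])  # untouched leaves come back unchanged

This is the same flatten/unflatten round trip the transform performs, with the extra bookkeeping of remembering which flat index held the image, boxes and labels of each sample.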
- types_or_checks: Dict[str, Tuple[Union[Type, Callable[[Any], bool]], ...]] = { - "images": (datapoints.Image, PIL.Image.Image, is_simple_tensor), - **target_types_or_checks, - } - + inputs: Any, **types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...] +) -> Tuple[Tuple[List[Any], TreeSpec, List[Dict[str, int]]], List[Dict[str, Any]]]: batch = inputs if len(inputs) > 1 else inputs[0] flat_batch = [] sample_specs = [] @@ -239,35 +232,25 @@ def flatten_and_extract_data( batch_spec = TreeSpec(list, context=None, children_specs=sample_specs) - targets = batch_data - batch_data = [] - for target in targets: - image = target.pop("images") - if isinstance(image, datapoints.Image): - image = image.as_subclass(torch.Tensor) - elif isinstance(image, PIL.Image.Image): - image = F.pil_to_tensor(image) - batch_data.append(image) - - return (flat_batch, batch_spec, batch_idcs), batch_data, targets + return (flat_batch, batch_spec, batch_idcs), batch_data def unflatten_and_insert_data( flat_batch_with_spec: Tuple[List[Any], TreeSpec, List[Dict[str, int]]], - images: List[datapoints.TensorImageType], - targets: List[Dict[str, Any]], + batch: List[Dict[str, Any]], ) -> Any: flat_batch, batch_spec, batch_idcs = flat_batch_with_spec for sample_idx, sample_idcs in enumerate(batch_idcs): for key, flat_idx in sample_idcs.items(): - item = images[sample_idx] if key == "images" else targets[sample_idx][key] - inpt = flat_batch[flat_idx] - if isinstance(inpt, datapoints._datapoint.Datapoint): - item = type(inpt).wrap_like(inpt, item) - elif isinstance(inpt, PIL.Image.Image): - item = F.to_image_pil(item) + item = batch[sample_idx][key] + + if not is_simple_tensor(inpt) and is_simple_tensor(item): + if isinstance(inpt, datapoints._datapoint.Datapoint): + item = type(inpt).wrap_like(inpt, item) + elif isinstance(inpt, PIL.Image.Image): + item = F.to_image_pil(item) flat_batch[flat_idx] = item @@ -434,70 +417,54 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: raise TypeError(f"{type(self).__name__}() is only defined for tensor images and bounding boxes.") def forward(self, *inputs: Any) -> Any: - flat_batch_with_spec, images, targets = flatten_and_extract_data( + flat_batch_with_spec, batch = flatten_and_extract_data( inputs, + image=(datapoints.Image, PIL.Image.Image, is_simple_tensor), boxes=(datapoints.BoundingBox,), labels=(datapoints.Label, datapoints.OneHotLabel), ) # TODO: refactor this since we have already extracted the images and boxes self._check_inputs(flat_batch_with_spec[0]) - # images = [t1, t2, ..., tN] - # Let's define paste_images as shifted list of input images - # paste_images = [tN, t1, ..., tN-1,] - images_rolled = images[-1:] + images[:-1] - targets_rolled = targets[-1:] + targets[:-1] - - output_images, output_targets = [], [] - for image_1, target_1, image_2, target_2 in zip(images, targets, images_rolled, targets_rolled): - output_image, output_target = self._mixup( - image_1, - target_1, - image_2, - target_2, - ) - output_images.append(output_image) - output_targets.append(output_target) + batch_output = [ + self._mixup(sample, sample_rolled) for sample, sample_rolled in zip(batch, batch[-1:] + batch[:-1]) + ] - return unflatten_and_insert_data(flat_batch_with_spec, output_images, output_targets) + return unflatten_and_insert_data(flat_batch_with_spec, batch_output) - def _mixup( - self, - image_1: datapoints.TensorImageType, - target_1: Dict[str, Any], - image_2: datapoints.TensorImageType, - target_2: Dict[str, Any], - ) -> Tuple[datapoints.TensorImageType, 
Dict[str, Any]]: - """ - Performs mixup on the given images and targets. - """ + def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any]) -> Dict[str, Any]: mixup_ratio = self._dist.sample().item() - print(mixup_ratio) - c_1, h_1, w_1 = image_1.shape - c_2, h_2, w_2 = image_2.shape + if mixup_ratio >= 1.0: + return sample_1 + + image_1 = sample_1["image"] + if isinstance(image_1, PIL.Image.Image): + image_1 = F.pil_to_tensor(image_1) + + image_2 = sample_2["image"] + if isinstance(image_2, PIL.Image.Image): + image_2 = F.pil_to_tensor(image_2) + + h_1, w_1 = image_1.shape[-2:] + h_2, w_2 = image_2.shape[-2:] h_mixup = max(h_1, h_2) w_mixup = max(w_1, w_2) - if mixup_ratio >= 1.0: - return image_1, target_1 - - # mixup images and prevent the object aspect ratio from changing - mix_img = torch.zeros(c_1, h_mixup, w_mixup, dtype=torch.float32) - mix_img[:, : image_1.shape[1], : image_1.shape[2]] = image_1 * mixup_ratio - mix_img[:, : image_2.shape[1], : image_2.shape[2]] += image_2 * (1.0 - mixup_ratio) - # mixup targets - mix_target = {**target_1, **target_2} - box_format = target_1["boxes"].format - mixed_boxes = { - "boxes": datapoints.BoundingBox( - torch.vstack((target_1["boxes"], target_2["boxes"])), - format=box_format, - spatial_size=(h_mixup, w_mixup), - ) - } - mix_labels = {"labels": torch.cat((target_1["labels"], target_2["labels"]))} - mix_target.update(mixed_boxes) - mix_target.update(mix_labels) + # TODO: add the option to fill this with something else than 0 + mix_image = F.pad_image_tensor(image_1 * mixup_ratio, padding=[0, 0, h_mixup - h_1, w_mixup - w_1], fill=None) + mix_image[:, :h_2, :w_2] += image_2 * (1.0 - mixup_ratio) + mix_image = mix_image.to(image_1) + + mix_boxes = datapoints.BoundingBox.wrap_like( + sample_1["boxes"], + torch.cat([sample_1["boxes"], sample_2["boxes"]], dim=-2), + spatial_size=(h_mixup, w_mixup), + ) + + mix_labels = datapoints.Label.wrap_like( + sample_1["labels"], + torch.cat([sample_1["labels"], sample_2["labels"]], dim=-1), + ) - return mix_img, mix_target + return dict(image=mix_image, boxes=mix_boxes, labels=mix_labels) From c2e2757fb656440c1b8aa9e43bf9491c3a325459 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sat, 17 Dec 2022 12:04:08 +0100 Subject: [PATCH 18/32] Fix: Failing SimpleCopyPaste and MixupDetection Failing tests --- test/test_prototype_transforms.py | 130 +++---------------- torchvision/prototype/transforms/_augment.py | 16 +-- 2 files changed, 28 insertions(+), 118 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 43596d90a82..d0d059b2b7a 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -27,7 +27,7 @@ from torchvision.prototype import datapoints, transforms from torchvision.prototype.transforms.utils import check_type -from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image, to_tensor +from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image BATCH_EXTRA_DIMS = [extra_dims for extra_dims in DEFAULT_EXTRA_DIMS if extra_dims] @@ -1437,63 +1437,6 @@ def create_fake_image(self, mocker, image_type): return PIL.Image.new("RGB", (32, 32), 123) return mocker.MagicMock(spec=image_type) - def test__extract_image_targets_assertion(self, mocker): - transform = transforms.SimpleCopyPaste() - - flat_sample = [ - # images, batch size = 2 - self.create_fake_image(mocker, datapoints.Image), - # labels, bboxes, masks - mocker.MagicMock(spec=datapoints.Label), - 
mocker.MagicMock(spec=datapoints.BoundingBox), - mocker.MagicMock(spec=datapoints.Mask), - # labels, bboxes, masks - mocker.MagicMock(spec=datapoints.BoundingBox), - mocker.MagicMock(spec=datapoints.Mask), - ] - - with pytest.raises(TypeError, match="requires input sample to contain equal sized list of Images"): - transform._extract_image_targets(flat_sample) - - @pytest.mark.parametrize("image_type", [datapoints.Image, PIL.Image.Image, torch.Tensor]) - @pytest.mark.parametrize("label_type", [datapoints.Label, datapoints.OneHotLabel]) - def test__extract_image_targets(self, image_type, label_type, mocker): - transform = transforms.SimpleCopyPaste() - - flat_sample = [ - # images, batch size = 2 - self.create_fake_image(mocker, image_type), - self.create_fake_image(mocker, image_type), - # labels, bboxes, masks - mocker.MagicMock(spec=label_type), - mocker.MagicMock(spec=datapoints.BoundingBox), - mocker.MagicMock(spec=datapoints.Mask), - # labels, bboxes, masks - mocker.MagicMock(spec=label_type), - mocker.MagicMock(spec=datapoints.BoundingBox), - mocker.MagicMock(spec=datapoints.Mask), - ] - - images, targets = transform._extract_image_targets(flat_sample) - - assert len(images) == len(targets) == 2 - if image_type == PIL.Image.Image: - torch.testing.assert_close(images[0], pil_to_tensor(flat_sample[0])) - torch.testing.assert_close(images[1], pil_to_tensor(flat_sample[1])) - else: - assert images[0] == flat_sample[0] - assert images[1] == flat_sample[1] - - for target in targets: - for key, type_ in [ - ("boxes", datapoints.BoundingBox), - ("masks", datapoints.Mask), - ("labels", label_type), - ]: - assert key in target - assert isinstance(target[key], type_) - assert target[key] in flat_sample - @pytest.mark.parametrize("label_type", [datapoints.Label, datapoints.OneHotLabel]) def test__copy_paste(self, label_type): image = 2 * torch.ones(3, 32, 32) @@ -1933,72 +1876,41 @@ def create_fake_image(self, mocker, image_type): return PIL.Image.new("RGB", (32, 32), 123) return mocker.MagicMock(spec=image_type) - def test__extract_image_targets_assertion(self, mocker): - transform = transforms.MixupDetection() - - flat_sample = [ - # images, batch size = 2 - self.create_fake_image(mocker, features.Image), - # labels, bboxes, masks - mocker.MagicMock(spec=features.Label), - mocker.MagicMock(spec=features.BoundingBox), - # labels, bboxes, masks - mocker.MagicMock(spec=features.BoundingBox), - ] - - with pytest.raises(TypeError, match="requires input sample to contain equal-sized list of Images"): - transform._extract_image_targets(flat_sample) - - @pytest.mark.parametrize("image_type", [features.Image, PIL.Image.Image, torch.Tensor]) - def test__extract_image_targets(self, image_type, mocker): - transform = transforms.MixupDetection() - - flat_sample = [ - # images, batch size = 2 - self.create_fake_image(mocker, image_type), - self.create_fake_image(mocker, image_type), - # labels, bboxes - mocker.MagicMock(spec=features.Label), - mocker.MagicMock(spec=features.BoundingBox), - # labels, bboxes - mocker.MagicMock(spec=features.Label), - mocker.MagicMock(spec=features.BoundingBox), - ] - - images, targets = transform._extract_image_targets(flat_sample) - - assert len(images) == len(targets) == 2 - if image_type == PIL.Image.Image: - torch.testing.assert_close(images[0], to_tensor(flat_sample[0])) - torch.testing.assert_close(images[1], to_tensor(flat_sample[1])) - else: - assert images[0] == flat_sample[0] - assert images[1] == flat_sample[1] - def test__mixup(self): image1 = 2 * torch.ones(3, 32, 
64) target_1 = { - "boxes": features.BoundingBox( + "boxes": datapoints.BoundingBox( torch.tensor([[0.0, 0.0, 10.0, 10.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", spatial_size=(32, 64), ), - "labels": features.Label(torch.tensor([1, 2])), + "labels": datapoints.Label(torch.tensor([1, 2])), } image2 = 10 * torch.ones(3, 64, 32) target_2 = { - "boxes": features.BoundingBox( + "boxes": datapoints.BoundingBox( torch.tensor([[10.0, 0.0, 20.0, 20.0], [10.0, 20.0, 30.0, 30.0]]), format="XYXY", spatial_size=(64, 32), ), - "labels": features.Label(torch.tensor([2, 3])), + "labels": datapoints.Label(torch.tensor([2, 3])), + } + + sample_1 = { + "image": image1, + "boxes": target_1["boxes"], + "labels": target_1["labels"], + } + sample_2 = { + "image": image2, + "boxes": target_2["boxes"], + "labels": target_2["labels"], } transform = transforms.MixupDetection() - output_image, output_target = transform._mixup(image1, target_1, image2, target_2) - assert output_image.shape == (3, 64, 64) - assert output_target["boxes"].spatial_size == (64, 64) - assert len(output_target["boxes"]) == 4 - assert len(output_target["labels"]) == 4 + output = transform._mixup(sample_1, sample_2) + assert output["image"].shape == (3, 64, 64) + assert output["boxes"].spatial_size == (64, 64) + assert len(output["boxes"]) == 4 + assert len(output["labels"]) == 4 diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index cf84dfb9d8c..094c00a3495 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -410,12 +410,6 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: if has_any(flat_inputs, datapoints.Mask, datapoints.Video): raise TypeError(f"{type(self).__name__}() is only supported for images and bounding boxes.") - if not ( - has_any(flat_inputs, datapoints.Image, PIL.Image.Image, is_simple_tensor) - and has_any(flat_inputs, datapoints.BoundingBox) - ): - raise TypeError(f"{type(self).__name__}() is only defined for tensor images and bounding boxes.") - def forward(self, *inputs: Any) -> Any: flat_batch_with_spec, batch = flatten_and_extract_data( inputs, @@ -446,15 +440,19 @@ def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any]) -> Dict[str if isinstance(image_2, PIL.Image.Image): image_2 = F.pil_to_tensor(image_2) - h_1, w_1 = image_1.shape[-2:] + c_1, h_1, w_1 = image_1.shape h_2, w_2 = image_2.shape[-2:] h_mixup = max(h_1, h_2) w_mixup = max(w_1, w_2) # TODO: add the option to fill this with something else than 0 - mix_image = F.pad_image_tensor(image_1 * mixup_ratio, padding=[0, 0, h_mixup - h_1, w_mixup - w_1], fill=None) + # mix_image = F.pad_image_tensor(image_1 * mixup_ratio, padding=[0, 0, h_mixup - h_1, w_mixup - w_1], fill=None) + # mix_image[:, :h_2, :w_2] += image_2 * (1.0 - mixup_ratio) + # mix_image = mix_image.to(image_1) + + mix_image = torch.zeros(c_1, h_mixup, w_mixup, dtype=torch.float32) + mix_image[:, :h_1, :w_1] = image_1 * mixup_ratio mix_image[:, :h_2, :w_2] += image_2 * (1.0 - mixup_ratio) - mix_image = mix_image.to(image_1) mix_boxes = datapoints.BoundingBox.wrap_like( sample_1["boxes"], From 5398c73036ffbd141fecd457122ddc23f79240e7 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 19 Dec 2022 11:20:29 +0100 Subject: [PATCH 19/32] sample ratio in get_params --- test/test_prototype_transforms.py | 2 +- torchvision/prototype/transforms/_augment.py | 21 ++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git 
a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index d0d059b2b7a..d0782aefa52 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1909,7 +1909,7 @@ def test__mixup(self): } transform = transforms.MixupDetection() - output = transform._mixup(sample_1, sample_2) + output = transform._mixup(sample_1, sample_2, 0.5) assert output["image"].shape == (3, 64, 64) assert output["boxes"].spatial_size == (64, 64) assert len(output["boxes"]) == 4 diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 094c00a3495..9640fd7cb69 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -410,6 +410,9 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: if has_any(flat_inputs, datapoints.Mask, datapoints.Video): raise TypeError(f"{type(self).__name__}() is only supported for images and bounding boxes.") + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + return dict(ratio=float(self._dist.sample())) + def forward(self, *inputs: Any) -> Any: flat_batch_with_spec, batch = flatten_and_extract_data( inputs, @@ -417,19 +420,17 @@ def forward(self, *inputs: Any) -> Any: boxes=(datapoints.BoundingBox,), labels=(datapoints.Label, datapoints.OneHotLabel), ) - # TODO: refactor this since we have already extracted the images and boxes self._check_inputs(flat_batch_with_spec[0]) batch_output = [ - self._mixup(sample, sample_rolled) for sample, sample_rolled in zip(batch, batch[-1:] + batch[:-1]) + self._mixup(sample, sample_rolled, self._get_params([])["ratio"]) + for sample, sample_rolled in zip(batch, batch[-1:] + batch[:-1]) ] return unflatten_and_insert_data(flat_batch_with_spec, batch_output) - def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any]) -> Dict[str, Any]: - mixup_ratio = self._dist.sample().item() - - if mixup_ratio >= 1.0: + def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], ratio: float) -> Dict[str, Any]: + if ratio >= 1.0: return sample_1 image_1 = sample_1["image"] @@ -446,13 +447,13 @@ def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any]) -> Dict[str w_mixup = max(w_1, w_2) # TODO: add the option to fill this with something else than 0 - # mix_image = F.pad_image_tensor(image_1 * mixup_ratio, padding=[0, 0, h_mixup - h_1, w_mixup - w_1], fill=None) - # mix_image[:, :h_2, :w_2] += image_2 * (1.0 - mixup_ratio) + # mix_image = F.pad_image_tensor(image_1 * ratio, padding=[0, 0, h_mixup - h_1, w_mixup - w_1], fill=None) + # mix_image[:, :h_2, :w_2] += image_2 * (1.0 - ratio) # mix_image = mix_image.to(image_1) mix_image = torch.zeros(c_1, h_mixup, w_mixup, dtype=torch.float32) - mix_image[:, :h_1, :w_1] = image_1 * mixup_ratio - mix_image[:, :h_2, :w_2] += image_2 * (1.0 - mixup_ratio) + mix_image[:, :h_1, :w_1] = image_1 * ratio + mix_image[:, :h_2, :w_2] += image_2 * (1.0 - ratio) mix_boxes = datapoints.BoundingBox.wrap_like( sample_1["boxes"], From 044ba0d0828809df55dddc5acfabb034ea1c4f1f Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 19 Dec 2022 11:33:05 +0100 Subject: [PATCH 20/32] fix padding --- torchvision/prototype/transforms/_augment.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 9640fd7cb69..99eaa85ac22 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py 
@@ -441,19 +441,15 @@ def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], ratio: floa if isinstance(image_2, PIL.Image.Image): image_2 = F.pil_to_tensor(image_2) - c_1, h_1, w_1 = image_1.shape + h_1, w_1 = image_1.shape[-2:] h_2, w_2 = image_2.shape[-2:] h_mixup = max(h_1, h_2) w_mixup = max(w_1, w_2) # TODO: add the option to fill this with something else than 0 - # mix_image = F.pad_image_tensor(image_1 * ratio, padding=[0, 0, h_mixup - h_1, w_mixup - w_1], fill=None) - # mix_image[:, :h_2, :w_2] += image_2 * (1.0 - ratio) - # mix_image = mix_image.to(image_1) - - mix_image = torch.zeros(c_1, h_mixup, w_mixup, dtype=torch.float32) - mix_image[:, :h_1, :w_1] = image_1 * ratio - mix_image[:, :h_2, :w_2] += image_2 * (1.0 - ratio) + mix_image = F.pad_image_tensor(image_1, padding=[0, 0, w_mixup - w_1, h_mixup - h_1], fill=None).mul_(ratio) + mix_image[..., :h_2, :w_2] = image_2 * (1.0 - ratio) + mix_image = mix_image.to(image_1) mix_boxes = datapoints.BoundingBox.wrap_like( sample_1["boxes"], From 884ace1c86f64d83b0768c190db22cb95f52e97c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 19 Dec 2022 11:38:03 +0100 Subject: [PATCH 21/32] perform image conversion upfront --- torchvision/prototype/transforms/_augment.py | 28 +++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 99eaa85ac22..3ac3b2e5371 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -422,6 +422,14 @@ def forward(self, *inputs: Any) -> Any: ) self._check_inputs(flat_batch_with_spec[0]) + for sample in batch: + image = sample.pop("image") + if isinstance(image, PIL.Image.Image): + image = F.pil_to_tensor(image) + elif isinstance(image, datapoints.Image): + image = image.as_subclass(torch.Tensor) + sample["image"] = image + batch_output = [ self._mixup(sample, sample_rolled, self._get_params([])["ratio"]) for sample, sample_rolled in zip(batch, batch[-1:] + batch[:-1]) @@ -433,23 +441,17 @@ def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], ratio: floa if ratio >= 1.0: return sample_1 - image_1 = sample_1["image"] - if isinstance(image_1, PIL.Image.Image): - image_1 = F.pil_to_tensor(image_1) - - image_2 = sample_2["image"] - if isinstance(image_2, PIL.Image.Image): - image_2 = F.pil_to_tensor(image_2) - - h_1, w_1 = image_1.shape[-2:] - h_2, w_2 = image_2.shape[-2:] + h_1, w_1 = sample_1["image"].shape[-2:] + h_2, w_2 = sample_2["image"].shape[-2:] h_mixup = max(h_1, h_2) w_mixup = max(w_1, w_2) # TODO: add the option to fill this with something else than 0 - mix_image = F.pad_image_tensor(image_1, padding=[0, 0, w_mixup - w_1, h_mixup - h_1], fill=None).mul_(ratio) - mix_image[..., :h_2, :w_2] = image_2 * (1.0 - ratio) - mix_image = mix_image.to(image_1) + mix_image = F.pad_image_tensor(sample_1["image"], padding=[0, 0, w_mixup - w_1, h_mixup - h_1], fill=None).mul_( + ratio + ) + mix_image[..., :h_2, :w_2] = sample_2["image"] * (1.0 - ratio) + mix_image = mix_image.to(sample_1["image"]) mix_boxes = datapoints.BoundingBox.wrap_like( sample_1["boxes"], From 99de2327fd6bd9e33866afcf6b2181443b33800b Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 19 Dec 2022 13:10:24 +0100 Subject: [PATCH 22/32] create base class --- torchvision/prototype/transforms/_augment.py | 91 ++----------------- .../prototype/transforms/_transform.py | 89 +++++++++++++++++- 2 files changed, 96 insertions(+), 84 
deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 3ac3b2e5371..78cadf9bc23 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -1,17 +1,17 @@ import math import numbers import warnings -from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type, Union +from typing import Any, cast, Dict, List, Optional, Tuple, Union import PIL.Image + import torch -from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec from torchvision.ops import masks_to_boxes from torchvision.prototype import datapoints -from torchvision.prototype.transforms import functional as F, InterpolationMode, Transform +from torchvision.prototype.transforms import functional as F, InterpolationMode -from ._transform import _RandomApplyTransform -from .utils import check_type, has_any, is_simple_tensor, query_chw, query_spatial_size +from ._transform import _DetectionBatchTransform, _RandomApplyTransform +from .utils import has_any, is_simple_tensor, query_chw, query_spatial_size class RandomErasing(_RandomApplyTransform): @@ -190,73 +190,6 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: return inpt -def flatten_and_extract_data( - inputs: Any, **types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...] -) -> Tuple[Tuple[List[Any], TreeSpec, List[Dict[str, int]]], List[Dict[str, Any]]]: - batch = inputs if len(inputs) > 1 else inputs[0] - flat_batch = [] - sample_specs = [] - - offset = 0 - batch_idcs = [] - batch_data = [] - for sample_idx, sample in enumerate(batch): - flat_sample, sample_spec = tree_flatten(sample) - flat_batch.extend(flat_sample) - sample_specs.append(sample_spec) - - sample_types_or_checks = types_or_checks.copy() - sample_idcs = {} - sample_data = {} - for flat_idx, item in enumerate(flat_sample, offset): - if not sample_types_or_checks: - break - - for key, types_or_checks_ in sample_types_or_checks.items(): - if check_type(item, types_or_checks_): - break - else: - continue - - del sample_types_or_checks[key] - sample_idcs[key] = flat_idx - sample_data[key] = item - - if sample_types_or_checks: - # TODO: improve message - raise TypeError(f"Sample at index {sample_idx} in the batch is missing {sample_types_or_checks.keys()}`") - - batch_idcs.append(sample_idcs) - batch_data.append(sample_data) - offset += len(flat_sample) - - batch_spec = TreeSpec(list, context=None, children_specs=sample_specs) - - return (flat_batch, batch_spec, batch_idcs), batch_data - - -def unflatten_and_insert_data( - flat_batch_with_spec: Tuple[List[Any], TreeSpec, List[Dict[str, int]]], - batch: List[Dict[str, Any]], -) -> Any: - flat_batch, batch_spec, batch_idcs = flat_batch_with_spec - - for sample_idx, sample_idcs in enumerate(batch_idcs): - for key, flat_idx in sample_idcs.items(): - inpt = flat_batch[flat_idx] - item = batch[sample_idx][key] - - if not is_simple_tensor(inpt) and is_simple_tensor(item): - if isinstance(inpt, datapoints._datapoint.Datapoint): - item = type(inpt).wrap_like(inpt, item) - elif isinstance(inpt, PIL.Image.Image): - item = F.to_image_pil(item) - - flat_batch[flat_idx] = item - - return tree_unflatten(flat_batch, batch_spec) - - class SimpleCopyPaste(_RandomApplyTransform): def __init__( self, @@ -395,7 +328,7 @@ def forward(self, *inputs: Any) -> Any: return unflatten_and_insert_data(flat_batch_with_spec, output_images, output_targets) -class MixupDetection(Transform): +class 
MixupDetection(_DetectionBatchTransform): _transformed_types = (is_simple_tensor, datapoints.Image, PIL.Image) def __init__( @@ -414,7 +347,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: return dict(ratio=float(self._dist.sample())) def forward(self, *inputs: Any) -> Any: - flat_batch_with_spec, batch = flatten_and_extract_data( + flat_batch_with_spec, batch = self._flatten_and_extract_data( inputs, image=(datapoints.Image, PIL.Image.Image, is_simple_tensor), boxes=(datapoints.BoundingBox,), @@ -422,20 +355,14 @@ def forward(self, *inputs: Any) -> Any: ) self._check_inputs(flat_batch_with_spec[0]) - for sample in batch: - image = sample.pop("image") - if isinstance(image, PIL.Image.Image): - image = F.pil_to_tensor(image) - elif isinstance(image, datapoints.Image): - image = image.as_subclass(torch.Tensor) - sample["image"] = image + batch = self._to_image_tensor(batch) batch_output = [ self._mixup(sample, sample_rolled, self._get_params([])["ratio"]) for sample, sample_rolled in zip(batch, batch[-1:] + batch[:-1]) ] - return unflatten_and_insert_data(flat_batch_with_spec, batch_output) + return self._unflatten_and_insert_data(flat_batch_with_spec, batch_output) def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], ratio: float) -> Dict[str, Any]: if ratio >= 1.0: diff --git a/torchvision/prototype/transforms/_transform.py b/torchvision/prototype/transforms/_transform.py index 43224cabd38..e16adb405a1 100644 --- a/torchvision/prototype/transforms/_transform.py +++ b/torchvision/prototype/transforms/_transform.py @@ -2,10 +2,13 @@ from typing import Any, Callable, Dict, List, Tuple, Type, Union import PIL.Image + import torch from torch import nn -from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision.prototype.transforms.utils import check_type +from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec +from torchvision.prototype import datapoints +from torchvision.prototype.transforms import functional as F +from torchvision.prototype.transforms.utils import check_type, is_simple_tensor from torchvision.utils import _log_api_usage_once @@ -83,3 +86,85 @@ def forward(self, *inputs: Any) -> Any: ] return tree_unflatten(flat_outputs, spec) + + +class _DetectionBatchTransform(Transform): + @staticmethod + def _flatten_and_extract_data( + inputs: Any, **types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...] 
+ ) -> Tuple[Tuple[List[Any], TreeSpec, List[Dict[str, int]]], List[Dict[str, Any]]]: + batch = inputs if len(inputs) > 1 else inputs[0] + flat_batch = [] + sample_specs = [] + + offset = 0 + batch_idcs = [] + batch_data = [] + for sample_idx, sample in enumerate(batch): + flat_sample, sample_spec = tree_flatten(sample) + flat_batch.extend(flat_sample) + sample_specs.append(sample_spec) + + sample_types_or_checks = types_or_checks.copy() + sample_idcs = {} + sample_data = {} + for flat_idx, item in enumerate(flat_sample, offset): + if not sample_types_or_checks: + break + + for key, types_or_checks_ in sample_types_or_checks.items(): + if check_type(item, types_or_checks_): + break + else: + continue + + del sample_types_or_checks[key] + sample_idcs[key] = flat_idx + sample_data[key] = item + + if sample_types_or_checks: + # TODO: improve message + raise TypeError( + f"Sample at index {sample_idx} in the batch is missing {sample_types_or_checks.keys()}`" + ) + + batch_idcs.append(sample_idcs) + batch_data.append(sample_data) + offset += len(flat_sample) + + batch_spec = TreeSpec(list, context=None, children_specs=sample_specs) + + return (flat_batch, batch_spec, batch_idcs), batch_data + + @staticmethod + def _to_image_tensor(batch: List[Dict[str, Any]], *, key: str = "image") -> List[Dict[str, Any]]: + for sample in batch: + image = sample.pop(key) + if isinstance(image, PIL.Image.Image): + image = F.pil_to_tensor(image) + elif isinstance(image, datapoints.Image): + image = image.as_subclass(torch.Tensor) + sample[key] = image + return batch + + @staticmethod + def _unflatten_and_insert_data( + flat_batch_with_spec: Tuple[List[Any], TreeSpec, List[Dict[str, int]]], + batch: List[Dict[str, Any]], + ) -> Any: + flat_batch, batch_spec, batch_idcs = flat_batch_with_spec + + for sample_idx, sample_idcs in enumerate(batch_idcs): + for key, flat_idx in sample_idcs.items(): + inpt = flat_batch[flat_idx] + item = batch[sample_idx][key] + + if not is_simple_tensor(inpt) and is_simple_tensor(item): + if isinstance(inpt, datapoints._datapoint.Datapoint): + item = type(inpt).wrap_like(inpt, item) + elif isinstance(inpt, PIL.Image.Image): + item = F.to_image_pil(item) + + flat_batch[flat_idx] = item + + return tree_unflatten(flat_batch, batch_spec) From a6b9ae0d07a8c8bfb9d5c7d4490373b6b297b5e6 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 19 Dec 2022 13:45:14 +0100 Subject: [PATCH 23/32] add shortcut for ratio==0 --- torchvision/prototype/transforms/_augment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index b2ce39b6ae9..3d19d5b9f1c 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -366,6 +366,8 @@ def forward(self, *inputs: Any) -> Any: def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], ratio: float) -> Dict[str, Any]: if ratio >= 1.0: return sample_1 + elif ratio == 0: + return sample_2 h_1, w_1 = sample_1["image"].shape[-2:] h_2, w_2 = sample_2["image"].shape[-2:] From fce49b8ea5a6900f0bfcb3e2d95e26362482e29c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 19 Dec 2022 13:54:56 +0100 Subject: [PATCH 24/32] fix dtype --- torchvision/prototype/transforms/_augment.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 3d19d5b9f1c..69aa605af05 100644 --- 
a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -366,7 +366,7 @@ def forward(self, *inputs: Any) -> Any: def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], ratio: float) -> Dict[str, Any]: if ratio >= 1.0: return sample_1 - elif ratio == 0: + elif ratio == 0.0: return sample_2 h_1, w_1 = sample_1["image"].shape[-2:] @@ -375,9 +375,10 @@ def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], ratio: floa w_mixup = max(w_1, w_2) # TODO: add the option to fill this with something else than 0 - mix_image = F.pad_image_tensor(sample_1["image"], padding=[0, 0, w_mixup - w_1, h_mixup - h_1], fill=None).mul_( - ratio - ) + dtype = sample_1["image"].dtype if sample_1["image"].is_floating_point() else torch.float32 + mix_image = F.pad_image_tensor( + sample_1["image"].to(dtype), padding=[0, 0, w_mixup - w_1, h_mixup - h_1], fill=None + ).mul_(ratio) mix_image[..., :h_2, :w_2] = sample_2["image"] * (1.0 - ratio) mix_image = mix_image.to(sample_1["image"]) From d99547135aea8d003d20b9858c095b9601b1f7bb Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sat, 21 Jan 2023 11:40:14 +0100 Subject: [PATCH 25/32] Apply suggestions from code review Co-authored-by: Philip Meier --- torchvision/prototype/transforms/_augment.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 69aa605af05..611264cd40a 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -328,8 +328,6 @@ def forward(self, *inputs: Any) -> Any: class MixupDetection(_DetectionBatchTransform): - _transformed_types = (is_simple_tensor, datapoints.Image, PIL.Image) - def __init__( self, *, From cbf09c2b2d535ef2155face18648911720c42497 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sat, 21 Jan 2023 19:37:56 +0100 Subject: [PATCH 26/32] Undo removing test_extract_image_target of TestSimpleCopyPaste --- test/test_prototype_transforms.py | 57 +++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 9d4586493ec..1528742f1e6 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1437,6 +1437,63 @@ def create_fake_image(self, mocker, image_type): return PIL.Image.new("RGB", (32, 32), 123) return mocker.MagicMock(spec=image_type) + def test__extract_image_targets_assertion(self, mocker): + transform = transforms.SimpleCopyPaste() + + flat_sample = [ + # images, batch size = 2 + self.create_fake_image(mocker, datapoints.Image), + # labels, bboxes, masks + mocker.MagicMock(spec=datapoints.Label), + mocker.MagicMock(spec=datapoints.BoundingBox), + mocker.MagicMock(spec=datapoints.Mask), + # labels, bboxes, masks + mocker.MagicMock(spec=datapoints.BoundingBox), + mocker.MagicMock(spec=datapoints.Mask), + ] + + with pytest.raises(TypeError, match="requires input sample to contain equal sized list of Images"): + transform._extract_image_targets(flat_sample) + + @pytest.mark.parametrize("image_type", [datapoints.Image, PIL.Image.Image, torch.Tensor]) + @pytest.mark.parametrize("label_type", [datapoints.Label, datapoints.OneHotLabel]) + def test__extract_image_targets(self, image_type, label_type, mocker): + transform = transforms.SimpleCopyPaste() + + flat_sample = [ + # images, batch size = 2 + self.create_fake_image(mocker, image_type), + self.create_fake_image(mocker, image_type), + # labels, 
bboxes, masks + mocker.MagicMock(spec=label_type), + mocker.MagicMock(spec=datapoints.BoundingBox), + mocker.MagicMock(spec=datapoints.Mask), + # labels, bboxes, masks + mocker.MagicMock(spec=label_type), + mocker.MagicMock(spec=datapoints.BoundingBox), + mocker.MagicMock(spec=datapoints.Mask), + ] + + images, targets = transform._extract_image_targets(flat_sample) + + assert len(images) == len(targets) == 2 + if image_type == PIL.Image.Image: + torch.testing.assert_close(images[0], pil_to_tensor(flat_sample[0])) + torch.testing.assert_close(images[1], pil_to_tensor(flat_sample[1])) + else: + assert images[0] == flat_sample[0] + assert images[1] == flat_sample[1] + + for target in targets: + for key, type_ in [ + ("boxes", datapoints.BoundingBox), + ("masks", datapoints.Mask), + ("labels", label_type), + ]: + assert key in target + assert isinstance(target[key], type_) + assert target[key] in flat_sample + @pytest.mark.parametrize("label_type", [datapoints.Label, datapoints.OneHotLabel]) def test__copy_paste(self, label_type): image = 2 * torch.ones(3, 32, 32) From 685d0423270217896a4f66ce43b12ba5f105f22b Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sun, 22 Jan 2023 13:49:21 +0100 Subject: [PATCH 27/32] ADD: Test cases when mixup ratio is 0, 0.5, 1 --- test/test_prototype_transforms.py | 47 ++++++++++++++++++------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 1528742f1e6..85df758bd64 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -27,7 +27,7 @@ from torchvision.prototype import datapoints, transforms from torchvision.prototype.transforms.utils import check_type -from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image +from torchvision.transforms.functional import get_image_size, InterpolationMode, pil_to_tensor, to_pil_image BATCH_EXTRA_DIMS = [extra_dims for extra_dims in DEFAULT_EXTRA_DIMS if extra_dims] @@ -1928,46 +1928,53 @@ def test__transform(self, inpt): class TestMixupDetection: - def create_fake_image(self, mocker, image_type): + def create_fake_image(self, mocker, image_type, *, size=(32, 32), color=123): if image_type == PIL.Image.Image: - return PIL.Image.new("RGB", (32, 32), 123) + return PIL.Image.new("RGB", size, color) return mocker.MagicMock(spec=image_type) - def test__mixup(self): - image1 = 2 * torch.ones(3, 32, 64) + @pytest.mark.parametrize("ratio", [0.0, 1.0]) + def test__mixup(self, mocker, ratio): + image_1 = self.create_fake_image(mocker, PIL.Image.Image, size=(128, 128), color=(124, 124, 124)) target_1 = { "boxes": datapoints.BoundingBox( torch.tensor([[0.0, 0.0, 10.0, 10.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", - spatial_size=(32, 64), + spatial_size=get_image_size(image_1), ), "labels": datapoints.Label(torch.tensor([1, 2])), } + sample_1 = { + "image": image_1, + "boxes": target_1["boxes"], + "labels": target_1["labels"], + } - image2 = 10 * torch.ones(3, 64, 32) + image_2 = self.create_fake_image(mocker, PIL.Image.Image, size=(128, 128), color=(0, 0, 0)) target_2 = { "boxes": datapoints.BoundingBox( torch.tensor([[10.0, 0.0, 20.0, 20.0], [10.0, 20.0, 30.0, 30.0]]), format="XYXY", - spatial_size=(64, 32), + spatial_size=get_image_size(image_2), ), "labels": datapoints.Label(torch.tensor([2, 3])), } - - sample_1 = { - "image": image1, - "boxes": target_1["boxes"], - "labels": target_1["labels"], - } sample_2 = { - "image": image2, + "image": image_2, "boxes": 
target_2["boxes"], "labels": target_2["labels"], } transform = transforms.MixupDetection() - output = transform._mixup(sample_1, sample_2, 0.5) - assert output["image"].shape == (3, 64, 64) - assert output["boxes"].spatial_size == (64, 64) - assert len(output["boxes"]) == 4 - assert len(output["labels"]) == 4 + output = transform._mixup(sample_1, sample_2, ratio) + + if ratio == 0: + assert output == sample_1 + + elif ratio == 1: + assert output == sample_2 + + elif ratio == 0.5: + assert output["image"] == np.fromarray((np.asarray(image_1) + np.asarray(image_2)) / 2).astype(np.uint8) + assert output["boxes"] == torch.cat([target_1["boxes"], target_2["boxes"]]) + assert output["labels"] == torch.cat([target_1["labels"], target_2["labels"]]) From 33192155f6a497ffc8a402020bf7c770dc22207f Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Sun, 22 Jan 2023 14:50:47 +0100 Subject: [PATCH 28/32] Fix: was doing wrong asserts. Corrected it --- test/test_prototype_transforms.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 85df758bd64..364b26fa995 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1933,9 +1933,10 @@ def create_fake_image(self, mocker, image_type, *, size=(32, 32), color=123): return PIL.Image.new("RGB", size, color) return mocker.MagicMock(spec=image_type) - @pytest.mark.parametrize("ratio", [0.0, 1.0]) + @pytest.mark.parametrize("ratio", [0.0, 0.5, 1.0]) def test__mixup(self, mocker, ratio): image_1 = self.create_fake_image(mocker, PIL.Image.Image, size=(128, 128), color=(124, 124, 124)) + image_1 = pil_to_tensor(image_1) target_1 = { "boxes": datapoints.BoundingBox( torch.tensor([[0.0, 0.0, 10.0, 10.0], [20.0, 20.0, 30.0, 30.0]]), @@ -1951,6 +1952,7 @@ def test__mixup(self, mocker, ratio): } image_2 = self.create_fake_image(mocker, PIL.Image.Image, size=(128, 128), color=(0, 0, 0)) + image_2 = pil_to_tensor(image_2) target_2 = { "boxes": datapoints.BoundingBox( torch.tensor([[10.0, 0.0, 20.0, 20.0], [10.0, 20.0, 30.0, 30.0]]), @@ -1969,12 +1971,13 @@ def test__mixup(self, mocker, ratio): output = transform._mixup(sample_1, sample_2, ratio) if ratio == 0: - assert output == sample_1 + assert output == sample_2 elif ratio == 1: - assert output == sample_2 + assert output == sample_1 elif ratio == 0.5: - assert output["image"] == np.fromarray((np.asarray(image_1) + np.asarray(image_2)) / 2).astype(np.uint8) + # TODO: Fix this test + assert output["image"] == (np.asarray(image_1) + np.asarray(image_2)) / 2 assert output["boxes"] == torch.cat([target_1["boxes"], target_2["boxes"]]) assert output["labels"] == torch.cat([target_1["labels"], target_2["labels"]]) From 02214b62b20c0aea12312bd90af4116e5ac54941 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 23 Jan 2023 12:16:24 +0100 Subject: [PATCH 29/32] fix mixing --- torchvision/prototype/transforms/_augment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 611264cd40a..37b366b7537 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -377,7 +377,7 @@ def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], ratio: floa mix_image = F.pad_image_tensor( sample_1["image"].to(dtype), padding=[0, 0, w_mixup - w_1, h_mixup - h_1], fill=None ).mul_(ratio) - mix_image[..., :h_2, :w_2] = sample_2["image"] * (1.0 
- ratio) + mix_image[..., :h_2, :w_2] += sample_2["image"] * (1.0 - ratio) mix_image = mix_image.to(sample_1["image"]) mix_boxes = datapoints.BoundingBox.wrap_like( From 4486e782bec08a61ce48218fade2bb0eb7114710 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 23 Jan 2023 12:34:39 +0100 Subject: [PATCH 30/32] pass flat_inputs to get_params --- torchvision/prototype/transforms/_augment.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 37b366b7537..21d8a8c6f1e 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -350,12 +350,14 @@ def forward(self, *inputs: Any) -> Any: boxes=(datapoints.BoundingBox,), labels=(datapoints.Label, datapoints.OneHotLabel), ) - self._check_inputs(flat_batch_with_spec[0]) + + flat_inputs = flat_batch_with_spec[0] + self._check_inputs(flat_inputs) batch = self._to_image_tensor(batch) batch_output = [ - self._mixup(sample, sample_rolled, self._get_params([])["ratio"]) + self._mixup(sample, sample_rolled, self._get_params(flat_inputs)["ratio"]) for sample, sample_rolled in zip(batch, batch[-1:] + batch[:-1]) ] From 1b6dbe1ce453394f931188860e42aa0206491f86 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Mon, 23 Jan 2023 13:08:20 +0100 Subject: [PATCH 31/32] Update torchvision/prototype/transforms/_augment.py Co-authored-by: Philip Meier --- torchvision/prototype/transforms/_augment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 611264cd40a..37b366b7537 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -377,7 +377,7 @@ def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], ratio: floa mix_image = F.pad_image_tensor( sample_1["image"].to(dtype), padding=[0, 0, w_mixup - w_1, h_mixup - h_1], fill=None ).mul_(ratio) - mix_image[..., :h_2, :w_2] = sample_2["image"] * (1.0 - ratio) + mix_image[..., :h_2, :w_2] += sample_2["image"] * (1.0 - ratio) mix_image = mix_image.to(sample_1["image"]) mix_boxes = datapoints.BoundingBox.wrap_like( From 8a912ba7d2a1c2e5334afd2c92ca9193f53c06b0 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 23 Jan 2023 16:58:37 +0100 Subject: [PATCH 32/32] refactor SimpleCopyPaste --- torchvision/prototype/transforms/_augment.py | 232 +++++++++---------- 1 file changed, 114 insertions(+), 118 deletions(-) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 21d8a8c6f1e..0ff48c0c3b1 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -1,19 +1,22 @@ import math import numbers import warnings -from typing import Any, cast, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union import PIL.Image import torch from torchvision.ops import masks_to_boxes from torchvision.prototype import datapoints -from torchvision.prototype.transforms import functional as F, InterpolationMode, Transform +from torchvision.prototype.transforms import functional as F, InterpolationMode from ._transform import _DetectionBatchTransform, _RandomApplyTransform from .utils import has_any, is_simple_tensor, query_chw, query_spatial_size +D = TypeVar("D", bound=datapoints._datapoint.Datapoint) + + class RandomErasing(_RandomApplyTransform): 
_transformed_types = (is_simple_tensor, datapoints.Image, PIL.Image.Image, datapoints.Video) @@ -190,7 +193,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: return inpt -class SimpleCopyPaste(Transform): +class SimpleCopyPaste(_DetectionBatchTransform): def __init__( self, blending: bool = True, @@ -202,129 +205,127 @@ def __init__( self.blending = blending self.antialias = antialias - def _copy_paste( - self, - image: datapoints.TensorImageType, - target: Dict[str, Any], - paste_image: datapoints.TensorImageType, - paste_target: Dict[str, Any], - random_selection: torch.Tensor, - blending: bool, - resize_interpolation: F.InterpolationMode, - antialias: Optional[bool], - ) -> Tuple[datapoints.TensorImageType, Dict[str, Any]]: - paste_masks = paste_target["masks"].wrap_like(paste_target["masks"], paste_target["masks"][random_selection]) - paste_boxes = paste_target["boxes"].wrap_like(paste_target["boxes"], paste_target["boxes"][random_selection]) - paste_labels = paste_target["labels"].wrap_like( - paste_target["labels"], paste_target["labels"][random_selection] + def forward(self, *inputs: Any) -> Any: + flat_batch_with_spec, batch = self._flatten_and_extract_data( + inputs, + image=(datapoints.Image, PIL.Image.Image, is_simple_tensor), + boxes=(datapoints.BoundingBox,), + masks=(datapoints.Mask,), + labels=(datapoints.Label, datapoints.OneHotLabel), ) + batch = self._to_image_tensor(batch) - masks = target["masks"] - - # We resize source and paste data if they have different sizes - # This is something different to TF implementation we introduced here as - # originally the algorithm works on equal-sized data - # (for example, coming from LSJ data augmentations) - size1 = cast(List[int], image.shape[-2:]) - size2 = paste_image.shape[-2:] - if size1 != size2: - paste_image = F.resize(paste_image, size=size1, interpolation=resize_interpolation, antialias=antialias) - paste_masks = F.resize(paste_masks, size=size1) - paste_boxes = F.resize(paste_boxes, size=size1) - - paste_alpha_mask = paste_masks.sum(dim=0) > 0 - - if blending: - paste_alpha_mask = F.gaussian_blur(paste_alpha_mask.unsqueeze(0), kernel_size=[5, 5], sigma=[2.0]) + batch_output = [] + for sample, sample_rolled in zip(batch, batch[-1:] + batch[:-1]): + num_masks = len(sample_rolled["masks"]) + if num_masks < 1: + # This might for example happen with the LSJ augmentation strategy + batch_output.append(sample) + continue - inverse_paste_alpha_mask = paste_alpha_mask.logical_not() - # Copy-paste images: - out_image = image.mul(inverse_paste_alpha_mask).add_(paste_image.mul(paste_alpha_mask)) + random_selection = torch.randint(0, num_masks, (num_masks,), device=sample_rolled["masks"].device) + random_selection = torch.unique(random_selection) - # Copy-paste masks: - masks = masks * inverse_paste_alpha_mask - non_all_zero_masks = masks.sum((-1, -2)) > 0 - masks = masks[non_all_zero_masks] + batch_output.append( + self._simple_copy_paste( + sample, + sample_rolled, + random_selection=random_selection, + blending=self.blending, + resize_interpolation=self.resize_interpolation, + antialias=self.antialias, + ) + ) - # Do a shallow copy of the target dict - out_target = {k: v for k, v in target.items()} + return self._unflatten_and_insert_data(flat_batch_with_spec, batch_output) - out_target["masks"] = torch.cat([masks, paste_masks]) + @staticmethod + def _wrapping_getitem(datapoint: D, index: Any) -> D: + return type(datapoint).wrap_like(datapoint, datapoint[index]) - # Copy-paste boxes and labels - bbox_format = 
target["boxes"].format - xyxy_boxes = masks_to_boxes(masks) - # masks_to_boxes produces bboxes with x2y2 inclusive but x2y2 should be exclusive - # we need to add +1 to x2y2. - # There is a similar +1 in other reference implementations: + def _simple_copy_paste( + self, + sample_1: Dict[str, Any], + sample_2: Dict[str, Any], + *, + random_selection: torch.Tensor, + blending: bool, + resize_interpolation: F.InterpolationMode, + antialias: Optional[bool], + ) -> Dict[str, Any]: + dst_image = sample_1["image"] + dst_masks = sample_1["masks"] + dst_labels = sample_1["labels"] + + src_image = sample_2["image"] + src_masks = self._wrapping_getitem(sample_2["masks"], random_selection) + src_boxes = self._wrapping_getitem(sample_2["dst_boxes"], random_selection) + src_labels = self._wrapping_getitem(sample_2["labels"], random_selection) + + # In case the `dst_image` and `src_image` have different spatial sizes, we resize `src_image` and the + # corresponding annotations to `dst_image`'s spatial size. This differs from the official implementation, since + # that only works with equally sized data, e.g. coming from the LSJ augmentation strategy. + dst_spatial_size = dst_image.shape[-2:] + src_spatial_size = src_image.shape[-2:] + if dst_spatial_size != src_spatial_size: + src_image = F.resize( + src_image, size=dst_spatial_size, interpolation=resize_interpolation, antialias=antialias + ) + src_masks = F.resize(src_masks, size=dst_spatial_size) + src_boxes = F.resize(src_boxes, size=dst_spatial_size) + + src_paste_mask = src_masks.sum(dim=0, keepdim=0) > 0 + # Although the parameter is called "blending", we don't actually blend here. `src_paste_mask` is a boolean + # mask and although `F.gaussian_blur` internally converts to floating point, it will be converted back to + # boolean on the way out. Meaning, although we blur, `src_paste_mask` will have no values other than 0 or 1. + # The original paper doesn't specify how blending should be done and the official implementation is not helpful + # either: + # https://github.com/tensorflow/tpu/blob/732902a457b2a8924f885ee832830e1bf6d7c537/models/official/detection/dataloader/maskrcnn_parser_with_copy_paste.py#L331-L334 + if blending: + src_paste_mask = F.gaussian_blur(src_paste_mask, kernel_size=[5, 5], sigma=[2.0]) + dst_paste_mask = src_paste_mask.logical_not() + + image = datapoints.Image.wrap_like(dst_image, dst_image.mul(dst_paste_mask).add_(src_image.mul(src_paste_mask))) + + dst_masks = dst_masks * dst_paste_mask + # Since we paste the `src_image` into the `dst_image`, we might completely cover an object previously visible in + # `dst_image`. Furthermore, with `blending=True` small regions to begin with might also be shrunk enough to + # vanish. Thus, we check for degenerate masks and remove them. + valid_dst_masks = dst_masks.sum((-1, -2)) > 0 + dst_masks = dst_masks[valid_dst_masks] + masks = datapoints.Mask.wrap_like(dst_masks, torch.cat([dst_masks, src_masks])) + + # Since the `dst_masks` might have changed above, we recompute the corresponding `dst_boxes`. + dst_boxes_xyxy = masks_to_boxes(dst_masks) + # `masks_to_boxes` produces boxes with x2y2 inclusive, but x2y2 should be exclusive. Thus, we increase by one. 
+ # There is a similar behavior in other reference implementations: # https://github.com/pytorch/vision/blob/b6feccbc4387766b76a3e22b13815dbbbfa87c0f/torchvision/models/detection/roi_heads.py#L418-L422 - xyxy_boxes[:, 2:] += 1 - boxes = F.convert_format_bounding_box( - xyxy_boxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=bbox_format, inplace=True + dst_boxes_xyxy[:, 2:] += 1 + dst_boxes = F.convert_format_bounding_box( + dst_boxes_xyxy, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=src_boxes.format, inplace=True ) - out_target["boxes"] = torch.cat([boxes, paste_boxes]) + dst_boxes = datapoints.BoundingBox(dst_boxes, format=src_boxes.format, spatial_size=dst_spatial_size) + boxes = datapoints.BoundingBox.wrap_like(dst_boxes, torch.cat([dst_boxes, src_boxes])) - labels = target["labels"][non_all_zero_masks] - out_target["labels"] = torch.cat([labels, paste_labels]) + labels = datapoints.Label.wrap_like(dst_labels, torch.cat([dst_labels[valid_dst_masks], src_labels])) # Check for degenerated boxes and remove them - boxes = F.convert_format_bounding_box( - out_target["boxes"], old_format=bbox_format, new_format=datapoints.BoundingBoxFormat.XYXY + # FIXME: This can only happen for the `src_boxes`, right? Since `dst_boxes` were re-computed from `dst_masks` + # above, they should all be valid. If so, degenerate boxes at this stage should only come from the resizing of + # `src_boxes` above. Maybe we can remove already at that stage? + # TODO: Maybe unify this with `transforms.RemoveSmallBoundingBoxes()`? + boxes_xyxy = F.convert_format_bounding_box( + boxes, old_format=boxes.format, new_format=datapoints.BoundingBoxFormat.XYXY ) - degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] + degenerate_boxes = boxes_xyxy[:, 2:].le(boxes_xyxy[:, :2]) if degenerate_boxes.any(): - valid_targets = ~degenerate_boxes.any(dim=1) - - out_target["boxes"] = boxes[valid_targets] - out_target["masks"] = out_target["masks"][valid_targets] - out_target["labels"] = out_target["labels"][valid_targets] - - return out_image, out_target - - def forward(self, *inputs: Any) -> Any: - flat_batch_with_spec, images, targets = flatten_and_extract_data( - inputs, - boxes=(datapoints.BoundingBox,), - masks=(datapoints.Mask,), - labels=(datapoints.Label, datapoints.OneHotLabel), - ) + valid_boxes = ~degenerate_boxes.any(dim=-1) - # images = [t1, t2, ..., tN] - # Let's define paste_images as shifted list of input images - # paste_images = [t2, t3, ..., tN, t1] - # FYI: in TF they mix data on the dataset level - images_rolled = images[-1:] + images[:-1] - targets_rolled = targets[-1:] + targets[:-1] + masks = self._wrapping_getitem(masks, valid_boxes) + boxes = self._wrapping_getitem(boxes, valid_boxes) + labels = self._wrapping_getitem(labels, valid_boxes) - output_images, output_targets = [], [] - - for image, target, paste_image, paste_target in zip(images, targets, images_rolled, targets_rolled): - - # Random paste targets selection: - num_masks = len(paste_target["masks"]) - - if num_masks < 1: - # Such degerante case with num_masks=0 can happen with LSJ - # Let's just return (image, target) - output_image, output_target = image, target - else: - random_selection = torch.randint(0, num_masks, (num_masks,), device=paste_image.device) - random_selection = torch.unique(random_selection) - - output_image, output_target = self._copy_paste( - image, - target, - paste_image, - paste_target, - random_selection=random_selection, - blending=self.blending, - 
resize_interpolation=self.resize_interpolation, - antialias=self.antialias, - ) - output_images.append(output_image) - output_targets.append(output_target) - - return unflatten_and_insert_data(flat_batch_with_spec, output_images, output_targets) + return dict(image=image, masks=masks, boxes=boxes, labels=labels) class MixupDetection(_DetectionBatchTransform): @@ -340,9 +341,6 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: if has_any(flat_inputs, datapoints.Mask, datapoints.Video): raise TypeError(f"{type(self).__name__}() is only supported for images and bounding boxes.") - def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - return dict(ratio=float(self._dist.sample())) - def forward(self, *inputs: Any) -> Any: flat_batch_with_spec, batch = self._flatten_and_extract_data( inputs, @@ -350,20 +348,18 @@ def forward(self, *inputs: Any) -> Any: boxes=(datapoints.BoundingBox,), labels=(datapoints.Label, datapoints.OneHotLabel), ) - - flat_inputs = flat_batch_with_spec[0] - self._check_inputs(flat_inputs) + self._check_inputs(flat_batch_with_spec[0]) batch = self._to_image_tensor(batch) batch_output = [ - self._mixup(sample, sample_rolled, self._get_params(flat_inputs)["ratio"]) + self._mixup(sample, sample_rolled, ratio=float(self._dist.sample())) for sample, sample_rolled in zip(batch, batch[-1:] + batch[:-1]) ] return self._unflatten_and_insert_data(flat_batch_with_spec, batch_output) - def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], ratio: float) -> Dict[str, Any]: + def _mixup(self, sample_1: Dict[str, Any], sample_2: Dict[str, Any], *, ratio: float) -> Dict[str, Any]: if ratio >= 1.0: return sample_1 elif ratio == 0.0: