
Commit ae83c9f

[PoC] move metadata computation from prototype features into kernels (#6646)

* move metadata computation from prototype features into kernels
* fix tests
* fix no_inplace test
* mypy
* add perf TODO

1 parent 2907c49 · commit ae83c9f
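The gist of the PoC, before the diffs: geometry kernels that change the spatial extent (crop, pad, resize, center/resized crop, rotate with expand) now return the recomputed image size as metadata next to the transformed boxes, so the BoundingBox feature no longer duplicates that computation. A minimal sketch of the new contract with a hypothetical pad-style kernel (names, padding order, and XYXY layout are assumptions of this sketch; the real signatures are in the diffs below):

    from typing import List, Tuple

    import torch

    def pad_bounding_box_sketch(
        bounding_box: torch.Tensor,
        image_size: Tuple[int, int],
        padding: List[int],  # [left, top, right, bottom]; ordering assumed for this sketch
    ) -> Tuple[torch.Tensor, Tuple[int, int]]:
        left, top, right, bottom = padding
        out = bounding_box.clone()
        out[..., 0::2] += left  # shift x coordinates (assumes XYXY layout)
        out[..., 1::2] += top   # shift y coordinates
        height, width = image_size
        # The kernel, not the feature, reports the new canvas size.
        return out, (height + top + bottom, width + left + right)

    boxes = torch.tensor([[10.0, 10.0, 20.0, 20.0]])
    out, new_size = pad_bounding_box_sketch(boxes, image_size=(32, 38), padding=[2, 3, 2, 3])
    print(new_size)  # (38, 42)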

4 files changed: +130 −105 lines

test/prototype_transforms_kernel_infos.py

Lines changed: 4 additions & 6 deletions
@@ -709,7 +709,7 @@ def sample_inputs_crop_bounding_box():
     for bounding_box_loader, params in itertools.product(
         make_bounding_box_loaders(), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]
     ):
-        yield ArgsKwargs(bounding_box_loader, format=bounding_box_loader.format, top=params["top"], left=params["left"])
+        yield ArgsKwargs(bounding_box_loader, format=bounding_box_loader.format, **params)


 def sample_inputs_crop_mask():
@@ -856,7 +856,9 @@ def sample_inputs_pad_bounding_box():
         if params["padding_mode"] != "constant":
             continue

-        yield ArgsKwargs(bounding_box_loader, format=bounding_box_loader.format, **params)
+        yield ArgsKwargs(
+            bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.image_size, **params
+        )


 def sample_inputs_pad_mask():
@@ -1552,8 +1554,6 @@ def reference_inputs_ten_crop_image_tensor():
         skips=[
             skip_integer_size_jit(),
             Skip("test_batched_vs_single", reason="Custom batching needed for five_crop_image_tensor."),
-            Skip("test_no_inplace", reason="Output of five_crop_image_tensor is not a tensor."),
-            Skip("test_dtype_and_device_consistency", reason="Output of five_crop_image_tensor is not a tensor."),
         ],
         closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS,
     ),
@@ -1565,8 +1565,6 @@ def reference_inputs_ten_crop_image_tensor():
         skips=[
             skip_integer_size_jit(),
             Skip("test_batched_vs_single", reason="Custom batching needed for ten_crop_image_tensor."),
-            Skip("test_no_inplace", reason="Output of ten_crop_image_tensor is not a tensor."),
-            Skip("test_dtype_and_device_consistency", reason="Output of ten_crop_image_tensor is not a tensor."),
         ],
         closeness_kwargs=DEFAULT_IMAGE_CLOSENESS_KWARGS,
     ),
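Context for the first hunk: crop_bounding_box used to consume only top and left, so the sample inputs discarded the rest of each params dict; since the kernel now also needs height and width to report the new image size, the whole dict is forwarded. A toy sketch of the resulting call shape (the stand-in kernel and the params values are hypothetical; the real signature appears in the test diff below):

    import torch

    def crop_bounding_box_stub(boxes, format, *, top, left, height, width):
        # Stand-in with the kernel's new keyword set: it can now return the
        # cropped canvas size alongside the (here untouched) boxes.
        return boxes, (height, width)

    params = dict(top=4, left=3, height=12, width=20)  # one hypothetical _CROP_PARAMS entry
    boxes = torch.tensor([[10.0, 10.0, 20.0, 20.0]])
    _, image_size = crop_bounding_box_stub(boxes, "XYXY", **params)
    print(image_size)  # (12, 20)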

test/test_prototype_transforms_functional.py

Lines changed: 60 additions & 31 deletions
@@ -68,17 +68,22 @@ def test_scripted_vs_eager(self, info, args_kwargs, device):

         assert_close(actual, expected, **info.closeness_kwargs)

-    def _unbind_batch_dims(self, batched_tensor, *, data_dims):
-        if batched_tensor.ndim == data_dims:
-            return batched_tensor
-
-        return [self._unbind_batch_dims(t, data_dims=data_dims) for t in batched_tensor.unbind(0)]
+    def _unbatch(self, batch, *, data_dims):
+        if isinstance(batch, torch.Tensor):
+            batched_tensor = batch
+            metadata = ()
+        else:
+            batched_tensor, *metadata = batch

-    def _stack_batch_dims(self, unbound_tensor):
-        if isinstance(unbound_tensor[0], torch.Tensor):
-            return torch.stack(unbound_tensor)
+        if batched_tensor.ndim == data_dims:
+            return batch

-        return torch.stack([self._stack_batch_dims(t) for t in unbound_tensor])
+        return [
+            self._unbatch(unbatched, data_dims=data_dims)
+            for unbatched in (
+                batched_tensor.unbind(0) if not metadata else [(t, *metadata) for t in batched_tensor.unbind(0)]
+            )
+        ]

     @sample_inputs
     @pytest.mark.parametrize("device", cpu_and_gpu())
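To see what the new _unbatch does with metadata-carrying outputs: the metadata is replicated onto every unbatched element, so a batched kernel output can be compared elementwise against single-sample outputs. A standalone copy of the helper, applied to a batch of two boxes plus an image size (values illustrative):

    import torch

    def unbatch(batch, *, data_dims):
        # Standalone copy of TestKernels._unbatch from the diff above.
        if isinstance(batch, torch.Tensor):
            batched_tensor, metadata = batch, ()
        else:
            batched_tensor, *metadata = batch
        if batched_tensor.ndim == data_dims:
            return batch
        return [
            unbatch(unbatched, data_dims=data_dims)
            for unbatched in (
                batched_tensor.unbind(0) if not metadata else [(t, *metadata) for t in batched_tensor.unbind(0)]
            )
        ]

    boxes = torch.tensor([[0.0, 0.0, 2.0, 2.0], [1.0, 1.0, 3.0, 3.0]])
    print(unbatch((boxes, (32, 38)), data_dims=1))
    # [(tensor([0., 0., 2., 2.]), (32, 38)), (tensor([1., 1., 3., 3.]), (32, 38))]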
@@ -106,11 +111,11 @@ def test_batched_vs_single(self, info, args_kwargs, device):
         elif not all(batched_input.shape[:-data_dims]):
             pytest.skip("Input has a degenerate batch shape.")

-        actual = info.kernel(batched_input, *other_args, **kwargs)
+        batched_output = info.kernel(batched_input, *other_args, **kwargs)
+        actual = self._unbatch(batched_output, data_dims=data_dims)

-        single_inputs = self._unbind_batch_dims(batched_input, data_dims=data_dims)
-        single_outputs = tree_map(lambda single_input: info.kernel(single_input, *other_args, **kwargs), single_inputs)
-        expected = self._stack_batch_dims(single_outputs)
+        single_inputs = self._unbatch(batched_input, data_dims=data_dims)
+        expected = tree_map(lambda single_input: info.kernel(single_input, *other_args, **kwargs), single_inputs)

         assert_close(actual, expected, **info.closeness_kwargs)

@@ -123,9 +128,9 @@ def test_no_inplace(self, info, args_kwargs, device):
             pytest.skip("The input has a degenerate shape.")

         input_version = input._version
-        output = info.kernel(input, *other_args, **kwargs)
+        info.kernel(input, *other_args, **kwargs)

-        assert output is not input or output._version == input_version
+        assert input._version == input_version

     @sample_inputs
     @needs_cuda
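The rewritten assertion leans on torch.Tensor._version, an internal counter that PyTorch bumps on every in-place mutation of a tensor; checking the input's version directly also keeps the test meaningful when a kernel returns a tuple instead of a tensor, which is why the five_crop/ten_crop skips above could be dropped. A quick illustration:

    import torch

    t = torch.zeros(3)
    version = t._version   # bumped only by in-place mutation
    t.add(1)               # out-of-place op: counter unchanged
    assert t._version == version
    t.add_(1)              # in-place op: counter bumped
    assert t._version == version + 1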
@@ -144,6 +149,9 @@ def test_dtype_and_device_consistency(self, info, args_kwargs, device):
         (input, *other_args), kwargs = args_kwargs.load(device)

         output = info.kernel(input, *other_args, **kwargs)
+        # Most kernels just return a tensor, but some also return some additional metadata
+        if not isinstance(output, torch.Tensor):
+            output, *_ = output

         assert output.dtype == input.dtype
         assert output.device == input.device
@@ -324,7 +332,7 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_):
         affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_)
         affine_matrix = affine_matrix[:2, :]

-        image_size = bbox.image_size
+        height, width = bbox.image_size
         bbox_xyxy = convert_format_bounding_box(
             bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY
         )
@@ -336,9 +344,9 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_):
                 [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0],
                 # image frame
                 [0.0, 0.0, 1.0],
-                [0.0, image_size[0], 1.0],
-                [image_size[1], image_size[0], 1.0],
-                [image_size[1], 0.0, 1.0],
+                [0.0, height, 1.0],
+                [width, height, 1.0],
+                [width, 0.0, 1.0],
             ]
         )
         transformed_points = np.matmul(points, affine_matrix.T)
@@ -356,18 +364,21 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_):
             out_bbox[2] -= tr_x
             out_bbox[3] -= tr_y

-        # image_size should be updated, but it is OK here to skip its computation
-        # as we do not compute it in F.rotate_bounding_box
+            height = int(height - 2 * tr_y)
+            width = int(width - 2 * tr_x)

         out_bbox = features.BoundingBox(
             out_bbox,
             format=features.BoundingBoxFormat.XYXY,
-            image_size=image_size,
+            image_size=(height, width),
             dtype=bbox.dtype,
             device=bbox.device,
         )
-        return convert_format_bounding_box(
-            out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False
+        return (
+            convert_format_bounding_box(
+                out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False
+            ),
+            (height, width),
         )

     image_size = (32, 38)
@@ -376,7 +387,7 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_):
         bboxes_format = bboxes.format
         bboxes_image_size = bboxes.image_size

-        output_bboxes = F.rotate_bounding_box(
+        output_bboxes, output_image_size = F.rotate_bounding_box(
             bboxes,
             bboxes_format,
             image_size=bboxes_image_size,
@@ -395,12 +406,14 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_):
         expected_bboxes = []
         for bbox in bboxes:
             bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size)
-            expected_bboxes.append(_compute_expected_bbox(bbox, -angle, expand, center_))
+            expected_bbox, expected_image_size = _compute_expected_bbox(bbox, -angle, expand, center_)
+            expected_bboxes.append(expected_bbox)
         if len(expected_bboxes) > 1:
             expected_bboxes = torch.stack(expected_bboxes)
         else:
             expected_bboxes = expected_bboxes[0]
         torch.testing.assert_close(output_bboxes, expected_bboxes, atol=1, rtol=0)
+        torch.testing.assert_close(output_image_size, expected_image_size, atol=1, rtol=0)


 @pytest.mark.parametrize("device", cpu_and_gpu())
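For intuition on the height - 2 * tr_y update above: with rotation about the image center, the translation of the min corner is symmetric, so subtracting it twice recovers the full extent of the rotated frame. A numpy sketch of the same quantity computed directly from the rotated corners (a reference computation under that center-rotation assumption, not torchvision's code):

    import numpy as np

    def expanded_canvas_size(height, width, angle_deg):
        # Rotate the four image-frame corners about the image center and
        # take the axis-aligned extent of the result.
        theta = np.deg2rad(angle_deg)
        rot = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
        corners = np.array([[0, 0], [width, 0], [width, height], [0, height]], dtype=np.float64)
        center = np.array([width / 2, height / 2])
        rotated = (corners - center) @ rot.T + center
        extent = rotated.max(axis=0) - rotated.min(axis=0)
        return int(extent[1]), int(extent[0])  # (new_height, new_width)

    print(expanded_canvas_size(32, 38, 45))  # ≈ (49, 49): both extents are (32 + 38) / sqrt(2)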
@@ -445,7 +458,7 @@ def test_correctness_rotate_bounding_box_on_fixed_input(device, expand):
         [18.36396103, 1.07968978, 46.64823228, 29.36396103],
     ]

-    output_boxes = F.rotate_bounding_box(
+    output_boxes, _ = F.rotate_bounding_box(
         in_boxes,
         in_boxes.format,
         in_boxes.image_size,
@@ -510,17 +523,20 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width,
     if format != features.BoundingBoxFormat.XYXY:
         in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format)

-    output_boxes = F.crop_bounding_box(
+    output_boxes, output_image_size = F.crop_bounding_box(
         in_boxes,
         format,
         top,
         left,
+        size[0],
+        size[1],
     )

     if format != features.BoundingBoxFormat.XYXY:
         output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY)

     torch.testing.assert_close(output_boxes.tolist(), expected_bboxes)
+    torch.testing.assert_close(output_image_size, size)


 @pytest.mark.parametrize("device", cpu_and_gpu())
@@ -585,12 +601,13 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_):
     if format != features.BoundingBoxFormat.XYXY:
         in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format)

-    output_boxes = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size)
+    output_boxes, output_image_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size)

     if format != features.BoundingBoxFormat.XYXY:
         output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY)

     torch.testing.assert_close(output_boxes, expected_bboxes)
+    torch.testing.assert_close(output_image_size, size)


 def _parse_padding(padding):
@@ -627,12 +644,21 @@ def _compute_expected_bbox(bbox, padding_):
         bbox = bbox.to(bbox_dtype)
         return bbox

+    def _compute_expected_image_size(bbox, padding_):
+        pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_)
+        height, width = bbox.image_size
+        return height + pad_up + pad_down, width + pad_left + pad_right
+
     for bboxes in make_bounding_boxes():
         bboxes = bboxes.to(device)
         bboxes_format = bboxes.format
         bboxes_image_size = bboxes.image_size

-        output_boxes = F.pad_bounding_box(bboxes, format=bboxes_format, padding=padding)
+        output_boxes, output_image_size = F.pad_bounding_box(
+            bboxes, format=bboxes_format, image_size=bboxes_image_size, padding=padding
+        )
+
+        torch.testing.assert_close(output_image_size, _compute_expected_image_size(bboxes, padding))

         if bboxes.ndim < 2 or bboxes.shape[0] == 0:
             bboxes = [bboxes]
@@ -781,7 +807,9 @@ def _compute_expected_bbox(bbox, output_size_):
         bboxes_format = bboxes.format
         bboxes_image_size = bboxes.image_size

-        output_boxes = F.center_crop_bounding_box(bboxes, bboxes_format, bboxes_image_size, output_size)
+        output_boxes, output_image_size = F.center_crop_bounding_box(
+            bboxes, bboxes_format, bboxes_image_size, output_size
+        )

         if bboxes.ndim < 2:
             bboxes = [bboxes]
@@ -796,6 +824,7 @@ def _compute_expected_bbox(bbox, output_size_):
         else:
             expected_bboxes = expected_bboxes[0]
         torch.testing.assert_close(output_boxes, expected_bboxes)
+        torch.testing.assert_close(output_image_size, output_size)


 @pytest.mark.parametrize("device", cpu_and_gpu())

torchvision/prototype/features/_bounding_box.py

Lines changed: 15 additions & 42 deletions
@@ -83,23 +83,19 @@ def resize(  # type: ignore[override]
         max_size: Optional[int] = None,
         antialias: bool = False,
     ) -> BoundingBox:
-        output = self._F.resize_bounding_box(self, image_size=self.image_size, size=size, max_size=max_size)
-        if isinstance(size, int):
-            size = [size]
-        image_size = (size[0], size[0]) if len(size) == 1 else (size[0], size[1])
-        return BoundingBox.new_like(self, output, image_size=image_size, dtype=output.dtype)
+        output, image_size = self._F.resize_bounding_box(self, image_size=self.image_size, size=size, max_size=max_size)
+        return BoundingBox.new_like(self, output, image_size=image_size)

     def crop(self, top: int, left: int, height: int, width: int) -> BoundingBox:
-        output = self._F.crop_bounding_box(self, self.format, top, left)
-        return BoundingBox.new_like(self, output, image_size=(height, width))
+        output, image_size = self._F.crop_bounding_box(
+            self, self.format, top=top, left=left, height=height, width=width
+        )
+        return BoundingBox.new_like(self, output, image_size=image_size)

     def center_crop(self, output_size: List[int]) -> BoundingBox:
-        output = self._F.center_crop_bounding_box(
+        output, image_size = self._F.center_crop_bounding_box(
             self, format=self.format, image_size=self.image_size, output_size=output_size
         )
-        if isinstance(output_size, int):
-            output_size = [output_size]
-        image_size = (output_size[0], output_size[0]) if len(output_size) == 1 else (output_size[0], output_size[1])
         return BoundingBox.new_like(self, output, image_size=image_size)

     def resized_crop(
@@ -112,29 +108,19 @@ def resized_crop(
         interpolation: InterpolationMode = InterpolationMode.BILINEAR,
         antialias: bool = False,
     ) -> BoundingBox:
-        output = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size)
-        image_size = (size[0], size[0]) if len(size) == 1 else (size[0], size[1])
-        return BoundingBox.new_like(self, output, image_size=image_size, dtype=output.dtype)
+        output, image_size = self._F.resized_crop_bounding_box(self, self.format, top, left, height, width, size=size)
+        return BoundingBox.new_like(self, output, image_size=image_size)

     def pad(
         self,
         padding: Union[int, Sequence[int]],
         fill: FillTypeJIT = None,
         padding_mode: str = "constant",
     ) -> BoundingBox:
-        # This cast does Sequence[int] -> List[int] and is required to make mypy happy
-        if not isinstance(padding, int):
-            padding = list(padding)
-
-        output = self._F.pad_bounding_box(self, format=self.format, padding=padding, padding_mode=padding_mode)
-
-        # Update output image size:
-        left, right, top, bottom = self._F._geometry._parse_pad_padding(padding)
-        height, width = self.image_size
-        height += top + bottom
-        width += left + right
-
-        return BoundingBox.new_like(self, output, image_size=(height, width))
+        output, image_size = self._F.pad_bounding_box(
+            self, format=self.format, image_size=self.image_size, padding=padding, padding_mode=padding_mode
+        )
+        return BoundingBox.new_like(self, output, image_size=image_size)

     def rotate(
         self,
@@ -144,23 +130,10 @@ def rotate(
         fill: FillTypeJIT = None,
         center: Optional[List[float]] = None,
     ) -> BoundingBox:
-        output = self._F.rotate_bounding_box(
+        output, image_size = self._F.rotate_bounding_box(
             self, format=self.format, image_size=self.image_size, angle=angle, expand=expand, center=center
         )
-        image_size = self.image_size
-        if expand:
-            # The way we recompute image_size is not optimal due to redundant computations of
-            # - rotation matrix (_get_inverse_affine_matrix)
-            # - points dot matrix (_compute_affine_output_size)
-            # Alternatively, we could return new image size by self._F.rotate_bounding_box
-            height, width = image_size
-            rotation_matrix = self._F._geometry._get_inverse_affine_matrix(
-                [0.0, 0.0], angle, [0.0, 0.0], 1.0, [0.0, 0.0]
-            )
-            new_width, new_height = self._F._geometry._FT._compute_affine_output_size(rotation_matrix, width, height)
-            image_size = (new_height, new_width)
-
-        return BoundingBox.new_like(self, output, dtype=output.dtype, image_size=image_size)
+        return BoundingBox.new_like(self, output, image_size=image_size)

     def affine(
         self,