Remaining BBox kernel perf optimizations #6896


Merged 8 commits on Nov 3, 2022
Changes from 2 commits
30 changes: 14 additions & 16 deletions torchvision/prototype/transforms/functional/_geometry.py
@@ -181,9 +181,11 @@ def resize_bounding_box(
) -> Tuple[torch.Tensor, Tuple[int, int]]:
    old_height, old_width = spatial_size
    new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size)
-    ratios = torch.tensor((new_width / old_width, new_height / old_height), device=bounding_box.device)
+    w_ratio = new_width / old_width
+    h_ratio = new_height / old_height
+    ratios = torch.tensor([w_ratio, h_ratio, w_ratio, h_ratio], device=bounding_box.device)
    return (
-        bounding_box.reshape(-1, 2, 2).mul(ratios).to(bounding_box.dtype).reshape(bounding_box.shape),
+        bounding_box.mul(ratios).to(bounding_box.dtype),
Comment on lines +184 to +188 (Contributor, PR author):
Improvement:

[------------ resize cpu torch.float32 ------------]
                |        old       |        new     
1 threads: -----------------------------------------
      (128, 4)  |   13 (+-  0) us  |    8 (+-  0) us
6 threads: -----------------------------------------
      (128, 4)  |   13 (+-  0) us  |    8 (+-  0) us

Times are in microseconds (us).

[----------- resize cuda torch.float32 ------------]
                |        old       |        new     
1 threads: -----------------------------------------
      (128, 4)  |   37 (+-  0) us  |   31 (+-  0) us
6 threads: -----------------------------------------
      (128, 4)  |   37 (+-  0) us  |   31 (+-  0) us

Times are in microseconds (us).

[------------- resize cpu torch.uint8 -------------]
                |        old       |        new     
1 threads: -----------------------------------------
      (128, 4)  |   19 (+-  0) us  |   13 (+-  0) us
6 threads: -----------------------------------------
      (128, 4)  |   19 (+-  0) us  |   13 (+-  0) us

Times are in microseconds (us).

[------------ resize cuda torch.uint8 -------------]
                |        old       |        new     
1 threads: -----------------------------------------
      (128, 4)  |   45 (+-  0) us  |   39 (+-  0) us
6 threads: -----------------------------------------
      (128, 4)  |   45 (+-  0) us  |   39 (+-  1) us

Times are in microseconds (us).

        (new_height, new_width),
    )
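A standalone sketch of this change (the helper names below are illustrative, not the torchvision API): precomputing a 4-element [w, h, w, h] ratio lets the scale broadcast over the [N, 4] XYXY tensor directly, dropping the reshape(-1, 2, 2) round-trip. Timings like the ones in the comment above can be gathered with torch.utils.benchmark.

```python
import torch
import torch.utils.benchmark as benchmark

def resize_xyxy_old(boxes, old_size, new_size):
    # old kernel: a 2-element ratio forces a reshape round-trip through [N, 2, 2]
    (oh, ow), (nh, nw) = old_size, new_size
    ratios = torch.tensor((nw / ow, nh / oh), device=boxes.device)
    return boxes.reshape(-1, 2, 2).mul(ratios).to(boxes.dtype).reshape(boxes.shape)

def resize_xyxy_new(boxes, old_size, new_size):
    # new kernel: a 4-element [w, h, w, h] ratio broadcasts over [N, 4] directly
    (oh, ow), (nh, nw) = old_size, new_size
    w_ratio, h_ratio = nw / ow, nh / oh
    ratios = torch.tensor([w_ratio, h_ratio, w_ratio, h_ratio], device=boxes.device)
    return boxes.mul(ratios).to(boxes.dtype)

boxes = torch.rand(128, 4) * 500
assert torch.equal(resize_xyxy_old(boxes, (500, 500), (250, 1000)),
                   resize_xyxy_new(boxes, (500, 500), (250, 1000)))

for fn in ("resize_xyxy_old", "resize_xyxy_new"):
    timer = benchmark.Timer(stmt=f"{fn}(boxes, (500, 500), (250, 1000))",
                            globals=globals(), num_threads=1)
    print(fn, timer.blocked_autorange())
```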

@@ -367,8 +369,7 @@ def _affine_bounding_box_xyxy(
    # 3) Reshape transformed points to [N boxes, 4 points, x/y coords]
    # and compute bounding box from 4 transformed points:
    transformed_points = transformed_points.reshape(-1, 4, 2)
-    out_bbox_mins, _ = torch.min(transformed_points, dim=1)
-    out_bbox_maxs, _ = torch.max(transformed_points, dim=1)
+    out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
    out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1)

    if expand:
@@ -388,8 +389,8 @@
        new_points = torch.matmul(points, transposed_affine_matrix)
        tr, _ = torch.min(new_points, dim=0, keepdim=True)
        # Translate bounding boxes
-        out_bboxes[:, 0::2] = out_bboxes[:, 0::2] - tr[:, 0]
-        out_bboxes[:, 1::2] = out_bboxes[:, 1::2] - tr[:, 1]
+        out_bboxes[:, 0::2].sub_(tr[:, 0])
+        out_bboxes[:, 1::2].sub_(tr[:, 1])
        # Estimate meta-data for image with inverted=True and with center=[0,0]
        affine_vector = _get_inverse_affine_matrix([0.0, 0.0], angle, translate, scale, shear)
        new_width, new_height = _FT._compute_affine_output_size(affine_vector, width, height)
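Both tweaks in this function are allocation and kernel-launch micro-optimizations; a minimal sketch of the two patterns:

```python
import torch

pts = torch.randn(128, 4, 2)  # [N boxes, 4 corners, x/y]

# torch.aminmax computes both reductions in a single pass,
# replacing separate torch.min and torch.max calls over the same tensor
mins, maxs = torch.aminmax(pts, dim=1)
assert torch.equal(mins, pts.min(dim=1).values)
assert torch.equal(maxs, pts.max(dim=1).values)

out_bboxes = torch.cat([mins, maxs], dim=1)  # [N, 4] in XYXY
tr = torch.tensor([[3.0, 5.0]])
# out_bboxes[:, 0::2] is a strided view, so sub_ writes into out_bboxes
# in place, skipping the temporary that `view - tr[:, 0]` would allocate
out_bboxes[:, 0::2].sub_(tr[:, 0])
out_bboxes[:, 1::2].sub_(tr[:, 1])
```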
@@ -947,7 +948,6 @@ def perspective_bounding_box(
    # 2) Now let's transform the points using perspective matrices
    #   x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
    #   y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1)
-
    numer_points = torch.matmul(points, theta1.T)
    denom_points = torch.matmul(points, theta2.T)
    transformed_points = numer_points / denom_points
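For clarity, the formulas in the comment above checked numerically with made-up coefficients (theta1 holds the two numerator rows, theta2 repeats the shared denominator row):

```python
import torch

coeffs = [1.0, 0.1, 2.0, 0.2, 1.0, 3.0, 0.01, 0.02]
theta1 = torch.tensor([coeffs[0:3], coeffs[3:6]])   # numerator rows
theta2 = torch.tensor([coeffs[6:8] + [1.0]] * 2)    # shared denominator row
points = torch.tensor([[10.0, 20.0, 1.0]])          # homogeneous [x, y, 1]

out = (points @ theta1.T) / (points @ theta2.T)

x, y = 10.0, 20.0
denom = coeffs[6] * x + coeffs[7] * y + 1
expected = torch.tensor([[(coeffs[0] * x + coeffs[1] * y + coeffs[2]) / denom,
                          (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / denom]])
assert torch.allclose(out, expected)
```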
@@ -1063,23 +1063,21 @@ def elastic_bounding_box(
    # Question (vfdev-5): should we rely on good displacement shape and fetch image size from it
    # Or add spatial_size arg and check displacement shape
    spatial_size = displacement.shape[-3], displacement.shape[-2]
-
-    id_grid = _FT._create_identity_grid(list(spatial_size)).to(bounding_box.device)
+    id_grid = _FT._create_identity_grid(list(spatial_size), bounding_box.device)
    # We construct an approximation of inverse grid as inv_grid = id_grid - displacement
    # This is not an exact inverse of the grid
-    inv_grid = id_grid - displacement
+    inv_grid = id_grid.sub_(displacement)

    # Get points from bboxes
-    points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
-    index_x = torch.floor(points[:, 0] + 0.5).to(dtype=torch.long)
-    index_y = torch.floor(points[:, 1] + 0.5).to(dtype=torch.long)
+    points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].ceil_().reshape(-1, 2)
+    index_x = points[:, 0].to(dtype=torch.long)
+    index_y = points[:, 1].to(dtype=torch.long)
    # Transform points:
    t_size = torch.tensor(spatial_size[::-1], device=displacement.device, dtype=displacement.dtype)
-    transformed_points = (inv_grid[0, index_y, index_x, :] + 1) * 0.5 * t_size - 0.5
+    transformed_points = inv_grid[0, index_y, index_x, :].add_(1).mul_(0.5 * t_size).sub_(0.5)

    transformed_points = transformed_points.reshape(-1, 4, 2)
-    out_bbox_mins, _ = torch.min(transformed_points, dim=1)
-    out_bbox_maxs, _ = torch.max(transformed_points, dim=1)
+    out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
    out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype)

    return convert_format_bounding_box(
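Worth noting in this hunk: both gathers use advanced indexing, which returns a copy, so the in-place ceil_() and the chained add_()/mul_()/sub_() rewrite those copies rather than the caller's tensors. Also, ceil() rounds differently from the previous floor(x + 0.5), e.g. for x = 1.2. A small sketch with made-up sizes:

```python
import torch

H, W = 4, 6
bounding_box = torch.tensor([[0.2, 0.7, 4.3, 2.9]])
inv_grid = torch.rand(1, H, W, 2) * 2 - 1  # normalized coords in [-1, 1]

# The corner gather uses advanced indexing, so `points` is a fresh copy
# and ceil_() cannot mutate `bounding_box`
points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].ceil_().reshape(-1, 2)
assert bounding_box[0, 0] == 0.2

index_x = points[:, 0].to(dtype=torch.long)
index_y = points[:, 1].to(dtype=torch.long)

# Likewise a copy, so the chained in-place ops avoid three temporaries
t_size = torch.tensor([W, H], dtype=inv_grid.dtype)
pts = inv_grid[0, index_y, index_x, :].add_(1).mul_(0.5 * t_size).sub_(0.5)
```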
6 changes: 3 additions & 3 deletions torchvision/transforms/functional_tensor.py
@@ -940,8 +940,8 @@ def erase(img: Tensor, i: int, j: int, h: int, w: int, v: Tensor, inplace: bool
    return img


-def _create_identity_grid(size: List[int]) -> Tensor:
-    hw_space = [torch.linspace((-s + 1) / s, (s - 1) / s, s) for s in size]
+def _create_identity_grid(size: List[int], device: torch.device) -> Tensor:
+    hw_space = [torch.linspace((-s + 1) / s, (s - 1) / s, s, device=device) for s in size]
    grid_y, grid_x = torch.meshgrid(hw_space, indexing="ij")
    return torch.stack([grid_x, grid_y], -1).unsqueeze(0)  # 1 x H x W x 2
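The new device parameter exists so the grid can be allocated directly on the target device instead of being built on CPU and copied over afterwards; the generic pattern:

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Old pattern: build on CPU, then pay for an extra allocation plus a
# host-to-device copy
grid = torch.linspace(-1.0, 1.0, 1024).to(device)

# New pattern: allocate directly on the target device
grid = torch.linspace(-1.0, 1.0, 1024, device=device)
```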

@@ -959,6 +959,6 @@ def elastic_transform(
    size = list(img.shape[-2:])
    displacement = displacement.to(img.device)

-    identity_grid = _create_identity_grid(size)
+    identity_grid = _create_identity_grid(size, img.device)
    grid = identity_grid.to(img.device) + displacement
    return _apply_grid_transform(img, grid, interpolation, fill)