From 7213e5669bbdbf2f2712fecbc68354b5d09a97ac Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 8 Feb 2023 20:21:19 +0100 Subject: [PATCH 01/13] [DEBUG] test all float dtypes for images, videos, and boxes --- test/prototype_common_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index 1cea10603ec..a38fa61cc66 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -295,7 +295,7 @@ def make_image_loaders( "RGBA", ), extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.float32, torch.uint8), + dtypes=(torch.float32, torch.float16, torch.float64, torch.uint8), constant_alpha=True, ): for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes): @@ -417,7 +417,7 @@ def make_bounding_box_loaders( extra_dims=DEFAULT_EXTRA_DIMS, formats=tuple(datapoints.BoundingBoxFormat), spatial_size="random", - dtypes=(torch.float32, torch.int64), + dtypes=(torch.float32, torch.float16, torch.float64, torch.int64), ): for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): yield make_bounding_box_loader(**params, spatial_size=spatial_size) @@ -609,7 +609,7 @@ def make_video_loaders( ), num_frames=(1, 0, "random"), extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), + dtypes=(torch.uint8, torch.float32, torch.float16, torch.float64), ): for params in combinations_grid( size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes From 4e843d9c5dc56ab0ab46366a03e7dfdc55591e96 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 9 Feb 2023 09:39:34 +0100 Subject: [PATCH 02/13] remove float16 for now --- test/prototype_common_utils.py | 6 +++--- test/prototype_transforms_kernel_infos.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py index a38fa61cc66..c7bf18e22a7 100644 --- a/test/prototype_common_utils.py +++ b/test/prototype_common_utils.py @@ -295,7 +295,7 @@ def make_image_loaders( "RGBA", ), extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.float32, torch.float16, torch.float64, torch.uint8), + dtypes=(torch.float32, torch.float64, torch.uint8), constant_alpha=True, ): for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes): @@ -417,7 +417,7 @@ def make_bounding_box_loaders( extra_dims=DEFAULT_EXTRA_DIMS, formats=tuple(datapoints.BoundingBoxFormat), spatial_size="random", - dtypes=(torch.float32, torch.float16, torch.float64, torch.int64), + dtypes=(torch.float32, torch.float64, torch.int64), ): for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): yield make_bounding_box_loader(**params, spatial_size=spatial_size) @@ -609,7 +609,7 @@ def make_video_loaders( ), num_frames=(1, 0, "random"), extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8, torch.float32, torch.float16, torch.float64), + dtypes=(torch.uint8, torch.float32, torch.float64), ): for params in combinations_grid( size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 1fac1526248..2c07224d384 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -1263,7 +1263,9 @@ def _get_elastic_displacement(spatial_size): def sample_inputs_elastic_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"]): + for image_loader in make_image_loaders( + sizes=["random"], dtypes=[torch.uint8, torch.float16, torch.float32, torch.float64] + ): displacement = _get_elastic_displacement(image_loader.spatial_size) for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) From 1abb0a820847fa8d5cc48019613e26a8b3e32901 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Thu, 9 Feb 2023 21:20:02 +0100 Subject: [PATCH 03/13] Better handle dtype for elastic transform --- test/prototype_transforms_kernel_infos.py | 2 +- .../transforms/functional/_geometry.py | 33 ++++++++++++++----- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 2c07224d384..efe492eb3b7 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -1264,7 +1264,7 @@ def _get_elastic_displacement(spatial_size): def sample_inputs_elastic_image_tensor(): for image_loader in make_image_loaders( - sizes=["random"], dtypes=[torch.uint8, torch.float16, torch.float32, torch.float64] + sizes=["random"], dtypes=[torch.uint8, torch.float32, torch.float64] ): displacement = _get_elastic_displacement(image_loader.spatial_size) for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 66e777dbdcc..7245682b037 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -1494,9 +1494,11 @@ def elastic_image_tensor( shape = image.shape ndim = image.ndim - device = image.device fp = torch.is_floating_point(image) + device = image.device + dtype = image.dtype if fp else torch.float32 + if ndim > 4: image = image.reshape((-1,) + shape[-3:]) needs_unsquash = True @@ -1506,9 +1508,15 @@ def elastic_image_tensor( else: needs_unsquash = False + if displacement.dtype != dtype: + displacement = displacement.to(dtype) + + if displacement.device != device: + displacement = displacement.to(device) + image_height, image_width = shape[-2:] - grid = _create_identity_grid((image_height, image_width), device=device).add_(displacement.to(device)) - output = _apply_grid_transform(image if fp else image.to(torch.float32), grid, interpolation.value, fill=fill) + grid = _create_identity_grid((image_height, image_width), device=device, dtype=dtype).add_(displacement) + output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill) if not fp: output = output.round_().to(image.dtype) @@ -1531,13 +1539,13 @@ def elastic_image_pil( return to_pil_image(output, mode=image.mode) -def _create_identity_grid(size: Tuple[int, int], device: torch.device) -> torch.Tensor: +def _create_identity_grid(size: Tuple[int, int], device: torch.device, dtype: torch.dtype) -> torch.Tensor: sy, sx = size - base_grid = torch.empty(1, sy, sx, 2, device=device) - x_grid = torch.linspace((-sx + 1) / sx, (sx - 1) / sx, sx, device=device) + base_grid = torch.empty(1, sy, sx, 2, device=device, dtype=dtype) + x_grid = torch.linspace((-sx + 1) / sx, (sx - 1) / sx, sx, device=device, dtype=dtype) base_grid[..., 0].copy_(x_grid) - y_grid = torch.linspace((-sy + 1) / sy, (sy - 1) / sy, sy, device=device).unsqueeze_(-1) + y_grid = torch.linspace((-sy + 1) / sy, (sy - 1) / sy, sy, device=device, dtype=dtype).unsqueeze_(-1) base_grid[..., 1].copy_(y_grid) return base_grid @@ -1552,7 +1560,14 @@ def elastic_bounding_box( return bounding_box # TODO: add in docstring about approximation we are doing for grid inversion - displacement = displacement.to(bounding_box.device) + device = bounding_box.device + dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32 + + if displacement.dtype != dtype: + displacement = displacement.to(dtype) + + if displacement.device != device: + displacement = displacement.to(device) original_shape = bounding_box.shape bounding_box = ( @@ -1563,7 +1578,7 @@ def elastic_bounding_box( # Or add spatial_size arg and check displacement shape spatial_size = displacement.shape[-3], displacement.shape[-2] - id_grid = _create_identity_grid(spatial_size, bounding_box.device) + id_grid = _create_identity_grid(spatial_size, device=device, dtype=dtype) # We construct an approximation of inverse grid as inv_grid = id_grid - displacement # This is not an exact inverse of the grid inv_grid = id_grid.sub_(displacement) From 900424523fa0955f393b3336758802ff6691cf1a Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Thu, 9 Feb 2023 21:38:31 +0100 Subject: [PATCH 04/13] Fixed dtype handling for affine transform --- test/prototype_transforms_kernel_infos.py | 5 ++++- torchvision/prototype/transforms/functional/_geometry.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index efe492eb3b7..24534d856c6 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -537,8 +537,10 @@ def reference_affine_bounding_box_helper(bounding_box, *, format, affine_matrix) def transform(bbox, affine_matrix_, format_): # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 in_dtype = bbox.dtype + if not torch.is_floating_point(bbox): + bbox = bbox.float() bbox_xyxy = F.convert_format_bounding_box( - bbox.float(), old_format=format_, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True + bbox, old_format=format_, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True ) points = np.array( [ @@ -556,6 +558,7 @@ def transform(bbox, affine_matrix_, format_): np.max(transformed_points[:, 0]).item(), np.max(transformed_points[:, 1]).item(), ], + dtype=bbox_xyxy.dtype, ) out_bbox = F.convert_format_bounding_box( out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format_, inplace=True diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 7245682b037..c71153711cf 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -612,7 +612,7 @@ def _affine_bounding_box_xyxy( # Single point structure is similar to # [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)] points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2) - points = torch.cat([points, torch.ones(points.shape[0], 1, device=points.device)], dim=-1) + points = torch.cat([points, torch.ones(points.shape[0], 1, device=device, dtype=dtype)], dim=-1) # 2) Now let's transform the points using affine matrix transformed_points = torch.matmul(points, transposed_affine_matrix) # 3) Reshape transformed points to [N boxes, 4 points, x/y coords] From f93c7431af33996f0381ddda8143eedd72df89ac Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Thu, 9 Feb 2023 22:15:23 +0100 Subject: [PATCH 05/13] Skip tests dtype f64 for perspective --- test/prototype_transforms_kernel_infos.py | 25 ++++++++++++++----- test/test_prototype_transforms_functional.py | 2 +- .../transforms/functional/_geometry.py | 4 +-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 24534d856c6..feb2ecf06ba 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -80,6 +80,13 @@ def _pixel_difference_closeness_kwargs(uint8_atol, *, dtype=torch.uint8, mae=Fal return dict(atol=uint8_atol / 255 * get_max_value(dtype), rtol=0, mae=mae) +def scripted_vs_eager_double_pixel_difference(atol=1e-6, rtol=1e-5): + return { + (("TestKernels", "test_scripted_vs_eager"), dtype, device): {"atol": atol, "rtol": rtol, "mae": False} + for device, dtype in [("cpu", torch.float64), ("cuda", torch.float64)] + } + + def cuda_vs_cpu_pixel_difference(atol=1): return { (("TestKernels", "test_cuda_vs_cpu"), dtype, "cuda"): _pixel_difference_closeness_kwargs(atol, dtype=dtype) @@ -1174,7 +1181,8 @@ def reference_inputs_pad_bounding_box(): def sample_inputs_perspective_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"]): + # Skip tests on dtype float64, otherwise scripted vs eager are failing + for image_loader in make_image_loaders(sizes=["random"], dtypes=(torch.uint8, torch.float32)): for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): yield ArgsKwargs(image_loader, None, None, fill=fill, coefficients=_PERSPECTIVE_COEFFS[0]) @@ -1220,7 +1228,12 @@ def reference_inputs_perspective_mask(): def sample_inputs_perspective_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + # Skip tests on dtype float64, otherwise scripted vs eager are failing + for video_loader in make_video_loaders( + sizes=["random"], + num_frames=["random"], + dtypes=(torch.uint8, torch.float32), + ): yield ArgsKwargs(video_loader, None, None, coefficients=_PERSPECTIVE_COEFFS[0]) @@ -1255,7 +1268,9 @@ def sample_inputs_perspective_video(): KernelInfo( F.perspective_video, sample_inputs_fn=sample_inputs_perspective_video, - closeness_kwargs=cuda_vs_cpu_pixel_difference(), + closeness_kwargs={ + **cuda_vs_cpu_pixel_difference(), + }, ), ] ) @@ -1266,9 +1281,7 @@ def _get_elastic_displacement(spatial_size): def sample_inputs_elastic_image_tensor(): - for image_loader in make_image_loaders( - sizes=["random"], dtypes=[torch.uint8, torch.float32, torch.float64] - ): + for image_loader in make_image_loaders(sizes=["random"], dtypes=[torch.uint8, torch.float32, torch.float64]): displacement = _get_elastic_displacement(image_loader.spatial_size) for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index 649620eda62..2f6b3f59d17 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -142,7 +142,7 @@ def test_scripted_vs_eager(self, test_id, info, args_kwargs, device): actual, expected, **info.get_closeness_kwargs(test_id, dtype=input.dtype, device=input.device), - msg=parametrized_error_message(*other_args, **kwargs), + msg=parametrized_error_message(*([actual, expected] + other_args), **kwargs), ) def _unbatch(self, batch, *, data_dims): diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index c71153711cf..fffc8068208 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -1237,9 +1237,9 @@ def _perspective_grid(coeffs: List[float], ow: int, oh: int, dtype: torch.dtype, d = 0.5 base_grid = torch.empty(1, oh, ow, 3, dtype=dtype, device=device) - x_grid = torch.linspace(d, ow + d - 1.0, steps=ow, device=device) + x_grid = torch.linspace(d, ow + d - 1.0, steps=ow, device=device, dtype=dtype) base_grid[..., 0].copy_(x_grid) - y_grid = torch.linspace(d, oh + d - 1.0, steps=oh, device=device).unsqueeze_(-1) + y_grid = torch.linspace(d, oh + d - 1.0, steps=oh, device=device, dtype=dtype).unsqueeze_(-1) base_grid[..., 1].copy_(y_grid) base_grid[..., 2].fill_(1) From c5e17db7228fdde60c2f5cdcb65a287f658d4e60 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Thu, 9 Feb 2023 22:19:56 +0100 Subject: [PATCH 06/13] Skipped tests f64 for rotate bboxes transforms --- test/prototype_transforms_kernel_infos.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index feb2ecf06ba..880a7cf262c 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -794,7 +794,11 @@ def reference_inputs_rotate_image_tensor(): def sample_inputs_rotate_bounding_box(): - for bounding_box_loader in make_bounding_box_loaders(): + # Skip test for dtype=float64, otherwise test_scripted_vs_eager is failing + # Mismatched elements: 6 / 24 (25.0%) + # Greatest absolute difference: 1.0638606902091396e-06 at index (1, 0, 0) (up to 1e-07 allowed) + # Greatest relative difference: 4.713177909318502e-06 at index (0, 0, 0) (up to 1e-07 allowed) + for bounding_box_loader in make_bounding_box_loaders(dtypes=(torch.float32, torch.int64)): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, From b2a3071b08e8f46b463c5b401013143ff8c8903c Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Thu, 9 Feb 2023 22:38:24 +0100 Subject: [PATCH 07/13] Cast image dtype inside _apply_grid_transform --- .../transforms/functional/_geometry.py | 41 +++++++------------ 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index fffc8068208..b8766cfdd5c 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -404,9 +404,12 @@ def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[in def _apply_grid_transform( - float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: datapoints.FillTypeJIT + img: torch.Tensor, grid: torch.Tensor, mode: str, fill: datapoints.FillTypeJIT ) -> torch.Tensor: + fp = img.dtype == grid.dtype + float_img = img if fp else img.to(grid.dtype) + shape = float_img.shape if shape[0] > 1: # Apply same grid to a batch of images @@ -433,7 +436,9 @@ def _apply_grid_transform( # img * mask + (1.0 - mask) * fill = img * mask - fill * mask + fill = mask * (img - fill) + fill float_img = float_img.sub_(fill_img).mul_(mask).add_(fill_img) - return float_img + img = float_img.round_().to(img.dtype) if not fp else float_img + + return img def _assert_grid_transform_inputs( @@ -511,7 +516,6 @@ def affine_image_tensor( shape = image.shape ndim = image.ndim - fp = torch.is_floating_point(image) if ndim > 4: image = image.reshape((-1,) + shape[-3:]) @@ -535,13 +539,10 @@ def affine_image_tensor( _assert_grid_transform_inputs(image, matrix, interpolation.value, fill, ["nearest", "bilinear"]) - dtype = image.dtype if fp else torch.float32 + dtype = image.dtype if torch.is_floating_point(image) else torch.float32 theta = torch.tensor(matrix, dtype=dtype, device=image.device).reshape(1, 2, 3) grid = _affine_grid(theta, w=width, h=height, ow=width, oh=height) - output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill) - - if not fp: - output = output.round_().to(image.dtype) + output = _apply_grid_transform(image, grid, interpolation.value, fill=fill) if needs_unsquash: output = output.reshape(shape) @@ -797,19 +798,15 @@ def rotate_image_tensor( matrix = _get_inverse_affine_matrix(center_f, -angle, [0.0, 0.0], 1.0, [0.0, 0.0]) if image.numel() > 0: - fp = torch.is_floating_point(image) image = image.reshape(-1, num_channels, height, width) _assert_grid_transform_inputs(image, matrix, interpolation.value, fill, ["nearest", "bilinear"]) ow, oh = _compute_affine_output_size(matrix, width, height) if expand else (width, height) - dtype = image.dtype if fp else torch.float32 + dtype = image.dtype if torch.is_floating_point(image) else torch.float32 theta = torch.tensor(matrix, dtype=dtype, device=image.device).reshape(1, 2, 3) grid = _affine_grid(theta, w=width, h=height, ow=ow, oh=oh) - output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill) - - if not fp: - output = output.round_().to(image.dtype) + output = _apply_grid_transform(image, grid, interpolation.value, fill=fill) new_height, new_width = output.shape[-2:] else: @@ -1283,7 +1280,6 @@ def perspective_image_tensor( shape = image.shape ndim = image.ndim - fp = torch.is_floating_point(image) if ndim > 4: image = image.reshape((-1,) + shape[-3:]) @@ -1304,12 +1300,9 @@ def perspective_image_tensor( ) oh, ow = shape[-2:] - dtype = image.dtype if fp else torch.float32 + dtype = image.dtype if torch.is_floating_point(image) else torch.float32 grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh, dtype=dtype, device=image.device) - output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill) - - if not fp: - output = output.round_().to(image.dtype) + output = _apply_grid_transform(image, grid, interpolation.value, fill=fill) if needs_unsquash: output = output.reshape(shape) @@ -1494,10 +1487,9 @@ def elastic_image_tensor( shape = image.shape ndim = image.ndim - fp = torch.is_floating_point(image) device = image.device - dtype = image.dtype if fp else torch.float32 + dtype = image.dtype if torch.is_floating_point(image) else torch.float32 if ndim > 4: image = image.reshape((-1,) + shape[-3:]) @@ -1516,10 +1508,7 @@ def elastic_image_tensor( image_height, image_width = shape[-2:] grid = _create_identity_grid((image_height, image_width), device=device, dtype=dtype).add_(displacement) - output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill) - - if not fp: - output = output.round_().to(image.dtype) + output = _apply_grid_transform(image, grid, interpolation.value, fill=fill) if needs_unsquash: output = output.reshape(shape) From d02f3dd3a296eb2f7898e67180fa3b68d115acd2 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Fri, 10 Feb 2023 14:30:23 +0100 Subject: [PATCH 08/13] Fixed dtype inconsistency in v1 and v2 for LinearTransformation --- test/test_prototype_transforms_consistency.py | 33 ++++++++++++------- torchvision/prototype/transforms/_misc.py | 12 ++++++- torchvision/transforms/transforms.py | 15 +++++++-- 3 files changed, 45 insertions(+), 15 deletions(-) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 758acc7b10a..f0a7b44db3b 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -138,17 +138,28 @@ def __init__( NotScriptableArgsKwargs(5, padding_mode="symmetric"), ], ), - ConsistencyConfig( - prototype_transforms.LinearTransformation, - legacy_transforms.LinearTransformation, - [ - ArgsKwargs(LINEAR_TRANSFORMATION_MATRIX, LINEAR_TRANSFORMATION_MEAN), - ], - # Make sure that the product of the height, width and number of channels matches the number of elements in - # `LINEAR_TRANSFORMATION_MEAN`. For example 2 * 6 * 3 == 4 * 3 * 3 == 36. - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(2, 6), (4, 3)], color_spaces=["RGB"]), - supports_pil=False, - ), + *[ + ConsistencyConfig( + prototype_transforms.LinearTransformation, + legacy_transforms.LinearTransformation, + [ + ArgsKwargs(LINEAR_TRANSFORMATION_MATRIX.to(matrix_dtype), LINEAR_TRANSFORMATION_MEAN.to(matrix_dtype)), + ], + # Make sure that the product of the height, width and number of channels matches the number of elements in + # `LINEAR_TRANSFORMATION_MEAN`. For example 2 * 6 * 3 == 4 * 3 * 3 == 36. + make_images_kwargs=dict( + DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(2, 6), (4, 3)], color_spaces=["RGB"], dtypes=[image_dtype] + ), + supports_pil=False, + ) + for matrix_dtype, image_dtype in [ + (torch.float32, torch.float32), + (torch.float64, torch.float64), + (torch.float32, torch.uint8), + (torch.float64, torch.float32), + (torch.float32, torch.float64), + ] + ], ConsistencyConfig( prototype_transforms.Grayscale, legacy_transforms.Grayscale, diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index e7bb62da18e..04ed519aecb 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -64,6 +64,11 @@ def __init__(self, transformation_matrix: torch.Tensor, mean_vector: torch.Tenso f"Input tensors should be on the same device. Got {transformation_matrix.device} and {mean_vector.device}" ) + if transformation_matrix.dtype != mean_vector.dtype: + raise ValueError( + f"Input tensors should have the same dtype. Got {transformation_matrix.dtype} and {mean_vector.dtype}" + ) + self.transformation_matrix = transformation_matrix self.mean_vector = mean_vector @@ -93,7 +98,12 @@ def _transform( ) flat_tensor = inpt.reshape(-1, n) - self.mean_vector - transformed_tensor = torch.mm(flat_tensor, self.transformation_matrix) + + transformation_matrix = self.transformation_matrix + if flat_tensor.dtype != transformation_matrix.dtype: + transformation_matrix = transformation_matrix.to(flat_tensor.dtype) + + transformed_tensor = torch.mm(flat_tensor, transformation_matrix) return transformed_tensor.reshape(shape) diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index 9395ca674f4..857226b07ab 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -1078,6 +1078,11 @@ def __init__(self, transformation_matrix, mean_vector): f"Input tensors should be on the same device. Got {transformation_matrix.device} and {mean_vector.device}" ) + if transformation_matrix.dtype != mean_vector.dtype: + raise ValueError( + f"Input tensors should have the same dtype. Got {transformation_matrix.dtype} and {mean_vector.dtype}" + ) + self.transformation_matrix = transformation_matrix self.mean_vector = mean_vector @@ -1105,9 +1110,13 @@ def forward(self, tensor: Tensor) -> Tensor: ) flat_tensor = tensor.view(-1, n) - self.mean_vector - transformed_tensor = torch.mm(flat_tensor, self.transformation_matrix) - tensor = transformed_tensor.view(shape) - return tensor + + transformation_matrix = self.transformation_matrix + if flat_tensor.dtype != transformation_matrix.dtype: + transformation_matrix = transformation_matrix.to(flat_tensor.dtype) + + transformed_tensor = torch.mm(flat_tensor, transformation_matrix) + return transformed_tensor.view(shape) def __repr__(self) -> str: s = ( From c44dc55419428890148d8ec0d0440c5d09ae73d8 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Fri, 10 Feb 2023 17:02:11 +0100 Subject: [PATCH 09/13] Reverted unused changes and addressed review comments --- test/prototype_transforms_kernel_infos.py | 11 +---------- torchvision/prototype/transforms/_misc.py | 5 +---- torchvision/transforms/transforms.py | 5 +---- 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 880a7cf262c..d8ee770c293 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -80,13 +80,6 @@ def _pixel_difference_closeness_kwargs(uint8_atol, *, dtype=torch.uint8, mae=Fal return dict(atol=uint8_atol / 255 * get_max_value(dtype), rtol=0, mae=mae) -def scripted_vs_eager_double_pixel_difference(atol=1e-6, rtol=1e-5): - return { - (("TestKernels", "test_scripted_vs_eager"), dtype, device): {"atol": atol, "rtol": rtol, "mae": False} - for device, dtype in [("cpu", torch.float64), ("cuda", torch.float64)] - } - - def cuda_vs_cpu_pixel_difference(atol=1): return { (("TestKernels", "test_cuda_vs_cpu"), dtype, "cuda"): _pixel_difference_closeness_kwargs(atol, dtype=dtype) @@ -1272,9 +1265,7 @@ def sample_inputs_perspective_video(): KernelInfo( F.perspective_video, sample_inputs_fn=sample_inputs_perspective_video, - closeness_kwargs={ - **cuda_vs_cpu_pixel_difference(), - }, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), ), ] ) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index 04ed519aecb..39d9dc103f4 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -99,10 +99,7 @@ def _transform( flat_tensor = inpt.reshape(-1, n) - self.mean_vector - transformation_matrix = self.transformation_matrix - if flat_tensor.dtype != transformation_matrix.dtype: - transformation_matrix = transformation_matrix.to(flat_tensor.dtype) - + transformation_matrix = self.transformation_matrix.to(flat_tensor.dtype) transformed_tensor = torch.mm(flat_tensor, transformation_matrix) return transformed_tensor.reshape(shape) diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index 857226b07ab..e39e04c3478 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -1111,10 +1111,7 @@ def forward(self, tensor: Tensor) -> Tensor: flat_tensor = tensor.view(-1, n) - self.mean_vector - transformation_matrix = self.transformation_matrix - if flat_tensor.dtype != transformation_matrix.dtype: - transformation_matrix = transformation_matrix.to(flat_tensor.dtype) - + transformation_matrix = self.transformation_matrix.to(flat_tensor.dtype) transformed_tensor = torch.mm(flat_tensor, transformation_matrix) return transformed_tensor.view(shape) From a9be544ed2cb7aff6ebbef26ff6d4d97afd2e9ba Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Mon, 13 Feb 2023 11:47:44 +0100 Subject: [PATCH 10/13] Added more comments and enables f64 tests with large atol/rtol --- test/prototype_transforms_kernel_infos.py | 21 ++++++++++++------- .../transforms/functional/_geometry.py | 18 +++++++--------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index d8ee770c293..83ca4037a37 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -105,6 +105,12 @@ def float32_vs_uint8_pixel_difference(atol=1, mae=False): } +def scripted_vs_eager_double_pixel_difference(device, atol=1e-6, rtol=1e-6): + return { + (("TestKernels", "test_scripted_vs_eager"), torch.float64, device): {"atol": atol, "rtol": rtol, "mae": False}, + } + + def pil_reference_wrapper(pil_kernel): @functools.wraps(pil_kernel) def wrapper(input_tensor, *other_args, **kwargs): @@ -787,11 +793,7 @@ def reference_inputs_rotate_image_tensor(): def sample_inputs_rotate_bounding_box(): - # Skip test for dtype=float64, otherwise test_scripted_vs_eager is failing - # Mismatched elements: 6 / 24 (25.0%) - # Greatest absolute difference: 1.0638606902091396e-06 at index (1, 0, 0) (up to 1e-07 allowed) - # Greatest relative difference: 4.713177909318502e-06 at index (0, 0, 0) (up to 1e-07 allowed) - for bounding_box_loader in make_bounding_box_loaders(dtypes=(torch.float32, torch.int64)): + for bounding_box_loader in make_bounding_box_loaders(dtypes=(torch.float32, torch.float64, torch.int64)): yield ArgsKwargs( bounding_box_loader, format=bounding_box_loader.format, @@ -828,6 +830,10 @@ def sample_inputs_rotate_video(): KernelInfo( F.rotate_bounding_box, sample_inputs_fn=sample_inputs_rotate_bounding_box, + closeness_kwargs={ + **scripted_vs_eager_double_pixel_difference("cpu", atol=1e-6, rtol=1e-6), + **scripted_vs_eager_double_pixel_difference("cuda", atol=1e-6, rtol=1e-6), + }, ), KernelInfo( F.rotate_mask, @@ -1178,8 +1184,7 @@ def reference_inputs_pad_bounding_box(): def sample_inputs_perspective_image_tensor(): - # Skip tests on dtype float64, otherwise scripted vs eager are failing - for image_loader in make_image_loaders(sizes=["random"], dtypes=(torch.uint8, torch.float32)): + for image_loader in make_image_loaders(sizes=["random"], dtypes=(torch.uint8, torch.float32, torch.float64)): for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): yield ArgsKwargs(image_loader, None, None, fill=fill, coefficients=_PERSPECTIVE_COEFFS[0]) @@ -1246,6 +1251,8 @@ def sample_inputs_perspective_video(): **pil_reference_pixel_difference(2, mae=True), **cuda_vs_cpu_pixel_difference(), **float32_vs_uint8_pixel_difference(), + **scripted_vs_eager_double_pixel_difference("cpu", atol=1e-5, rtol=1e-5), + **scripted_vs_eager_double_pixel_difference("cuda", atol=1e-5, rtol=1e-5), }, ), KernelInfo( diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index b8766cfdd5c..aa16dc0afed 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -407,6 +407,7 @@ def _apply_grid_transform( img: torch.Tensor, grid: torch.Tensor, mode: str, fill: datapoints.FillTypeJIT ) -> torch.Tensor: + # We are using context knowledge that grid should have float dtype fp = img.dtype == grid.dtype float_img = img if fp else img.to(grid.dtype) @@ -1490,6 +1491,9 @@ def elastic_image_tensor( device = image.device dtype = image.dtype if torch.is_floating_point(image) else torch.float32 + # We are aware that if input image dtype is uint8 and displacement is float64 then + # displacement will be casted to float32 and all computations will be done with float32 + # We can fix this later if needed if ndim > 4: image = image.reshape((-1,) + shape[-3:]) @@ -1500,11 +1504,8 @@ def elastic_image_tensor( else: needs_unsquash = False - if displacement.dtype != dtype: - displacement = displacement.to(dtype) - - if displacement.device != device: - displacement = displacement.to(device) + if displacement.dtype != dtype or displacement.device != device: + displacement = displacement.to(dtype=dtype, device=device) image_height, image_width = shape[-2:] grid = _create_identity_grid((image_height, image_width), device=device, dtype=dtype).add_(displacement) @@ -1552,11 +1553,8 @@ def elastic_bounding_box( device = bounding_box.device dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32 - if displacement.dtype != dtype: - displacement = displacement.to(dtype) - - if displacement.device != device: - displacement = displacement.to(device) + if displacement.dtype != dtype or displacement.device != device: + displacement = displacement.to(dtype=dtype, device=device) original_shape = bounding_box.shape bounding_box = ( From 35f34128369304a4b66c9d906a80adfedcd49e32 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Mon, 13 Feb 2023 11:52:34 +0100 Subject: [PATCH 11/13] Added f64 tests for perspective op on video input + updated atol/rtol --- test/prototype_transforms_kernel_infos.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index 83ca4037a37..25153a7f71f 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -1230,11 +1230,10 @@ def reference_inputs_perspective_mask(): def sample_inputs_perspective_video(): - # Skip tests on dtype float64, otherwise scripted vs eager are failing for video_loader in make_video_loaders( sizes=["random"], num_frames=["random"], - dtypes=(torch.uint8, torch.float32), + dtypes=(torch.uint8, torch.float32, torch.float64), ): yield ArgsKwargs(video_loader, None, None, coefficients=_PERSPECTIVE_COEFFS[0]) @@ -1272,7 +1271,11 @@ def sample_inputs_perspective_video(): KernelInfo( F.perspective_video, sample_inputs_fn=sample_inputs_perspective_video, - closeness_kwargs=cuda_vs_cpu_pixel_difference(), + closeness_kwargs={ + **cuda_vs_cpu_pixel_difference(), + **scripted_vs_eager_double_pixel_difference("cpu", atol=1e-5, rtol=1e-5), + **scripted_vs_eager_double_pixel_difference("cuda", atol=1e-5, rtol=1e-5), + } ), ] ) From bf46576b7e0bf8978d0f398530848c4258fea4fd Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Mon, 13 Feb 2023 12:25:11 +0100 Subject: [PATCH 12/13] Fixed lint --- test/prototype_transforms_kernel_infos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index c6bfce92655..c54496a57cf 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -1293,7 +1293,7 @@ def sample_inputs_perspective_video(): **cuda_vs_cpu_pixel_difference(), **scripted_vs_eager_double_pixel_difference("cpu", atol=1e-5, rtol=1e-5), **scripted_vs_eager_double_pixel_difference("cuda", atol=1e-5, rtol=1e-5), - } + }, ), ] ) From d143d332f991837296cc3e35f1dbedddc625c244 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Mon, 13 Feb 2023 12:59:39 +0100 Subject: [PATCH 13/13] Relaxed atol for rotate bbox tests --- test/prototype_transforms_kernel_infos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py index c54496a57cf..ff12a61cb1c 100644 --- a/test/prototype_transforms_kernel_infos.py +++ b/test/prototype_transforms_kernel_infos.py @@ -835,7 +835,7 @@ def sample_inputs_rotate_video(): sample_inputs_fn=sample_inputs_rotate_bounding_box, closeness_kwargs={ **scripted_vs_eager_double_pixel_difference("cpu", atol=1e-6, rtol=1e-6), - **scripted_vs_eager_double_pixel_difference("cuda", atol=1e-6, rtol=1e-6), + **scripted_vs_eager_double_pixel_difference("cuda", atol=1e-5, rtol=1e-5), }, ), KernelInfo(