diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index 52979a019e7..1914bc571fb 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -1352,16 +1352,24 @@ def test_ten_crop(device): assert_equal(transformed_batch, s_transformed_batch) +def test_elastic_transform_asserts(): + with pytest.raises(TypeError, match="Argument displacement should be a Tensor"): + _ = F.elastic_transform("abc", displacement=None) + + with pytest.raises(TypeError, match="img should be PIL Image or Tensor"): + _ = F.elastic_transform("abc", displacement=torch.rand(1)) + + img_tensor = torch.rand(1, 3, 32, 24) + with pytest.raises(ValueError, match="Argument displacement shape should"): + _ = F.elastic_transform(img_tensor, displacement=torch.rand(1, 2)) + + @pytest.mark.parametrize("device", cpu_and_gpu()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize( "fill", - [ - None, - [255, 255, 255], - (2.0,), - ], + [None, [255, 255, 255], (2.0,)], ) def test_elastic_transform_consistency(device, interpolation, dt, fill): script_elastic_transform = torch.jit.script(F.elastic_transform) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 1a56e8d3928..33dd94925b6 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1,11 +1,20 @@ import itertools +import PIL.Image + import pytest import torch from common_utils import assert_equal -from test_prototype_transforms_functional import make_bounding_boxes, make_images, make_one_hot_labels +from test_prototype_transforms_functional import ( + make_bounding_box, + make_bounding_boxes, + make_images, + make_label, + make_one_hot_labels, + make_segmentation_mask, +) from torchvision.prototype import features, transforms -from torchvision.transforms.functional import pil_to_tensor, to_pil_image +from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image def make_vanilla_tensor_images(*args, **kwargs): @@ -75,6 +84,13 @@ class TestSmoke: transforms.RandomZoomOut(), transforms.RandomRotation(degrees=(-45, 45)), transforms.RandomAffine(degrees=(-45, 45)), + transforms.RandomCrop([16, 16], padding=1, pad_if_needed=True), + # TODO: Something wrong with input data setup. 
Let's fix that + # transforms.RandomEqualize(), + # transforms.RandomInvert(), + # transforms.RandomPosterize(bits=4), + # transforms.RandomSolarize(threshold=0.5), + # transforms.RandomAdjustSharpness(sharpness_factor=0.5), ) def test_common(self, transform, input): transform(input) @@ -102,6 +118,20 @@ def test_common(self, transform, input): def test_mixup_cutmix(self, transform, input): transform(input) + # add other data that should bypass and won't raise any error + input_copy = dict(input) + input_copy["path"] = "/path/to/somewhere" + input_copy["num"] = 1234 + transform(input_copy) + + # Check if we raise an error if sample contains bbox or mask or label + err_msg = "does not support bounding boxes, segmentation masks and plain labels" + input_copy = dict(input) + for unsup_data in [make_label(), make_bounding_box(format="XYXY"), make_segmentation_mask()]: + input_copy["unsupported"] = unsup_data + with pytest.raises(TypeError, match=err_msg): + transform(input_copy) + @parametrize( [ ( @@ -299,3 +329,665 @@ def test_features_bounding_box(self, p): assert_equal(expected, actual) assert actual.format == expected.format assert actual.image_size == expected.image_size + + +class TestPad: + def test_assertions(self): + with pytest.raises(TypeError, match="Got inappropriate padding arg"): + transforms.Pad("abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.Pad([-0.7, 0, 0.7]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.Pad(12, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.Pad(12, padding_mode="abc") + + @pytest.mark.parametrize("padding", [1, (1, 2), [1, 2, 3, 4]]) + @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) + @pytest.mark.parametrize("padding_mode", ["constant", "edge"]) + def test__transform(self, padding, fill, padding_mode, mocker): + transform = transforms.Pad(padding, fill=fill, padding_mode=padding_mode) + + fn = mocker.patch("torchvision.prototype.transforms.functional.pad") + inpt = mocker.MagicMock(spec=features.Image) + _ = transform(inpt) + + fn.assert_called_once_with(inpt, padding=padding, fill=fill, padding_mode=padding_mode) + + +class TestRandomZoomOut: + def test_assertions(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomZoomOut(fill="abc") + + with pytest.raises(TypeError, match="should be a sequence of length"): + transforms.RandomZoomOut(0, side_range=0) + + with pytest.raises(ValueError, match="Invalid canvas side range"): + transforms.RandomZoomOut(0, side_range=[4.0, 1.0]) + + @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) + @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) + def test__get_params(self, fill, side_range, mocker): + transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) + + image = mocker.MagicMock(spec=features.Image) + c = image.num_channels = 3 + h, w = image.image_size = (24, 32) + + params = transform._get_params(image) + + assert params["fill"] == (fill if not isinstance(fill, int) else [fill] * c) + assert len(params["padding"]) == 4 + assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w + assert 0 <= params["padding"][1] <= (side_range[1] - 1) * h + assert 0 <= params["padding"][2] <= (side_range[1] - 1) * w + assert 0 <= params["padding"][3] <= (side_range[1] - 1) * h + + @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) + @pytest.mark.parametrize("side_range", [(1.0, 4.0),
[2.0, 5.0]]) + def test__transform(self, fill, side_range, mocker): + inpt = mocker.MagicMock(spec=features.Image) + inpt.num_channels = 3 + inpt.image_size = (24, 32) + + transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1) + + fn = mocker.patch("torchvision.prototype.transforms.functional.pad") + # vfdev-5, Feature Request: let's store params as Transform attribute + # This could be also helpful for users + # Otherwise, we can mock transform._get_params + torch.manual_seed(12) + _ = transform(inpt) + torch.manual_seed(12) + torch.rand(1) # random apply changes random state + params = transform._get_params(inpt) + + fn.assert_called_once_with(inpt, **params) + + +class TestRandomRotation: + def test_assertions(self): + with pytest.raises(ValueError, match="is a single number, it must be positive"): + transforms.RandomRotation(-0.7) + + for d in [[-0.7], [-0.7, 0, 0.7]]: + with pytest.raises(ValueError, match="degrees should be a sequence of length 2"): + transforms.RandomRotation(d) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomRotation(12, fill="abc") + + with pytest.raises(TypeError, match="center should be a sequence of length"): + transforms.RandomRotation(12, center=12) + + with pytest.raises(ValueError, match="center should be a sequence of length"): + transforms.RandomRotation(12, center=[1, 2, 3]) + + def test__get_params(self): + angle_bound = 34 + transform = transforms.RandomRotation(angle_bound) + + params = transform._get_params(None) + assert -angle_bound <= params["angle"] <= angle_bound + + angle_bounds = [12, 34] + transform = transforms.RandomRotation(angle_bounds) + + params = transform._get_params(None) + assert angle_bounds[0] <= params["angle"] <= angle_bounds[1] + + @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) + @pytest.mark.parametrize("center", [None, [2.0, 3.0]]) + def test__transform(self, degrees, expand, fill, center, mocker): + interpolation = InterpolationMode.BILINEAR + transform = transforms.RandomRotation( + degrees, interpolation=interpolation, expand=expand, fill=fill, center=center + ) + + if isinstance(degrees, (tuple, list)): + assert transform.degrees == [float(degrees[0]), float(degrees[1])] + else: + assert transform.degrees == [float(-degrees), float(degrees)] + + fn = mocker.patch("torchvision.prototype.transforms.functional.rotate") + inpt = mocker.MagicMock(spec=features.Image) + # vfdev-5, Feature Request: let's store params as Transform attribute + # This could be also helpful for users + # Otherwise, we can mock transform._get_params + torch.manual_seed(12) + _ = transform(inpt) + torch.manual_seed(12) + params = transform._get_params(inpt) + + fn.assert_called_once_with(inpt, **params, interpolation=interpolation, expand=expand, fill=fill, center=center) + + @pytest.mark.parametrize("angle", [34, -87]) + @pytest.mark.parametrize("expand", [False, True]) + def test_boundingbox_image_size(self, angle, expand): + # Specific test for BoundingBox.rotate + bbox = features.BoundingBox( + torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, image_size=(32, 32) + ) + img = features.Image(torch.rand(1, 3, 32, 32)) + + out_img = img.rotate(angle, expand=expand) + out_bbox = bbox.rotate(angle, expand=expand) + + assert out_img.image_size == out_bbox.image_size + + +class TestRandomAffine: + def test_assertions(self): + with 
pytest.raises(ValueError, match="is a single number, it must be positive"): + transforms.RandomAffine(-0.7) + + for d in [[-0.7], [-0.7, 0, 0.7]]: + with pytest.raises(ValueError, match="degrees should be a sequence of length 2"): + transforms.RandomAffine(d) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(12, fill="abc") + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(12, fill="abc") + + for kwargs in [ + {"center": 12}, + {"translate": 12}, + {"scale": 12}, + ]: + with pytest.raises(TypeError, match="should be a sequence of length"): + transforms.RandomAffine(12, **kwargs) + + for kwargs in [{"center": [1, 2, 3]}, {"translate": [1, 2, 3]}, {"scale": [1, 2, 3]}]: + with pytest.raises(ValueError, match="should be a sequence of length"): + transforms.RandomAffine(12, **kwargs) + + with pytest.raises(ValueError, match="translation values should be between 0 and 1"): + transforms.RandomAffine(12, translate=[-1.0, 2.0]) + + with pytest.raises(ValueError, match="scale values should be positive"): + transforms.RandomAffine(12, scale=[-1.0, 2.0]) + + with pytest.raises(ValueError, match="is a single number, it must be positive"): + transforms.RandomAffine(12, shear=-10) + + for s in [[-0.7], [-0.7, 0, 0.7]]: + with pytest.raises(ValueError, match="shear should be a sequence of length 2"): + transforms.RandomAffine(12, shear=s) + + @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) + @pytest.mark.parametrize("translate", [None, [0.1, 0.2]]) + @pytest.mark.parametrize("scale", [None, [0.7, 1.2]]) + @pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]]) + def test__get_params(self, degrees, translate, scale, shear, mocker): + image = mocker.MagicMock(spec=features.Image) + image.num_channels = 3 + image.image_size = (24, 32) + h, w = image.image_size + + transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear) + params = transform._get_params(image) + + if not isinstance(degrees, (list, tuple)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + if translate is not None: + w_max = int(round(translate[0] * w)) + h_max = int(round(translate[1] * h)) + assert -w_max <= params["translations"][0] <= w_max + assert -h_max <= params["translations"][1] <= h_max + else: + assert params["translations"] == (0, 0) + + if scale is not None: + assert scale[0] <= params["scale"] <= scale[1] + else: + assert params["scale"] == 1.0 + + if shear is not None: + if isinstance(shear, float): + assert -shear <= params["shear"][0] <= shear + assert params["shear"][1] == 0.0 + elif len(shear) == 2: + assert shear[0] <= params["shear"][0] <= shear[1] + assert params["shear"][1] == 0.0 + else: + assert shear[0] <= params["shear"][0] <= shear[1] + assert shear[2] <= params["shear"][1] <= shear[3] + else: + assert params["shear"] == (0, 0) + + @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) + @pytest.mark.parametrize("translate", [None, [0.1, 0.2]]) + @pytest.mark.parametrize("scale", [None, [0.7, 1.2]]) + @pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]]) + @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) + @pytest.mark.parametrize("center", [None, [2.0, 3.0]]) + def test__transform(self, degrees, translate, scale, shear, fill, center, mocker): + interpolation = InterpolationMode.BILINEAR + transform = transforms.RandomAffine( + degrees, + 
translate=translate, + scale=scale, + shear=shear, + interpolation=interpolation, + fill=fill, + center=center, + ) + + if isinstance(degrees, (tuple, list)): + assert transform.degrees == [float(degrees[0]), float(degrees[1])] + else: + assert transform.degrees == [float(-degrees), float(degrees)] + + fn = mocker.patch("torchvision.prototype.transforms.functional.affine") + inpt = mocker.MagicMock(spec=features.Image) + inpt.num_channels = 3 + inpt.image_size = (24, 32) + + # vfdev-5, Feature Request: let's store params as Transform attribute + # This could be also helpful for users + # Otherwise, we can mock transform._get_params + torch.manual_seed(12) + _ = transform(inpt) + torch.manual_seed(12) + params = transform._get_params(inpt) + + fn.assert_called_once_with(inpt, **params, interpolation=interpolation, fill=fill, center=center) + + +class TestRandomCrop: + def test_assertions(self): + with pytest.raises(ValueError, match="Please provide only two dimensions"): + transforms.RandomCrop([10, 12, 14]) + + with pytest.raises(TypeError, match="Got inappropriate padding arg"): + transforms.RandomCrop([10, 12], padding="abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomCrop([10, 12], padding=1, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.RandomCrop([10, 12], padding=1, padding_mode="abc") + + def test__get_params(self, mocker): + image = mocker.MagicMock(spec=features.Image) + image.num_channels = 3 + image.image_size = (24, 32) + h, w = image.image_size + + transform = transforms.RandomCrop([10, 10]) + params = transform._get_params(image) + + assert 0 <= params["top"] <= h - transform.size[0] + 1 + assert 0 <= params["left"] <= w - transform.size[1] + 1 + assert params["height"] == 10 + assert params["width"] == 10 + + @pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]]) + @pytest.mark.parametrize("pad_if_needed", [False, True]) + @pytest.mark.parametrize("fill", [False, True]) + @pytest.mark.parametrize("padding_mode", ["constant", "edge"]) + def test_forward(self, padding, pad_if_needed, fill, padding_mode, mocker): + output_size = [10, 12] + transform = transforms.RandomCrop( + output_size, padding=padding, pad_if_needed=pad_if_needed, fill=fill, padding_mode=padding_mode + ) + + inpt = mocker.MagicMock(spec=features.Image) + inpt.num_channels = 3 + inpt.image_size = (32, 32) + + expected = mocker.MagicMock(spec=features.Image) + expected.num_channels = 3 + if isinstance(padding, int): + expected.image_size = (inpt.image_size[0] + padding, inpt.image_size[1] + padding) + elif isinstance(padding, list): + expected.image_size = ( + inpt.image_size[0] + sum(padding[0::2]), + inpt.image_size[1] + sum(padding[1::2]), + ) + else: + expected.image_size = inpt.image_size + _ = mocker.patch("torchvision.prototype.transforms.functional.pad", return_value=expected) + fn_crop = mocker.patch("torchvision.prototype.transforms.functional.crop") + + # vfdev-5, Feature Request: let's store params as Transform attribute + # This could be also helpful for users + # Otherwise, we can mock transform._get_params + torch.manual_seed(12) + _ = transform(inpt) + torch.manual_seed(12) + if padding is None and not pad_if_needed: + params = transform._get_params(inpt) + fn_crop.assert_called_once_with( + inpt, top=params["top"], left=params["left"], 
height=output_size[0], width=output_size[1] + ) + elif not pad_if_needed: + params = transform._get_params(expected) + fn_crop.assert_called_once_with( + expected, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1] + ) + elif padding is None: + # vfdev-5: I do not know how to mock and test this case + pass + else: + # vfdev-5: I do not know how to mock and test this case + pass + + +class TestGaussianBlur: + def test_assertions(self): + with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"): + transforms.GaussianBlur([10, 12, 14]) + + with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"): + transforms.GaussianBlur(4) + + with pytest.raises(TypeError, match="sigma should be a single float or a list/tuple with length 2"): + transforms.GaussianBlur(3, sigma=[1, 2, 3]) + + with pytest.raises(ValueError, match="If sigma is a single number, it must be positive"): + transforms.GaussianBlur(3, sigma=-1.0) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=[2.0, 1.0]) + + @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0]]) + def test__get_params(self, sigma): + transform = transforms.GaussianBlur(3, sigma=sigma) + params = transform._get_params(None) + + if isinstance(sigma, float): + assert params["sigma"][0] == params["sigma"][1] == 10 + else: + assert sigma[0] <= params["sigma"][0] <= sigma[1] + assert sigma[0] <= params["sigma"][1] <= sigma[1] + + @pytest.mark.parametrize("kernel_size", [3, [3, 5], (5, 3)]) + @pytest.mark.parametrize("sigma", [2.0, [2.0, 3.0]]) + def test__transform(self, kernel_size, sigma, mocker): + transform = transforms.GaussianBlur(kernel_size=kernel_size, sigma=sigma) + + if isinstance(kernel_size, (tuple, list)): + assert transform.kernel_size == kernel_size + else: + assert transform.kernel_size == (kernel_size, kernel_size) + + if isinstance(sigma, (tuple, list)): + assert transform.sigma == sigma + else: + assert transform.sigma == (sigma, sigma) + + fn = mocker.patch("torchvision.prototype.transforms.functional.gaussian_blur") + inpt = mocker.MagicMock(spec=features.Image) + inpt.num_channels = 3 + inpt.image_size = (24, 32) + + # vfdev-5, Feature Request: let's store params as Transform attribute + # This could be also helpful for users + # Otherwise, we can mock transform._get_params + torch.manual_seed(12) + _ = transform(inpt) + torch.manual_seed(12) + params = transform._get_params(inpt) + + fn.assert_called_once_with(inpt, **params) + + +class TestRandomColorOp: + @pytest.mark.parametrize("p", [0.0, 1.0]) + @pytest.mark.parametrize( + "transform_cls, func_op_name, kwargs", + [ + (transforms.RandomEqualize, "equalize", {}), + (transforms.RandomInvert, "invert", {}), + (transforms.RandomAutocontrast, "autocontrast", {}), + (transforms.RandomPosterize, "posterize", {"bits": 4}), + (transforms.RandomSolarize, "solarize", {"threshold": 0.5}), + (transforms.RandomAdjustSharpness, "adjust_sharpness", {"sharpness_factor": 0.5}), + ], + ) + def test__transform(self, p, transform_cls, func_op_name, kwargs, mocker): + transform = transform_cls(p=p, **kwargs) + + fn = mocker.patch(f"torchvision.prototype.transforms.functional.{func_op_name}") + inpt = mocker.MagicMock(spec=features.Image) + _ = transform(inpt) + if p > 0.0: + fn.assert_called_once_with(inpt, **kwargs) + else: + assert fn.call_count == 0 + + +class TestRandomPerspective: + def test_assertions(self): + with
pytest.raises(ValueError, match="Argument distortion_scale value should be between 0 and 1"): + transforms.RandomPerspective(distortion_scale=-1.0) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomPerspective(0.5, fill="abc") + + def test__get_params(self, mocker): + dscale = 0.5 + transform = transforms.RandomPerspective(dscale) + image = mocker.MagicMock(spec=features.Image) + image.num_channels = 3 + image.image_size = (24, 32) + + params = transform._get_params(image) + + h, w = image.image_size + assert len(params["startpoints"]) == 4 + for x, y in params["startpoints"]: + assert x in (0, w - 1) + assert y in (0, h - 1) + + assert len(params["endpoints"]) == 4 + for (x, y), name in zip(params["endpoints"], ["tl", "tr", "br", "bl"]): + if "t" in name: + assert 0 <= y <= int(dscale * h // 2), (x, y, name) + if "b" in name: + assert h - int(dscale * h // 2) - 1 <= y <= h, (x, y, name) + if "l" in name: + assert 0 <= x <= int(dscale * w // 2), (x, y, name) + if "r" in name: + assert w - int(dscale * w // 2) - 1 <= x <= w, (x, y, name) + + @pytest.mark.parametrize("distortion_scale", [0.1, 0.7]) + def test__transform(self, distortion_scale, mocker): + interpolation = InterpolationMode.BILINEAR + fill = 12 + transform = transforms.RandomPerspective(distortion_scale, fill=fill, interpolation=interpolation) + + fn = mocker.patch("torchvision.prototype.transforms.functional.perspective") + inpt = mocker.MagicMock(spec=features.Image) + inpt.num_channels = 3 + inpt.image_size = (24, 32) + # vfdev-5, Feature Request: let's store params as Transform attribute + # This could be also helpful for users + # Otherwise, we can mock transform._get_params + torch.manual_seed(12) + _ = transform(inpt) + torch.manual_seed(12) + torch.rand(1) # random apply changes random state + params = transform._get_params(inpt) + + fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) + + +class TestElasticTransform: + def test_assertions(self): + + with pytest.raises(TypeError, match="alpha should be float or a sequence of floats"): + transforms.ElasticTransform({}) + + with pytest.raises(ValueError, match="alpha is a sequence its length should be one of 2"): + transforms.ElasticTransform([1.0, 2.0, 3.0]) + + with pytest.raises(ValueError, match="alpha should be a sequence of floats"): + transforms.ElasticTransform([1, 2]) + + with pytest.raises(TypeError, match="sigma should be float or a sequence of floats"): + transforms.ElasticTransform(1.0, {}) + + with pytest.raises(ValueError, match="sigma is a sequence its length should be one of 2"): + transforms.ElasticTransform(1.0, [1.0, 2.0, 3.0]) + + with pytest.raises(ValueError, match="sigma should be a sequence of floats"): + transforms.ElasticTransform(1.0, [1, 2]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.ElasticTransform(1.0, 2.0, fill="abc") + + def test__get_params(self, mocker): + alpha = 2.0 + sigma = 3.0 + transform = transforms.ElasticTransform(alpha, sigma) + image = mocker.MagicMock(spec=features.Image) + image.num_channels = 3 + image.image_size = (24, 32) + + params = transform._get_params(image) + + h, w = image.image_size + displacement = params["displacement"] + assert displacement.shape == (1, h, w, 2) + assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() + assert (-alpha / h <= displacement[0, ..., 1]).all() and (displacement[0, ..., 1] <= alpha / h).all() + + 
@pytest.mark.parametrize("alpha", [5.0, [5.0, 10.0]]) + @pytest.mark.parametrize("sigma", [2.0, [2.0, 5.0]]) + def test__transform(self, alpha, sigma, mocker): + interpolation = InterpolationMode.BILINEAR + fill = 12 + transform = transforms.ElasticTransform(alpha, sigma=sigma, fill=fill, interpolation=interpolation) + + if isinstance(alpha, float): + assert transform.alpha == [alpha, alpha] + else: + assert transform.alpha == alpha + + if isinstance(sigma, float): + assert transform.sigma == [sigma, sigma] + else: + assert transform.sigma == sigma + + fn = mocker.patch("torchvision.prototype.transforms.functional.elastic") + inpt = mocker.MagicMock(spec=features.Image) + inpt.num_channels = 3 + inpt.image_size = (24, 32) + + # Let's mock transform._get_params to control the output: + transform._get_params = mocker.MagicMock() + _ = transform(inpt) + params = transform._get_params(inpt) + fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) + + +class TestRandomErasing: + def test_assertions(self, mocker): + with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): + transforms.RandomErasing(value={}) + + with pytest.raises(ValueError, match="If value is str, it should be 'random'"): + transforms.RandomErasing(value="abc") + + with pytest.raises(TypeError, match="Scale should be a sequence"): + transforms.RandomErasing(scale=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence"): + transforms.RandomErasing(ratio=123) + + with pytest.raises(ValueError, match="Scale should be between 0 and 1"): + transforms.RandomErasing(scale=[-1, 2]) + + image = mocker.MagicMock(spec=features.Image) + image.num_channels = 3 + image.image_size = (24, 32) + + transform = transforms.RandomErasing(value=[1, 2, 3, 4]) + + with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): + transform._get_params(image) + + @pytest.mark.parametrize("value", [5.0, [1, 2, 3], "random"]) + def test__get_params(self, value, mocker): + image = mocker.MagicMock(spec=features.Image) + image.num_channels = 3 + image.image_size = (24, 32) + + transform = transforms.RandomErasing(value=value) + params = transform._get_params(image) + + v = params["v"] + h, w = params["h"], params["w"] + i, j = params["i"], params["j"] + assert isinstance(v, torch.Tensor) + if value == "random": + assert v.shape == (image.num_channels, h, w) + elif isinstance(value, (int, float)): + assert v.shape == (1, 1, 1) + elif isinstance(value, (list, tuple)): + assert v.shape == (image.num_channels, 1, 1) + + assert 0 <= i <= image.image_size[0] - h + assert 0 <= j <= image.image_size[1] - w + + @pytest.mark.parametrize("p", [0.0, 1.0]) + @pytest.mark.parametrize( + "inpt_type", + [ + (torch.Tensor, {"shape": (3, 24, 32)}), + (PIL.Image.Image, {"size": (24, 32), "mode": "RGB"}), + ], + ) + def test__transform(self, p, inpt_type, mocker): + value = 1.0 + transform = transforms.RandomErasing(p=p, value=value) + + inpt = mocker.MagicMock(spec=inpt_type[0], **inpt_type[1]) + erase_image_tensor_inpt = inpt + fn = mocker.patch( + "torchvision.prototype.transforms.functional.erase_image_tensor", + return_value=mocker.MagicMock(spec=torch.Tensor), + ) + if inpt_type[0] == PIL.Image.Image: + erase_image_tensor_inpt = mocker.MagicMock(spec=torch.Tensor) + + # vfdev-5: I do not know how to patch pil_to_tensor if it is already imported + # TODO: patch pil_to_tensor and run below checks for PIL.Image.Image inputs + if p > 0.0: + return 
+ + mocker.patch( + "torchvision.transforms.functional.pil_to_tensor", + return_value=erase_image_tensor_inpt, + ) + mocker.patch( + "torchvision.transforms.functional.to_pil_image", + return_value=mocker.MagicMock(spec=PIL.Image.Image), + ) + + # Let's mock transform._get_params to control the output: + transform._get_params = mocker.MagicMock() + output = transform(inpt) + assert isinstance(output, inpt_type[0]) + params = transform._get_params(inpt) + if p > 0.0: + fn.assert_called_once_with(erase_image_tensor_inpt, **params) + else: + assert fn.call_count == 0 diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py index e369dad6271..5f105b3f6e2 100644 --- a/test/test_prototype_transforms_functional.py +++ b/test/test_prototype_transforms_functional.py @@ -1,6 +1,7 @@ import functools import itertools import math +import os import numpy as np import pytest @@ -58,7 +59,7 @@ def make_images( yield make_image(size, color_space=color_space, dtype=dtype) for color_space, dtype, extra_dims_ in itertools.product(color_spaces, dtypes, extra_dims): - yield make_image(color_space=color_space, extra_dims=extra_dims_, dtype=dtype) + yield make_image(size=sizes[0], color_space=color_space, extra_dims=extra_dims_, dtype=dtype) def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): @@ -148,12 +149,12 @@ def make_segmentation_mask(size=None, *, num_categories=80, extra_dims=(), dtype def make_segmentation_masks( - image_sizes=((16, 16), (7, 33), (31, 9)), + sizes=((16, 16), (7, 33), (31, 9)), dtypes=(torch.long,), extra_dims=((), (4,), (2, 3)), ): - for image_size, dtype, extra_dims_ in itertools.product(image_sizes, dtypes, extra_dims): - yield make_segmentation_mask(size=image_size, dtype=dtype, extra_dims=extra_dims_) + for size, dtype, extra_dims_ in itertools.product(sizes, dtypes, extra_dims): + yield make_segmentation_mask(size=size, dtype=dtype, extra_dims=extra_dims_) class SampleInput: @@ -199,6 +200,30 @@ def horizontal_flip_bounding_box(): yield SampleInput(bounding_box, format=bounding_box.format, image_size=bounding_box.image_size) +@register_kernel_info_from_sample_inputs_fn +def horizontal_flip_segmentation_mask(): + for mask in make_segmentation_masks(): + yield SampleInput(mask) + + +@register_kernel_info_from_sample_inputs_fn +def vertical_flip_image_tensor(): + for image in make_images(): + yield SampleInput(image) + + +@register_kernel_info_from_sample_inputs_fn +def vertical_flip_bounding_box(): + for bounding_box in make_bounding_boxes(formats=[features.BoundingBoxFormat.XYXY]): + yield SampleInput(bounding_box, format=bounding_box.format, image_size=bounding_box.image_size) + + +@register_kernel_info_from_sample_inputs_fn +def vertical_flip_segmentation_mask(): + for mask in make_segmentation_masks(): + yield SampleInput(mask) + + @register_kernel_info_from_sample_inputs_fn def resize_image_tensor(): for image, interpolation, max_size, antialias in itertools.product( @@ -403,9 +428,17 @@ def crop_segmentation_mask(): @register_kernel_info_from_sample_inputs_fn -def vertical_flip_segmentation_mask(): - for mask in make_segmentation_masks(): - yield SampleInput(mask) +def resized_crop_image_tensor(): + for mask, top, left, height, width, size, antialias in itertools.product( + make_images(), + [-8, 9], + [-8, 9], + [12], + [12], + [(16, 18)], + [True, False], + ): + yield SampleInput(mask, top=top, left=left, height=height, width=width, size=size, antialias=antialias) @register_kernel_info_from_sample_inputs_fn @@
-456,6 +489,19 @@ def pad_bounding_box(): yield SampleInput(bounding_box, padding=padding, format=bounding_box.format) +@register_kernel_info_from_sample_inputs_fn +def perspective_image_tensor(): + for image, perspective_coeffs, fill in itertools.product( + make_images(extra_dims=((), (4,))), + [ + [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], + [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], + ], + [None, [128], [12.0]], # fill + ): + yield SampleInput(image, perspective_coeffs=perspective_coeffs, fill=fill) + + @register_kernel_info_from_sample_inputs_fn def perspective_bounding_box(): for bounding_box, perspective_coeffs in itertools.product( @@ -487,13 +533,47 @@ def perspective_segmentation_mask(): ) +@register_kernel_info_from_sample_inputs_fn +def elastic_image_tensor(): + for image, fill in itertools.product( + make_images(extra_dims=((), (4,))), + [None, [128], [12.0]], # fill + ): + h, w = image.shape[-2:] + displacement = torch.rand(1, h, w, 2) + yield SampleInput(image, displacement=displacement, fill=fill) + + +@register_kernel_info_from_sample_inputs_fn +def elastic_bounding_box(): + for bounding_box in make_bounding_boxes(): + h, w = bounding_box.image_size + displacement = torch.rand(1, h, w, 2) + yield SampleInput( + bounding_box, + format=bounding_box.format, + displacement=displacement, + ) + + +@register_kernel_info_from_sample_inputs_fn +def elastic_segmentation_mask(): + for mask in make_segmentation_masks(extra_dims=((), (4,))): + h, w = mask.shape[-2:] + displacement = torch.rand(1, h, w, 2) + yield SampleInput( + mask, + displacement=displacement, + ) + + @register_kernel_info_from_sample_inputs_fn def center_crop_image_tensor(): - for image, output_size in itertools.product( + for mask, output_size in itertools.product( make_images(sizes=((16, 16), (7, 33), (31, 9))), [[4, 3], [42, 70], [4]], # crop sizes < image sizes, crop_sizes > image sizes, single crop size ): - yield SampleInput(image, output_size) + yield SampleInput(mask, output_size) @register_kernel_info_from_sample_inputs_fn @@ -507,12 +587,80 @@ def center_crop_bounding_box(): @register_kernel_info_from_sample_inputs_fn def center_crop_segmentation_mask(): for mask, output_size in itertools.product( - make_segmentation_masks(image_sizes=((16, 16), (7, 33), (31, 9))), + make_segmentation_masks(sizes=((16, 16), (7, 33), (31, 9))), [[4, 3], [42, 70], [4]], # crop sizes < image sizes, crop_sizes > image sizes, single crop size ): yield SampleInput(mask, output_size) +@register_kernel_info_from_sample_inputs_fn +def gaussian_blur_image_tensor(): + for image, kernel_size, sigma in itertools.product( + make_images(extra_dims=((4,),)), + [[3, 3]], + [None, [3.0, 3.0]], + ): + yield SampleInput(image, kernel_size=kernel_size, sigma=sigma) + + +@register_kernel_info_from_sample_inputs_fn +def equalize_image_tensor(): + for image in make_images(extra_dims=(), color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)): + if image.dtype != torch.uint8: + continue + yield SampleInput(image) + + +@register_kernel_info_from_sample_inputs_fn +def invert_image_tensor(): + for image in make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)): + yield SampleInput(image) + + +@register_kernel_info_from_sample_inputs_fn +def posterize_image_tensor(): + for image, bits in itertools.product( + make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)), + [1, 4, 8], + ): + if image.dtype != torch.uint8: + continue + yield 
SampleInput(image, bits=bits) + + +@register_kernel_info_from_sample_inputs_fn +def solarize_image_tensor(): + for image, threshold in itertools.product( + make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)), + [0.1, 0.5, 127.0], + ): + if image.is_floating_point() and threshold > 1.0: + continue + yield SampleInput(image, threshold=threshold) + + +@register_kernel_info_from_sample_inputs_fn +def autocontrast_image_tensor(): + for image in make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)): + yield SampleInput(image) + + +@register_kernel_info_from_sample_inputs_fn +def adjust_sharpness_image_tensor(): + for image, sharpness_factor in itertools.product( + make_images(extra_dims=((4,),), color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)), + [0.1, 0.5], + ): + yield SampleInput(image, sharpness_factor=sharpness_factor) + + +@register_kernel_info_from_sample_inputs_fn +def erase_image_tensor(): + for image in make_images(): + c = image.shape[-3] + yield SampleInput(image, i=1, j=2, h=6, w=7, v=torch.rand(c, 6, 7)) + + @pytest.mark.parametrize( "kernel", [ @@ -546,9 +694,19 @@ def test_scriptable(kernel): and all( feature_type not in name for feature_type in {"image", "segmentation_mask", "bounding_box", "label", "pil"} ) - and name not in {"to_image_tensor", "InterpolationMode", "decode_video_with_av", "crop", "rotate"} + and name + not in { + "to_image_tensor", + "InterpolationMode", + "decode_video_with_av", + "crop", + "perspective", + "elastic_transform", + "elastic", + } # We skip 'crop' due to missing 'height' and 'width' - # We skip 'rotate' due to non implemented yet expand=True case for bboxes + # We skip 'perspective' as it requires different input args than perspective_image_tensor etc + # Skip 'elastic', TODO: inspect why test is failing ], ) def test_functional_mid_level(func): @@ -561,7 +719,9 @@ def test_functional_mid_level(func): if key in kwargs: del kwargs[key] output = func(*sample_input.args, **kwargs) - torch.testing.assert_close(output, expected, msg=f"finfo={finfo}, output={output}, expected={expected}") + torch.testing.assert_close( + output, expected, msg=f"finfo={finfo.name}, output={output}, expected={expected}" + ) break @@ -844,6 +1004,9 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_): out_bbox[2] -= tr_x out_bbox[3] -= tr_y + # image_size should be updated, but it is OK here to skip its computation + # as we do not compute it in F.rotate_bounding_box + out_bbox = features.BoundingBox( out_bbox, format=features.BoundingBoxFormat.XYXY, @@ -1126,6 +1289,18 @@ def _compute_expected_mask(mask, top_, left_, height_, width_): torch.testing.assert_close(output_mask, expected_mask) +@pytest.mark.parametrize("device", cpu_and_gpu()) +def test_correctness_horizontal_flip_segmentation_mask_on_fixed_input(device): + mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) + mask[:, :, 0] = 1 + + out_mask = F.horizontal_flip_segmentation_mask(mask) + + expected_mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) + expected_mask[:, :, -1] = 1 + torch.testing.assert_close(out_mask, expected_mask) + + @pytest.mark.parametrize("device", cpu_and_gpu()) def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device): mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) @@ -1565,3 +1740,102 @@ def _compute_expected_segmentation_mask(mask, output_size): expected = _compute_expected_segmentation_mask(mask, output_size) torch.testing.assert_close(expected, actual) + 
+ +# Copied from test/test_functional_tensor.py +@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("image_size", ("small", "large")) +@pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) +@pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) +@pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) +def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, sigma): + fn = F.gaussian_blur_image_tensor + + # true_cv2_results = { + # # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3)) + # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8) + # "3_3_0.8": ... + # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.5) + # "3_3_0.5": ... + # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.8) + # "3_5_0.8": ... + # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.5) + # "3_5_0.5": ... + # # np_img2 = np.arange(26 * 28, dtype="uint8").reshape((26, 28)) + # # cv2.GaussianBlur(np_img2, ksize=(23, 23), sigmaX=1.7) + # "23_23_1.7": ... + # } + p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") + true_cv2_results = torch.load(p) + + if image_size == "small": + tensor = ( + torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) + ) + else: + tensor = torch.from_numpy(np.arange(26 * 28, dtype="uint8").reshape((1, 26, 28))).to(device) + + if dt == torch.float16 and device == "cpu": + # skip float16 on CPU case + return + + if dt is not None: + tensor = tensor.to(dtype=dt) + + _ksize = (ksize, ksize) if isinstance(ksize, int) else ksize + _sigma = sigma[0] if sigma is not None else None + shape = tensor.shape + gt_key = f"{shape[-2]}_{shape[-1]}_{shape[-3]}__{_ksize[0]}_{_ksize[1]}_{_sigma}" + if gt_key not in true_cv2_results: + return + + true_out = ( + torch.tensor(true_cv2_results[gt_key]).reshape(shape[-2], shape[-1], shape[-3]).permute(2, 0, 1).to(tensor) + ) + + image = features.Image(tensor) + + out = fn(image, kernel_size=ksize, sigma=sigma) + torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") + + +@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize( + "fn, make_samples", [(F.elastic_image_tensor, make_images), (F.elastic_segmentation_mask, make_segmentation_masks)] +) +def test_correctness_elastic_image_or_mask_tensor(device, fn, make_samples): + in_box = [10, 15, 25, 35] + for sample in make_samples(sizes=((64, 76),), extra_dims=((), (4,))): + c, h, w = sample.shape[-3:] + # Setup a dummy image with 4 points + sample[..., in_box[1], in_box[0]] = torch.tensor([12, 34, 96, 112])[:c] + sample[..., in_box[3] - 1, in_box[0]] = torch.tensor([12, 34, 96, 112])[:c] + sample[..., in_box[3] - 1, in_box[2] - 1] = torch.tensor([12, 34, 96, 112])[:c] + sample[..., in_box[1], in_box[2] - 1] = torch.tensor([12, 34, 96, 112])[:c] + sample = sample.to(device) + + if fn == F.elastic_image_tensor: + sample = features.Image(sample) + kwargs = {"interpolation": F.InterpolationMode.NEAREST} + else: + sample = features.SegmentationMask(sample) + kwargs = {} + + # Create a displacement grid using sin + n, m = 5.0, 0.1 + d1 = m * torch.sin(torch.arange(h, dtype=torch.float) * torch.pi * n / h) + d2 = m * torch.sin(torch.arange(w, dtype=torch.float) * torch.pi * n / w) + + d1 = d1[:, None].expand((h, w)) + d2 = d2[None, :].expand((h, w)) + + displacement = torch.cat([d1[..., None], d2[..., None]], dim=-1) + displacement = 
displacement.reshape(1, h, w, 2) + + output = fn(sample, displacement=displacement, **kwargs) + + # Check places where transformed points should be + torch.testing.assert_close(output[..., 12, 9], sample[..., in_box[1], in_box[0]]) + torch.testing.assert_close(output[..., 17, 27], sample[..., in_box[1], in_box[2] - 1]) + torch.testing.assert_close(output[..., 31, 6], sample[..., in_box[3] - 1, in_box[0]]) + torch.testing.assert_close(output[..., 37, 23], sample[..., in_box[3] - 1, in_box[2] - 1]) diff --git a/torchvision/prototype/features/_bounding_box.py b/torchvision/prototype/features/_bounding_box.py index c704954c03f..54e1315c9ab 100644 --- a/torchvision/prototype/features/_bounding_box.py +++ b/torchvision/prototype/features/_bounding_box.py @@ -5,6 +5,8 @@ import torch from torchvision._utils import StrEnum from torchvision.transforms import InterpolationMode +from torchvision.transforms.functional import _get_inverse_affine_matrix +from torchvision.transforms.functional_tensor import _compute_output_size from ._feature import _Feature @@ -168,10 +170,18 @@ def rotate( output = _F.rotate_bounding_box( self, format=self.format, image_size=self.image_size, angle=angle, expand=expand, center=center ) - # TODO: update output image size if expand is True + image_size = self.image_size if expand: - raise RuntimeError("Not yet implemented") - return BoundingBox.new_like(self, output, dtype=output.dtype) + # The way we recompute image_size is not optimal due to redundant computations of + # - rotation matrix (_get_inverse_affine_matrix) + # - points dot matrix (_compute_output_size) + # Alternatively, we could return new image size by _F.rotate_bounding_box + height, width = image_size + rotation_matrix = _get_inverse_affine_matrix([0.0, 0.0], angle, [0.0, 0.0], 1.0, [0.0, 0.0]) + new_width, new_height = _compute_output_size(rotation_matrix, width, height) + image_size = (new_height, new_width) + + return BoundingBox.new_like(self, output, dtype=output.dtype, image_size=image_size) def affine( self, @@ -207,3 +217,14 @@ def perspective( output = _F.perspective_bounding_box(self, self.format, perspective_coeffs) return BoundingBox.new_like(self, output, dtype=output.dtype) + + def elastic( + self, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None, + ) -> BoundingBox: + from torchvision.prototype.transforms import functional as _F + + output = _F.elastic_bounding_box(self, self.format, displacement) + return BoundingBox.new_like(self, output, dtype=output.dtype) diff --git a/torchvision/prototype/features/_feature.py b/torchvision/prototype/features/_feature.py index 85f758c638c..8ccbfda57e0 100644 --- a/torchvision/prototype/features/_feature.py +++ b/torchvision/prototype/features/_feature.py @@ -157,6 +157,14 @@ def perspective( ) -> Any: return self + def elastic( + self, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None, + ) -> Any: + return self + def adjust_brightness(self, brightness_factor: float) -> Any: return self @@ -189,3 +197,6 @@ def equalize(self) -> Any: def invert(self) -> Any: return self + + def gaussian_blur(self, kernel_size: List[int], sigma: Optional[List[float]] = None) -> Any: + return self diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py index 70b93478d17..303486f98ba 100644 --- 
a/torchvision/prototype/features/_image.py +++ b/torchvision/prototype/features/_image.py @@ -74,7 +74,7 @@ def new_like( @property def image_size(self) -> Tuple[int, int]: - return cast(Tuple[int, int], self.shape[-2:]) + return cast(Tuple[int, int], tuple(self.shape[-2:])) @property def num_channels(self) -> int: @@ -243,6 +243,19 @@ def perspective( output = _F.perspective_image_tensor(self, perspective_coeffs, interpolation=interpolation, fill=fill) return Image.new_like(self, output) + def elastic( + self, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None, + ) -> Image: + from torchvision.prototype.transforms.functional import _geometry as _F + + fill = _F._convert_fill_arg(fill) + + output = _F.elastic_image_tensor(self, displacement, interpolation=interpolation, fill=fill) + return Image.new_like(self, output) + def adjust_brightness(self, brightness_factor: float) -> Image: from torchvision.prototype.transforms import functional as _F @@ -308,3 +321,9 @@ def invert(self) -> Image: output = _F.invert_image_tensor(self) return Image.new_like(self, output) + + def gaussian_blur(self, kernel_size: List[int], sigma: Optional[List[float]] = None) -> Image: + from torchvision.prototype.transforms import functional as _F + + output = _F.gaussian_blur_image_tensor(self, kernel_size=kernel_size, sigma=sigma) + return Image.new_like(self, output) diff --git a/torchvision/prototype/features/_segmentation_mask.py b/torchvision/prototype/features/_segmentation_mask.py index fdb71358a8f..406e06aef11 100644 --- a/torchvision/prototype/features/_segmentation_mask.py +++ b/torchvision/prototype/features/_segmentation_mask.py @@ -2,6 +2,7 @@ from typing import List, Optional, Sequence, Union +import torch from torchvision.transforms import InterpolationMode from ._feature import _Feature @@ -119,3 +120,14 @@ def perspective( output = _F.perspective_segmentation_mask(self, perspective_coeffs) return SegmentationMask.new_like(self, output) + + def elastic( + self, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.NEAREST, + fill: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None, + ) -> SegmentationMask: + from torchvision.prototype.transforms import functional as _F + + output = _F.elastic_segmentation_mask(self, displacement) + return SegmentationMask.new_like(self, output, dtype=output.dtype) diff --git a/torchvision/prototype/transforms/__init__.py b/torchvision/prototype/transforms/__init__.py index 3f4299f6fb9..15abd4f77f2 100644 --- a/torchvision/prototype/transforms/__init__.py +++ b/torchvision/prototype/transforms/__init__.py @@ -4,15 +4,27 @@ from ._augment import RandomCutmix, RandomErasing, RandomMixup from ._auto_augment import AugMix, AutoAugment, RandAugment, TrivialAugmentWide -from ._color import ColorJitter, RandomEqualize, RandomPhotometricDistort +from ._color import ( + ColorJitter, + RandomAdjustSharpness, + RandomAutocontrast, + RandomEqualize, + RandomInvert, + RandomPhotometricDistort, + RandomPosterize, + RandomSolarize, +) from ._container import Compose, RandomApply, RandomChoice, RandomOrder from ._geometry import ( BatchMultiCrop, CenterCrop, + ElasticTransform, FiveCrop, Pad, RandomAffine, + RandomCrop, RandomHorizontalFlip, + RandomPerspective, RandomResizedCrop, RandomRotation, RandomVerticalFlip, @@ -21,7 +33,7 @@ TenCrop, ) from ._meta import ConvertBoundingBoxFormat, ConvertImageColorSpace, 
ConvertImageDtype -from ._misc import Identity, Lambda, Normalize, ToDtype +from ._misc import GaussianBlur, Identity, Lambda, Normalize, ToDtype from ._type_conversion import DecodeImage, LabelToOneHot from ._deprecated import Grayscale, RandomGrayscale, ToTensor, ToPILImage, PILToTensor # usort: skip diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index d1c3db816ad..12e2cd3cc6d 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -92,8 +92,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: return features.Image.new_like(inpt, output) return output elif isinstance(inpt, PIL.Image.Image): - # TODO: We should implement a fallback to tensor, like gaussian_blur etc - raise RuntimeError("Not implemented") + return F.erase_image_pil(inpt, **params) else: return inpt diff --git a/torchvision/prototype/transforms/_color.py b/torchvision/prototype/transforms/_color.py index e71be8b5934..bc29fe5b677 100644 --- a/torchvision/prototype/transforms/_color.py +++ b/torchvision/prototype/transforms/_color.py @@ -151,8 +151,42 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomEqualize(_RandomApplyTransform): - def __init__(self, p: float = 0.5): + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return F.equalize(inpt) + + +class RandomInvert(_RandomApplyTransform): + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return F.invert(inpt) + + +class RandomPosterize(_RandomApplyTransform): + def __init__(self, bits: int, p: float = 0.5) -> None: super().__init__(p=p) + self.bits = bits def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: - return F.equalize(inpt) + return F.posterize(inpt, bits=self.bits) + + +class RandomSolarize(_RandomApplyTransform): + def __init__(self, threshold: float, p: float = 0.5) -> None: + super().__init__(p=p) + self.threshold = threshold + + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return F.solarize(inpt, threshold=self.threshold) + + +class RandomAutocontrast(_RandomApplyTransform): + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return F.autocontrast(inpt) + + +class RandomAdjustSharpness(_RandomApplyTransform): + def __init__(self, sharpness_factor: float, p: float = 0.5) -> None: + super().__init__(p=p) + self.sharpness_factor = sharpness_factor + + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return F.adjust_sharpness(inpt, sharpness_factor=self.sharpness_factor) diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index 6c511635435..decdee06073 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -35,7 +35,8 @@ def __init__( antialias: Optional[bool] = None, ) -> None: super().__init__() - self.size = [size] if isinstance(size, int) else list(size) + + self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") self.interpolation = interpolation self.max_size = max_size self.antialias = antialias @@ -80,7 +81,6 @@ def __init__( if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): warnings.warn("Scale and ratio should be of kind (min, max)") - self.size = size self.scale = scale self.ratio = ratio self.interpolation = interpolation @@ -225,6 +225,21 @@ def _check_fill_arg(fill: Union[int, float, Sequence[int], Sequence[float]]) -> raise TypeError("Got 
inappropriate fill arg") +def _check_padding_arg(padding: Union[int, Sequence[int]]) -> None: + if not isinstance(padding, (numbers.Number, tuple, list)): + raise TypeError("Got inappropriate padding arg") + + if isinstance(padding, (tuple, list)) and len(padding) not in [1, 2, 4]: + raise ValueError(f"Padding must be an int or a 1, 2, or 4 element tuple, not a {len(padding)} element tuple") + + +# TODO: let's use torchvision._utils.StrEnum to have the best of both worlds (strings and enums) +# https://github.com/pytorch/vision/issues/6250 +def _check_padding_mode_arg(padding_mode: Literal["constant", "edge", "reflect", "symmetric"]) -> None: + if padding_mode not in ["constant", "edge", "reflect", "symmetric"]: + raise ValueError("Padding mode should be either constant, edge, reflect or symmetric") + + class Pad(Transform): def __init__( self, @@ -233,18 +248,10 @@ def __init__( padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant", ) -> None: super().__init__() - if not isinstance(padding, (numbers.Number, tuple, list)): - raise TypeError("Got inappropriate padding arg") + _check_padding_arg(padding) _check_fill_arg(fill) - - if padding_mode not in ["constant", "edge", "reflect", "symmetric"]: - raise ValueError("Padding mode should be either constant, edge, reflect or symmetric") - - if isinstance(padding, Sequence) and len(padding) not in [1, 2, 4]: - raise ValueError( - f"Padding must be an int or a 1, 2, or 4 element tuple, not a {len(padding)} element tuple" - ) + _check_padding_mode_arg(padding_mode) self.padding = padding self.fill = fill @@ -258,7 +265,7 @@ class RandomZoomOut(_RandomApplyTransform): def __init__( self, fill: Union[int, float, Sequence[int], Sequence[float]] = 0, - side_range: Tuple[float, float] = (1.0, 4.0), + side_range: Sequence[float] = (1.0, 4.0), p: float = 0.5, ) -> None: super().__init__(p=p) @@ -266,6 +273,8 @@ def __init__( _check_fill_arg(fill) self.fill = fill + _check_sequence_input(side_range, "side_range", req_sizes=(2,)) + self.side_range = side_range if side_range[0] < 1.0 or side_range[0] > side_range[1]: raise ValueError(f"Invalid canvas side range provided {side_range}.") @@ -285,6 +294,7 @@ def _get_params(self, sample: Any) -> Dict[str, Any]: bottom = canvas_height - (top + orig_h) padding = [left, top, right, bottom] + # vfdev-5: Can we put that into pad_image_tensor ? 
fill = self.fill if not isinstance(fill, collections.abc.Sequence): fill = [fill] * orig_c @@ -414,3 +424,203 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: fill=self.fill, center=self.center, ) + + +class RandomCrop(Transform): + def __init__( + self, + size: Union[int, Sequence[int]], + padding: Optional[Union[int, Sequence[int]]] = None, + pad_if_needed: bool = False, + fill: Union[int, float, Sequence[int], Sequence[float]] = 0, + padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant", + ) -> None: + super().__init__() + + self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") + + self.padding = padding + self.pad_if_needed = pad_if_needed + self.fill = fill + self.padding_mode = padding_mode + + self._pad_op = None + if self.padding is not None: + self._pad_op = Pad(self.padding, fill=self.fill, padding_mode=self.padding_mode) + + if self.pad_if_needed: + self._pad_op = Pad(0, fill=self.fill, padding_mode=self.padding_mode) + + def _get_params(self, sample: Any) -> Dict[str, Any]: + image = query_image(sample) + _, height, width = get_image_dimensions(image) + output_height, output_width = self.size + + if height + 1 < output_height or width + 1 < output_width: + raise ValueError( + f"Required crop size {(output_height, output_width)} is larger then input image size {(height, width)}" + ) + + if width == output_width and height == output_height: + return dict(top=0, left=0, height=height, width=width) + + top = torch.randint(0, height - output_height + 1, size=(1,)).item() + left = torch.randint(0, width - output_width + 1, size=(1,)).item() + return dict(top=top, left=left, height=output_height, width=output_width) + + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return F.crop(inpt, **params) + + def forward(self, *inputs: Any) -> Any: + sample = inputs if len(inputs) > 1 else inputs[0] + + if self._pad_op is not None: + sample = self._pad_op(sample) + + image = query_image(sample) + _, height, width = get_image_dimensions(image) + + if self.pad_if_needed: + # This check is to explicitly ensure that self._pad_op is defined + if self._pad_op is None: + raise RuntimeError( + "Internal error, self._pad_op is None. 
" + "Please, fill an issue about that on https://github.com/pytorch/vision/issues" + ) + + # pad the width if needed + if width < self.size[1]: + self._pad_op.padding = [self.size[1] - width, 0] + sample = self._pad_op(sample) + # pad the height if needed + if height < self.size[0]: + self._pad_op.padding = [0, self.size[0] - height] + sample = self._pad_op(sample) + + return super().forward(sample) + + +class RandomPerspective(_RandomApplyTransform): + def __init__( + self, + distortion_scale: float, + fill: Union[int, float, Sequence[int], Sequence[float]] = 0, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + p: float = 0.5, + ) -> None: + super().__init__(p=p) + + _check_fill_arg(fill) + if not (0 <= distortion_scale <= 1): + raise ValueError("Argument distortion_scale value should be between 0 and 1") + + self.distortion_scale = distortion_scale + self.interpolation = interpolation + self.fill = fill + + def _get_params(self, sample: Any) -> Dict[str, Any]: + # Get image size + # TODO: make it work with bboxes and segm masks + image = query_image(sample) + _, height, width = get_image_dimensions(image) + + distortion_scale = self.distortion_scale + + half_height = height // 2 + half_width = width // 2 + topleft = [ + int(torch.randint(0, int(distortion_scale * half_width) + 1, size=(1,)).item()), + int(torch.randint(0, int(distortion_scale * half_height) + 1, size=(1,)).item()), + ] + topright = [ + int(torch.randint(width - int(distortion_scale * half_width) - 1, width, size=(1,)).item()), + int(torch.randint(0, int(distortion_scale * half_height) + 1, size=(1,)).item()), + ] + botright = [ + int(torch.randint(width - int(distortion_scale * half_width) - 1, width, size=(1,)).item()), + int(torch.randint(height - int(distortion_scale * half_height) - 1, height, size=(1,)).item()), + ] + botleft = [ + int(torch.randint(0, int(distortion_scale * half_width) + 1, size=(1,)).item()), + int(torch.randint(height - int(distortion_scale * half_height) - 1, height, size=(1,)).item()), + ] + startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]] + endpoints = [topleft, topright, botright, botleft] + return dict(startpoints=startpoints, endpoints=endpoints) + + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return F.perspective( + inpt, + **params, + fill=self.fill, + interpolation=self.interpolation, + ) + + +def _setup_float_or_seq(arg: Union[float, Sequence[float]], name: str, req_size: int = 2) -> Sequence[float]: + if not isinstance(arg, (float, Sequence)): + raise TypeError(f"{name} should be float or a sequence of floats. Got {type(arg)}") + if isinstance(arg, Sequence) and len(arg) != req_size: + raise ValueError(f"If {name} is a sequence its length should be one of {req_size}. Got {len(arg)}") + if isinstance(arg, Sequence): + for element in arg: + if not isinstance(element, float): + raise ValueError(f"{name} should be a sequence of floats. 
Got {type(element)}") + + if isinstance(arg, float): + arg = [float(arg), float(arg)] + if isinstance(arg, (list, tuple)) and len(arg) == 1: + arg = [arg[0], arg[0]] + return arg + + +class ElasticTransform(Transform): + def __init__( + self, + alpha: Union[float, Sequence[float]] = 50.0, + sigma: Union[float, Sequence[float]] = 5.0, + fill: Union[int, float, Sequence[int], Sequence[float]] = 0, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + ) -> None: + super().__init__() + self.alpha = _setup_float_or_seq(alpha, "alpha", 2) + self.sigma = _setup_float_or_seq(sigma, "sigma", 2) + + _check_fill_arg(fill) + + self.interpolation = interpolation + self.fill = fill + + def _get_params(self, sample: Any) -> Dict[str, Any]: + # Get image size + # TODO: make it work with bboxes and segm masks + image = query_image(sample) + _, *size = get_image_dimensions(image) + + dx = torch.rand([1, 1] + size) * 2 - 1 + if self.sigma[0] > 0.0: + kx = int(8 * self.sigma[0] + 1) + # if kernel size is even we have to make it odd + if kx % 2 == 0: + kx += 1 + dx = F.gaussian_blur(dx, [kx, kx], list(self.sigma)) + dx = dx * self.alpha[0] / size[0] + + dy = torch.rand([1, 1] + size) * 2 - 1 + if self.sigma[1] > 0.0: + ky = int(8 * self.sigma[1] + 1) + # if kernel size is even we have to make it odd + if ky % 2 == 0: + ky += 1 + dy = F.gaussian_blur(dy, [ky, ky], list(self.sigma)) + dy = dy * self.alpha[1] / size[1] + displacement = torch.concat([dx, dy], 1).permute([0, 2, 3, 1]) # 1 x H x W x 2 + return dict(displacement=displacement) + + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return F.elastic( + inpt, + **params, + fill=self.fill, + interpolation=self.interpolation, + ) diff --git a/torchvision/prototype/transforms/_misc.py b/torchvision/prototype/transforms/_misc.py index 769e05809e7..14c33db3ecb 100644 --- a/torchvision/prototype/transforms/_misc.py +++ b/torchvision/prototype/transforms/_misc.py @@ -1,8 +1,9 @@ import functools -from typing import Any, Callable, Dict, List, Type +from typing import Any, Callable, Dict, List, Sequence, Type, Union import torch from torchvision.prototype.transforms import functional as F, Transform +from torchvision.transforms.transforms import _setup_size class Identity(Transform): @@ -46,6 +47,36 @@ def _transform(self, input: Any, params: Dict[str, Any]) -> Any: return input +class GaussianBlur(Transform): + def __init__( + self, kernel_size: Union[int, Sequence[int]], sigma: Union[float, Sequence[float]] = (0.1, 2.0) + ) -> None: + super().__init__() + self.kernel_size = _setup_size(kernel_size, "Kernel size should be a tuple/list of two integers") + for ks in self.kernel_size: + if ks <= 0 or ks % 2 == 0: + raise ValueError("Kernel size value should be an odd and positive number.") + + if isinstance(sigma, float): + if sigma <= 0: + raise ValueError("If sigma is a single number, it must be positive.") + sigma = (sigma, sigma) + elif isinstance(sigma, Sequence) and len(sigma) == 2: + if not 0.0 < sigma[0] <= sigma[1]: + raise ValueError("sigma values should be positive and of the form (min, max).") + else: + raise TypeError("sigma should be a single float or a list/tuple with length 2 floats.") + + self.sigma = sigma + + def _get_params(self, sample: Any) -> Dict[str, Any]: + sigma = torch.empty(1).uniform_(self.sigma[0], self.sigma[1]).item() + return dict(sigma=[sigma, sigma]) + + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return F.gaussian_blur(inpt, **params) + + class ToDtype(Lambda): def __init__(self, 
dtype: torch.dtype, *types: Type) -> None: self.dtype = dtype diff --git a/torchvision/prototype/transforms/_transform.py b/torchvision/prototype/transforms/_transform.py index d02732f552c..e7277748d3a 100644 --- a/torchvision/prototype/transforms/_transform.py +++ b/torchvision/prototype/transforms/_transform.py @@ -1,10 +1,11 @@ import enum -import functools from typing import Any, Dict +import PIL.Image import torch from torch import nn -from torchvision.prototype.utils._internal import apply_recursively +from torch.utils._pytree import tree_flatten, tree_unflatten +from torchvision.prototype.features import _Feature from torchvision.utils import _log_api_usage_once @@ -16,12 +17,20 @@ def __init__(self) -> None: def _get_params(self, sample: Any) -> Dict[str, Any]: return dict() - def _transform(self, input: Any, params: Dict[str, Any]) -> Any: + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: raise NotImplementedError def forward(self, *inputs: Any) -> Any: sample = inputs if len(inputs) > 1 else inputs[0] - return apply_recursively(functools.partial(self._transform, params=self._get_params(sample)), sample) + + params = self._get_params(sample) + + flat_inputs, spec = tree_flatten(sample) + transformed_types = (torch.Tensor, _Feature, PIL.Image.Image) + flat_outputs = [ + self._transform(inpt, params) if isinstance(inpt, transformed_types) else inpt for inpt in flat_inputs + ] + return tree_unflatten(flat_outputs, spec) def extra_repr(self) -> str: extra = [] diff --git a/torchvision/prototype/transforms/_utils.py b/torchvision/prototype/transforms/_utils.py index 1344790e633..3de2f196c9f 100644 --- a/torchvision/prototype/transforms/_utils.py +++ b/torchvision/prototype/transforms/_utils.py @@ -1,26 +1,20 @@ -from typing import Any, Iterator, Optional, Tuple, Type, Union +from typing import Any, Tuple, Type, Union import PIL.Image import torch +from torch.utils._pytree import tree_flatten from torchvision.prototype import features -from torchvision.prototype.utils._internal import query_recursively from .functional._meta import get_dimensions_image_pil, get_dimensions_image_tensor def query_image(sample: Any) -> Union[PIL.Image.Image, torch.Tensor, features.Image]: - def fn( - id: Tuple[Any, ...], input: Any - ) -> Optional[Tuple[Tuple[Any, ...], Union[PIL.Image.Image, torch.Tensor, features.Image]]]: - if type(input) in {torch.Tensor, features.Image} or isinstance(input, PIL.Image.Image): - return id, input + flat_sample, _ = tree_flatten(sample) + for i in flat_sample: + if type(i) == torch.Tensor or isinstance(i, (PIL.Image.Image, features.Image)): + return i - return None - - try: - return next(query_recursively(fn, sample))[1] - except StopIteration: - raise TypeError("No image was found in the sample") + raise TypeError("No image was found in the sample") def get_image_dimensions(image: Union[PIL.Image.Image, torch.Tensor, features.Image]) -> Tuple[int, int, int]: @@ -36,16 +30,14 @@ def get_image_dimensions(image: Union[PIL.Image.Image, torch.Tensor, features.Im return channels, height, width -def _extract_types(sample: Any) -> Iterator[Type]: - return query_recursively(lambda id, input: type(input), sample) - - def has_any(sample: Any, *types: Type) -> bool: - return any(issubclass(type, types) for type in _extract_types(sample)) + flat_sample, _ = tree_flatten(sample) + return any(issubclass(type(obj), types) for obj in flat_sample) def has_all(sample: Any, *types: Type) -> bool: - return not bool(set(types) - set(_extract_types(sample))) + flat_sample, _ = 
tree_flatten(sample) + return not bool(set(types) - set([type(obj) for obj in flat_sample])) def is_simple_tensor(input: Any) -> bool: diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py index 19b1c26f2d5..82e3096821a 100644 --- a/torchvision/prototype/transforms/functional/__init__.py +++ b/torchvision/prototype/transforms/functional/__init__.py @@ -5,7 +5,7 @@ convert_image_color_space_pil, ) # usort: skip -from ._augment import erase_image_tensor +from ._augment import erase_image_pil, erase_image_tensor from ._color import ( adjust_brightness, adjust_brightness_image_pil, @@ -57,6 +57,12 @@ crop_image_pil, crop_image_tensor, crop_segmentation_mask, + elastic, + elastic_bounding_box, + elastic_image_pil, + elastic_image_tensor, + elastic_segmentation_mask, + elastic_transform, five_crop_image_pil, five_crop_image_tensor, horizontal_flip, @@ -97,7 +103,7 @@ vertical_flip_image_tensor, vertical_flip_segmentation_mask, ) -from ._misc import gaussian_blur_image_tensor, normalize_image_tensor +from ._misc import gaussian_blur, gaussian_blur_image_pil, gaussian_blur_image_tensor, normalize_image_tensor from ._type_conversion import ( decode_image_with_pil, decode_video_with_av, diff --git a/torchvision/prototype/transforms/functional/_augment.py b/torchvision/prototype/transforms/functional/_augment.py index 3920d1b3065..84b069cf396 100644 --- a/torchvision/prototype/transforms/functional/_augment.py +++ b/torchvision/prototype/transforms/functional/_augment.py @@ -1,13 +1,16 @@ +import PIL.Image + +import torch from torchvision.transforms import functional_tensor as _FT +from torchvision.transforms.functional import pil_to_tensor, to_pil_image erase_image_tensor = _FT.erase -# TODO: Don't forget to clean up from the primitives kernels those that shouldn't be kernels. 
-# Like the mixup and cutmix stuff - -# This function is copy-pasted to Image and OneHotLabel and may be refactored -# def _mixup_tensor(input: torch.Tensor, batch_dim: int, lam: float) -> torch.Tensor: -# input = input.clone() -# return input.roll(1, batch_dim).mul_(1 - lam).add_(input.mul_(lam)) +def erase_image_pil( + img: PIL.Image.Image, i: int, j: int, h: int, w: int, v: torch.Tensor, inplace: bool = False +) -> PIL.Image.Image: + t_img = pil_to_tensor(img) + output = erase_image_tensor(t_img, i=i, j=j, h=h, w=w, v=v, inplace=inplace) + return to_pil_image(output, mode=img.mode) diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py index 8938b2bf31c..d5eec09bf2f 100644 --- a/torchvision/prototype/transforms/functional/_geometry.py +++ b/torchvision/prototype/transforms/functional/_geometry.py @@ -9,8 +9,11 @@ from torchvision.transforms.functional import ( _compute_output_size, _get_inverse_affine_matrix, + _get_perspective_coeffs, InterpolationMode, pil_modes_mapping, + pil_to_tensor, + to_pil_image, ) from ._meta import convert_bounding_box_format, get_dimensions_image_pil, get_dimensions_image_tensor @@ -759,16 +762,21 @@ def perspective_bounding_box( ).view(original_shape) -def perspective_segmentation_mask(img: torch.Tensor, perspective_coeffs: List[float]) -> torch.Tensor: - return perspective_image_tensor(img, perspective_coeffs=perspective_coeffs, interpolation=InterpolationMode.NEAREST) +def perspective_segmentation_mask(mask: torch.Tensor, perspective_coeffs: List[float]) -> torch.Tensor: + return perspective_image_tensor( + mask, perspective_coeffs=perspective_coeffs, interpolation=InterpolationMode.NEAREST + ) def perspective( inpt: DType, - perspective_coeffs: List[float], + startpoints: List[List[int]], + endpoints: List[List[int]], interpolation: InterpolationMode = InterpolationMode.BILINEAR, fill: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None, ) -> DType: + perspective_coeffs = _get_perspective_coeffs(startpoints, endpoints) + if isinstance(inpt, features._Feature): return inpt.perspective(perspective_coeffs, interpolation=interpolation, fill=fill) elif isinstance(inpt, PIL.Image.Image): @@ -779,6 +787,91 @@ def perspective( return perspective_image_tensor(inpt, perspective_coeffs, interpolation=interpolation, fill=fill) +def elastic_image_tensor( + img: torch.Tensor, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: Optional[List[float]] = None, +) -> torch.Tensor: + return _FT.elastic_transform(img, displacement, interpolation=interpolation.value, fill=fill) + + +def elastic_image_pil( + img: PIL.Image.Image, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None, +) -> PIL.Image.Image: + t_img = pil_to_tensor(img) + fill = _convert_fill_arg(fill) + + output = elastic_image_tensor(t_img, displacement, interpolation=interpolation, fill=fill) + return to_pil_image(output, mode=img.mode) + + +def elastic_bounding_box( + bounding_box: torch.Tensor, + format: features.BoundingBoxFormat, + displacement: torch.Tensor, +) -> torch.Tensor: + # TODO: add in docstring about approximation we are doing for grid inversion + displacement = displacement.to(bounding_box.device) + + original_shape = bounding_box.shape + bounding_box = convert_bounding_box_format( + bounding_box, old_format=format, 
new_format=features.BoundingBoxFormat.XYXY + ).view(-1, 4) + + # Question (vfdev-5): should we rely on good displacement shape and fetch image size from it + # Or add image_size arg and check displacement shape + image_size = displacement.shape[-3], displacement.shape[-2] + + id_grid = _FT._create_identity_grid(list(image_size)).to(bounding_box.device) + # We construct an approximation of inverse grid as inv_grid = id_grid - displacement + # This is not an exact inverse of the grid + inv_grid = id_grid - displacement + + # Get points from bboxes + points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].view(-1, 2) + index_x = torch.floor(points[:, 0] + 0.5).to(dtype=torch.long) + index_y = torch.floor(points[:, 1] + 0.5).to(dtype=torch.long) + # Transform points: + t_size = torch.tensor(image_size[::-1], device=displacement.device, dtype=displacement.dtype) + transformed_points = (inv_grid[0, index_y, index_x, :] + 1) * 0.5 * t_size - 0.5 + + transformed_points = transformed_points.view(-1, 4, 2) + out_bbox_mins, _ = torch.min(transformed_points, dim=1) + out_bbox_maxs, _ = torch.max(transformed_points, dim=1) + out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1) + + return convert_bounding_box_format( + out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, copy=False + ).view(original_shape) + + +def elastic_segmentation_mask(mask: torch.Tensor, displacement: torch.Tensor) -> torch.Tensor: + return elastic_image_tensor(mask, displacement=displacement, interpolation=InterpolationMode.NEAREST) + + +def elastic( + inpt: DType, + displacement: torch.Tensor, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + fill: Optional[Union[int, float, Sequence[int], Sequence[float]]] = None, +) -> DType: + if isinstance(inpt, features._Feature): + return inpt.elastic(displacement, interpolation=interpolation, fill=fill) + elif isinstance(inpt, PIL.Image.Image): + return elastic_image_pil(inpt, displacement, interpolation=interpolation, fill=fill) + else: + fill = _convert_fill_arg(fill) + + return elastic_image_tensor(inpt, displacement, interpolation=interpolation, fill=fill) + + +elastic_transform = elastic + + def _center_crop_parse_output_size(output_size: List[int]) -> List[int]: if isinstance(output_size, numbers.Number): return [int(output_size), int(output_size)] diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py index 096ba32f2cf..d93194e2eab 100644 --- a/torchvision/prototype/transforms/functional/_misc.py +++ b/torchvision/prototype/transforms/functional/_misc.py @@ -1,14 +1,28 @@ -from typing import List, Optional +from typing import List, Optional, Union import PIL.Image import torch +from torchvision.prototype import features from torchvision.transforms import functional_tensor as _FT from torchvision.transforms.functional import pil_to_tensor, to_pil_image +# shortcut type +DType = Union[torch.Tensor, PIL.Image.Image, features._Feature] + + normalize_image_tensor = _FT.normalize +def normalize(inpt: DType, mean: List[float], std: List[float], inplace: bool = False) -> DType: + if isinstance(inpt, features.Image): + return normalize_image_tensor(inpt, mean=mean, std=std, inplace=inplace) + elif type(inpt) == torch.Tensor: + return normalize_image_tensor(inpt, mean=mean, std=std, inplace=inplace) + else: + raise TypeError("Unsupported input type") + + def gaussian_blur_image_tensor( img: torch.Tensor, kernel_size: List[int], sigma: Optional[List[float]] = None ) -> 
torch.Tensor: @@ -42,3 +56,12 @@ def gaussian_blur_image_pil(img: PIL.Image, kernel_size: List[int], sigma: Optio t_img = pil_to_tensor(img) output = gaussian_blur_image_tensor(t_img, kernel_size=kernel_size, sigma=sigma) return to_pil_image(output, mode=img.mode) + + +def gaussian_blur(inpt: DType, kernel_size: List[int], sigma: Optional[List[float]] = None) -> DType: + if isinstance(inpt, features._Feature): + return inpt.gaussian_blur(kernel_size=kernel_size, sigma=sigma) + elif isinstance(inpt, PIL.Image.Image): + return gaussian_blur_image_pil(inpt, kernel_size=kernel_size, sigma=sigma) + else: + return gaussian_blur_image_tensor(inpt, kernel_size=kernel_size, sigma=sigma) diff --git a/torchvision/transforms/functional.py b/torchvision/transforms/functional.py index 77d5b33b55a..8e94733651b 100644 --- a/torchvision/transforms/functional.py +++ b/torchvision/transforms/functional.py @@ -1554,7 +1554,7 @@ def elastic_transform( If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format, where ... means it can have an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". - displacement (Tensor): The displacement field. + displacement (Tensor): The displacement field. Expected shape is [1, H, W, 2]. interpolation (InterpolationMode): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. @@ -1576,7 +1576,7 @@ def elastic_transform( interpolation = _interpolation_modes_from_int(interpolation) if not isinstance(displacement, torch.Tensor): - raise TypeError("displacement should be a Tensor") + raise TypeError("Argument displacement should be a Tensor") t_img = img if not isinstance(img, torch.Tensor): @@ -1584,6 +1584,15 @@ def elastic_transform( raise TypeError(f"img should be PIL Image or Tensor. 
Got {type(img)}") t_img = pil_to_tensor(img) + shape = t_img.shape + shape = (1,) + shape[-2:] + (2,) + if shape != displacement.shape: + raise ValueError(f"Argument displacement shape should be {shape}, but given {displacement.shape}") + + # TODO: if image shape is [N1, N2, ..., C, H, W] and + # displacement is [1, H, W, 2] we need to reshape input image + # such grid_sampler takes internal code for 4D input + output = F_t.elastic_transform( t_img, displacement, diff --git a/torchvision/transforms/functional_pil.py b/torchvision/transforms/functional_pil.py index 768176e6783..ec65b62314c 100644 --- a/torchvision/transforms/functional_pil.py +++ b/torchvision/transforms/functional_pil.py @@ -260,7 +260,7 @@ def _parse_fill( ) -> Dict[str, Optional[Union[float, List[float], Tuple[float, ...]]]]: # Process fill color for affine transforms - num_bands = len(img.getbands()) + num_bands = get_image_num_channels(img) if fill is None: fill = 0 if isinstance(fill, (int, float)) and num_bands > 1: diff --git a/torchvision/transforms/functional_tensor.py b/torchvision/transforms/functional_tensor.py index a1e49f5c2d8..df5396a063c 100644 --- a/torchvision/transforms/functional_tensor.py +++ b/torchvision/transforms/functional_tensor.py @@ -634,7 +634,7 @@ def _compute_output_size(matrix: List[float], w: int, h: int) -> Tuple[int, int] cmax = torch.ceil((max_vals / tol).trunc_() * tol) cmin = torch.floor((min_vals / tol).trunc_() * tol) size = cmax - cmin - return int(size[0]), int(size[1]) + return int(size[0]), int(size[1]) # w, h def rotate( @@ -932,6 +932,12 @@ def erase(img: Tensor, i: int, j: int, h: int, w: int, v: Tensor, inplace: bool return img +def _create_identity_grid(size: List[int]) -> Tensor: + hw_space = [torch.linspace((-s + 1) / s, (s - 1) / s, s) for s in size] + grid_y, grid_x = torch.meshgrid(hw_space, indexing="ij") + return torch.stack([grid_x, grid_y], -1).unsqueeze(0) # 1 x H x W x 2 + + def elastic_transform( img: Tensor, displacement: Tensor, @@ -945,8 +951,6 @@ def elastic_transform( size = list(img.shape[-2:]) displacement = displacement.to(img.device) - hw_space = [torch.linspace((-s + 1) / s, (s - 1) / s, s) for s in size] - grid_y, grid_x = torch.meshgrid(hw_space, indexing="ij") - identity_grid = torch.stack([grid_x, grid_y], -1).unsqueeze(0) # 1 x H x W x 2 + identity_grid = _create_identity_grid(size) grid = identity_grid.to(img.device) + displacement return _apply_grid_transform(img, grid, interpolation, fill) diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index ae7853ec5ea..095460675cc 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -1855,7 +1855,7 @@ def _check_sequence_input(x, name, req_sizes): if not isinstance(x, Sequence): raise TypeError(f"{name} should be a sequence of length {msg}.") if len(x) not in req_sizes: - raise ValueError(f"{name} should be sequence of length {msg}.") + raise ValueError(f"{name} should be a sequence of length {msg}.") def _setup_angle(x, name, req_sizes=(2,)):
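A minimal usage sketch of the displacement contract that the new shape check in torchvision.transforms.functional.elastic_transform enforces, assuming a torchvision build that includes the changes above; the concrete sizes and the zero displacement are illustrative only, not taken from the patch:

    import torch
    import torchvision.transforms.functional as F

    img = torch.rand(1, 3, 32, 24)            # input in [N, C, H, W] layout (N=1 here)
    displacement = torch.zeros(1, 32, 24, 2)  # required shape [1, H, W, 2]; zeros give an identity warp
    out = F.elastic_transform(img, displacement)
    assert out.shape == img.shape

    # A mismatched displacement now fails fast instead of erroring inside grid_sample:
    # F.elastic_transform(img, torch.rand(1, 2))
    # ValueError: Argument displacement shape should be (1, 32, 24, 2), but given torch.Size([1, 2])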