From 969635e50fca8d484780b1eef2845e10a5d3a573 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 28 Feb 2022 21:00:42 +0000
Subject: [PATCH] Review new Transforms API

---
 torchvision/prototype/features/_bounding_box.py   |  1 +
 torchvision/prototype/features/_encoded.py        |  1 +
 torchvision/prototype/features/_image.py          |  5 +++++
 torchvision/prototype/transforms/__init__.py      | 11 +++++++++++
 torchvision/prototype/transforms/_augment.py      | 10 ++++++++++
 .../prototype/transforms/_auto_augment.py         | 11 +++++++++++
 torchvision/prototype/transforms/_container.py    |  1 +
 torchvision/prototype/transforms/_meta.py         |  8 ++++++--
 torchvision/prototype/transforms/_transform.py    |  4 ++++
 .../prototype/transforms/functional/__init__.py   | 16 ++++++++++++++++
 .../prototype/transforms/functional/_geometry.py  |  4 ++++
 .../prototype/transforms/functional/_meta.py      | 10 +++++++++-
 .../prototype/transforms/functional/_misc.py      |  1 +
 13 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/torchvision/prototype/features/_bounding_box.py b/torchvision/prototype/features/_bounding_box.py
index 5b60d7ee55c..85d81efcb1f 100644
--- a/torchvision/prototype/features/_bounding_box.py
+++ b/torchvision/prototype/features/_bounding_box.py
@@ -39,6 +39,7 @@ def __new__(
     def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox:
         # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we
         # promote this out of the prototype state
+        # Do we still need this?
         # import at runtime to avoid cyclic imports
         from torchvision.prototype.transforms.functional import convert_bounding_box_format

diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py
index 276aeec2529..bcc4180022f 100644
--- a/torchvision/prototype/features/_encoded.py
+++ b/torchvision/prototype/features/_encoded.py
@@ -41,6 +41,7 @@ def image_size(self) -> Tuple[int, int]:
     def decode(self) -> Image:
         # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we
         # promote this out of the prototype state
+        # Do we still need this?
         # import at runtime to avoid cyclic imports
         from torchvision.prototype.transforms.functional import decode_image_with_pil

diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py
index 5ecc4cbedb7..4b606550e0a 100644
--- a/torchvision/prototype/features/_image.py
+++ b/torchvision/prototype/features/_image.py
@@ -17,6 +17,9 @@ class ColorSpace(StrEnum):
     OTHER = 0
     GRAYSCALE = 1
     RGB = 3
+    # On io, we currently support alpha transparency for grayscale and RGB images. We also support palette for PNG.
+    # Our enum must know about these. We should also allow users to strip out the alpha transparency, which is
+    # currently not supported by any color transform.
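+    # A rough sketch of what the extended enum could look like. The new names and values below are
+    # placeholders, not a settled API:
+    #
+    #     class ColorSpace(StrEnum):
+    #         OTHER = 0
+    #         GRAYSCALE = 1
+    #         GRAYSCALE_ALPHA = 2
+    #         RGB = 3
+    #         RGBA = 4
+    #         PALETTE = 5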


 class Image(_Feature):
@@ -78,9 +81,11 @@ def guess_color_space(data: torch.Tensor) -> ColorSpace:
     def show(self) -> None:
         # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we
         # promote this out of the prototype state
+        # Do we still need this?
         to_pil_image(make_grid(self.view(-1, *self.shape[-3:]))).show()

     def draw_bounding_box(self, bounding_box: BoundingBox, **kwargs: Any) -> Image:
         # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we
         # promote this out of the prototype state
+        # Do we still need this?
         return Image.new_like(self, draw_bounding_boxes(self, bounding_box.to_format("xyxy").view(-1, 4), **kwargs))

diff --git a/torchvision/prototype/transforms/__init__.py b/torchvision/prototype/transforms/__init__.py
index 98ad7ae0d74..d3607c2fad1 100644
--- a/torchvision/prototype/transforms/__init__.py
+++ b/torchvision/prototype/transforms/__init__.py
@@ -12,3 +12,14 @@
 from ._misc import Identity, Normalize, ToDtype, Lambda
 from ._presets import CocoEval, ImageNetEval, VocEval, Kinect400Eval, RaftEval
 from ._type_conversion import DecodeImage, LabelToOneHot
+
+# What are the migration plans for classes without new API equivalents? There are two categories:
+# 1. Deprecated classes that have equivalents on the new API (_legacy.py?):
+#    - Grayscale, RandomGrayscale: use ConvertImageColorSpace
+# 2. Classes without equivalents on the new API:
+#    - Pad, RandomCrop, RandomHorizontalFlip, RandomVerticalFlip, RandomPerspective, FiveCrop, TenCrop, ColorJitter,
+#      RandomRotation, RandomAffine, GaussianBlur, RandomInvert, RandomPosterize, RandomSolarize, RandomAdjustSharpness,
+#      RandomAutocontrast, RandomEqualize, LinearTransformation (must be added)
+#    - PILToTensor, ToPILImage (_legacy.py?)
+#    - ToTensor (deprecate? cc @vfdev-5)
+# We need a plan for handling both categories on the new API.
diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py
index 5862c6a06dc..943deba23f2 100644
--- a/torchvision/prototype/transforms/_augment.py
+++ b/torchvision/prototype/transforms/_augment.py
@@ -34,6 +34,10 @@ def __init__(
         if p < 0 or p > 1:
             raise ValueError("Random erasing probability should be between 0 and 1")
         # TODO: deprecate p in favor of wrapping the transform in a RandomApply
+        # The above approach is very composable but will lead to verbose code. Instead, we can create a base class
+        # that inherits from Transform (say RandomTransform) that receives `p` in its constructor and by default
+        # implements the `p` random check on forward (see the sketch below). This is likely to happen during the
+        # final clean-ups, so perhaps update the comment accordingly OR create an issue to track this discussion.
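+        # A minimal sketch of that idea. RandomTransform is a placeholder name, and the sketch
+        # assumes the base Transform.forward(*inputs) signature from _transform.py:
+        #
+        #     class RandomTransform(Transform):
+        #         def __init__(self, p: float = 0.5) -> None:
+        #             super().__init__()
+        #             self.p = p
+        #
+        #         def forward(self, *inputs: Any) -> Any:
+        #             sample = inputs if len(inputs) > 1 else inputs[0]
+        #             if float(torch.rand(1)) >= self.p:
+        #                 return sample  # skip the transform entirely
+        #             return super().forward(*inputs)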
         self.p = p
         self.scale = scale
         self.ratio = ratio
@@ -84,6 +88,8 @@ def _get_params(self, sample: Any) -> Dict[str, Any]:
                 break
         else:
             i, j, h, w, v = 0, 0, img_h, img_w, image
+            # FYI: now that we are not JIT-scriptable, we can probably avoid copy-pasting the image onto itself in
+            # this scenario. Perhaps a simple clone would do.

         return dict(zip("ijhwv", (i, j, h, w, v)))

@@ -95,6 +101,7 @@ def _transform(self, input: Any, params: Dict[str, Any]) -> Any:
             return features.Image.new_like(input, output)
         elif isinstance(input, torch.Tensor):
             return F.erase_image_tensor(input, **params)
+        # FYI: we plan to add support for PIL as part of Batteries Included.
         else:
             return input

@@ -112,6 +119,8 @@ def __init__(self, *, alpha: float) -> None:
         self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha]))

     def _get_params(self, sample: Any) -> Dict[str, Any]:
+        # Question: Shall we enforce here the existence of labels in the sample? If yes, this way of validating the
+        # input won't work if get_params() becomes public and the user is able to provide the params in forward.
         return dict(lam=float(self._dist.sample(())))

     def _transform(self, input: Any, params: Dict[str, Any]) -> Any:
@@ -134,6 +143,7 @@ def __init__(self, *, alpha: float) -> None:
         self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha]))

     def _get_params(self, sample: Any) -> Dict[str, Any]:
+        # Question: Same as above for labels.
         lam = float(self._dist.sample(()))

         image = query_image(sample)
diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py
index 892162fa296..107591a841c 100644
--- a/torchvision/prototype/transforms/_auto_augment.py
+++ b/torchvision/prototype/transforms/_auto_augment.py
@@ -35,6 +35,9 @@ def __init__(
         self.fill = fill

     def _get_random_item(self, dct: Dict[K, V]) -> Tuple[K, V]:
+        # Question: This looks like a utility method that doesn't depend on self and could be moved out of this
+        # class to simplify its structure (for someone who tries to understand the class API in order to implement
+        # a new method). Thoughts?
         keys = tuple(dct.keys())
         key = keys[int(torch.randint(len(keys), ()))]
         return key, dct[key]
@@ -46,6 +49,9 @@ def _check_unsupported(self, input: Any) -> None:
     def _extract_image(
         self, sample: Any
     ) -> Tuple[Tuple[Any, ...], Union[PIL.Image.Image, torch.Tensor, features.Image]]:
+        # How about providing the unsupported types as a parameter to this method to avoid hardcoding them? That
+        # would allow us to remove the separate _check_unsupported method. This method could also be moved out of
+        # the class to simplify its structure.
         def fn(
             id: Tuple[Any, ...], input: Any
         ) -> Optional[Tuple[Tuple[Any, ...], Union[PIL.Image.Image, torch.Tensor, features.Image]]]:
@@ -67,6 +73,7 @@ def fn(
     def _parse_fill(
         self, image: Union[PIL.Image.Image, torch.Tensor, features.Image], num_channels: int
     ) -> Optional[List[float]]:
+        # Question: How do you feel about turning this into a util as well?
         fill = self.fill

         if isinstance(image, PIL.Image.Image) or fill is None:
@@ -204,6 +211,8 @@ class AutoAugment(_AutoAugmentBase):
         "ShearY": (lambda num_bins, image_size: torch.linspace(0.0, 0.3, num_bins), True),
         "TranslateX": (lambda num_bins, image_size: torch.linspace(0.0, 150.0 / 331.0 * image_size[1], num_bins), True),
         "TranslateY": (lambda num_bins, image_size: torch.linspace(0.0, 150.0 / 331.0 * image_size[0], num_bins), True),
+        # How do you feel about explicitly passing height and width here instead of image_size? I did it in an
+        # earlier PR but removed it due to the verbosity. Thoughts?
         "Rotate": (lambda num_bins, image_size: torch.linspace(0.0, 30.0, num_bins), True),
         "Brightness": (lambda num_bins, image_size: torch.linspace(0.0, 0.9, num_bins), True),
         "Color": (lambda num_bins, image_size: torch.linspace(0.0, 0.9, num_bins), True),
@@ -222,6 +231,8 @@
     }

     def __init__(self, policy: AutoAugmentPolicy = AutoAugmentPolicy.IMAGENET, **kwargs: Any) -> None:
+        # I think using kwargs in the constructor makes it hard for users to understand which parameters to
+        # provide. I recommend explicitly passing interpolation and fill in all the AA constructors (sketch below).
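+        # For example, something along these lines. This is a sketch, not a final signature; the defaults
+        # mirror the current AutoAugment transform on main, and it assumes the base class keeps its
+        # interpolation/fill arguments:
+        #
+        #     def __init__(
+        #         self,
+        #         policy: AutoAugmentPolicy = AutoAugmentPolicy.IMAGENET,
+        #         interpolation: InterpolationMode = InterpolationMode.NEAREST,
+        #         fill: Optional[List[float]] = None,
+        #     ) -> None:
+        #         super().__init__(interpolation=interpolation, fill=fill)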
         super().__init__(**kwargs)
         self.policy = policy
         self._policies = self._get_policies(policy)
diff --git a/torchvision/prototype/transforms/_container.py b/torchvision/prototype/transforms/_container.py
index bd20d0c701a..dfe6fb4def2 100644
--- a/torchvision/prototype/transforms/_container.py
+++ b/torchvision/prototype/transforms/_container.py
@@ -38,6 +38,7 @@ def extra_repr(self) -> str:

 class RandomChoice(Transform):
     def __init__(self, *transforms: Transform) -> None:
+        # This should optionally receive a list of probabilities and sample the transforms proportionally (see the sketch below).
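+        # A sketch of the suggestion; `p` is a hypothetical argument name:
+        #
+        #     def __init__(self, *transforms: Transform, p: Optional[List[float]] = None) -> None:
+        #         super().__init__()
+        #         self.transforms = transforms
+        #         self.p = p
+        #
+        #     def forward(self, *inputs: Any) -> Any:
+        #         if self.p is None:
+        #             idx = int(torch.randint(len(self.transforms), ()))
+        #         else:
+        #             idx = int(torch.multinomial(torch.tensor(self.p), 1))
+        #         return self.transforms[idx](*inputs)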
         super().__init__()
         self.transforms = transforms
         for idx, transform in enumerate(transforms):
diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py
index 3675e1d8ada..207ef0b8242 100644
--- a/torchvision/prototype/transforms/_meta.py
+++ b/torchvision/prototype/transforms/_meta.py
@@ -4,7 +4,7 @@
 import torch
 from torchvision.prototype import features
 from torchvision.prototype.transforms import Transform, functional as F
-from torchvision.transforms.functional import convert_image_dtype
+from torchvision.transforms.functional import convert_image_dtype  # We should have our own alias for this on the new API


 class ConvertBoundingBoxFormat(Transform):
@@ -23,6 +23,10 @@ def _transform(self, input: Any, params: Dict[str, Any]) -> Any:


 class ConvertImageDtype(Transform):
+    # Question: Why do we have both this and a ToDtype transform? Is this due to BC? Ideally we would move people
+    # off methods that do an implicit normalization of the values (like this one, or to_tensor). cc @vfdev-5
+    # If that's the case, we should move this to _legacy and add deprecation warnings from day one to push people
+    # towards the new methods.
     def __init__(self, dtype: torch.dtype = torch.float32) -> None:
         super().__init__()
         self.dtype = dtype
@@ -59,7 +63,7 @@ def _transform(self, input: Any, params: Dict[str, Any]) -> Any:
             return features.Image.new_like(input, output, color_space=self.color_space)
         elif isinstance(input, torch.Tensor):
             if self.old_color_space is None:
-                raise RuntimeError("")
+                raise RuntimeError("")  # Add a better exception message

             return F.convert_image_color_space_tensor(
                 input, old_color_space=self.old_color_space, new_color_space=self.color_space
diff --git a/torchvision/prototype/transforms/_transform.py b/torchvision/prototype/transforms/_transform.py
index 923b90c6777..e364bdd8dc4 100644
--- a/torchvision/prototype/transforms/_transform.py
+++ b/torchvision/prototype/transforms/_transform.py
@@ -12,6 +12,10 @@ def __init__(self) -> None:
         super().__init__()
         _log_api_usage_once(self)

+    # FYI: I spoke with @NicolasHug offline and provided a valid use-case where it would be useful to make this
+    # public, as you originally intended. If we do end up exposing it publicly, we should rename it to avoid
+    # conflicts with the previous static get_params(), which worked completely differently. We don't have to do
+    # anything now, but we should discuss this again after the feedback.
     def _get_params(self, sample: Any) -> Dict[str, Any]:
         return dict()
diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py
index e3fe60a7919..ec149362e3e 100644
--- a/torchvision/prototype/transforms/functional/__init__.py
+++ b/torchvision/prototype/transforms/functional/__init__.py
@@ -1,3 +1,9 @@
+# We should create an issue that lists the steps that need to be performed to roll the API out to main TorchVision.
+# I created a similar one for the models, see here: https://github.com/pytorch/vision/issues/4679
+# One of the key things we need to take care of is that all the kernels below will need logging. This is because
+# there will be no high-level kernel (like `F` on main) and we would instead need to put the tracking directly in
+# each low-level kernel, which will now be public (on main, functional_pil|tensor are private).
+
 from torchvision.transforms import InterpolationMode  # usort: skip
 from ._meta import (
     convert_bounding_box_format,
@@ -63,3 +69,13 @@
 )
 from ._misc import normalize_image_tensor, gaussian_blur_image_tensor
 from ._type_conversion import decode_image_with_pil, decode_video_with_av, label_to_one_hot
+
+# What are the migration plans for public methods without new API equivalents? There are two categories:
+# 1. Deprecated methods that have equivalents on the new API (_legacy.py?):
+#    - get_image_size, get_image_num_channels: use get_dimensions_image_tensor|pil
+#    - to_grayscale, rgb_to_grayscale: use convert_image_color_space_tensor|pil
+# 2. Methods without equivalents on the new API:
+#    - five_crop, ten_crop (must be added)
+#    - pil_to_tensor, to_pil_image (_legacy.py?)
+#    - to_tensor (deprecate? cc @vfdev-5)
+# We need a plan for handling both categories on the new API.
diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 080fe5da891..3c6f07d807d 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -66,6 +66,7 @@ def resize_segmentation_mask(


 # TODO: handle max_size
+# Is there an issue tracking this TODO?
 def resize_bounding_box(bounding_box: torch.Tensor, size: List[int], image_size: Tuple[int, int]) -> torch.Tensor:
     old_height, old_width = image_size
     new_height, new_width = size
@@ -220,6 +221,8 @@ def perspective_image_tensor(
     interpolation: InterpolationMode = InterpolationMode.BILINEAR,
     fill: Optional[List[float]] = None,
 ) -> torch.Tensor:
+    # We should go ahead and update the _FT API to accept InterpolationMode. It's currently considered private and
+    # there are no BC guarantees. This would let us stop calling `.value` in many of these places.
     return _FT.perspective(img, perspective_coeffs, interpolation=interpolation.value, fill=fill)


@@ -229,6 +232,7 @@ def perspective_image_pil(
     interpolation: InterpolationMode = InterpolationMode.BICUBIC,
     fill: Optional[List[float]] = None,
 ) -> PIL.Image.Image:
+    # Same thing here. We should move the pil_modes_mapping conversion to the `_FP` side.
     return _FP.perspective(img, perspective_coeffs, interpolation=pil_modes_mapping[interpolation], fill=fill)
diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py
index 6ecb5aff257..3ba277cdfa5 100644
--- a/torchvision/prototype/transforms/functional/_meta.py
+++ b/torchvision/prototype/transforms/functional/_meta.py
@@ -58,12 +58,13 @@ def convert_bounding_box_format(


 def _grayscale_to_rgb_tensor(grayscale: torch.Tensor) -> torch.Tensor:
-    return grayscale.expand(3, 1, 1)
+    return grayscale.expand(3, 1, 1)  # This approach assumes a single image, not a batch


 def convert_image_color_space_tensor(
     image: torch.Tensor, old_color_space: ColorSpace, new_color_space: ColorSpace
 ) -> torch.Tensor:
+    # The new color_space should only be RGB or Grayscale.
     if new_color_space == old_color_space:
         return image.clone()

@@ -73,6 +74,12 @@
     if new_color_space == ColorSpace.GRAYSCALE:
         image = _FT.rgb_to_grayscale(image)

+    # We need a way to strip off alpha transparencies (see the sketch below):
+    # - RGBA => RGB
+    # - RGBA => Grayscale
+    # - Gray-Alpha => RGB
+    # - Gray-Alpha => Gray
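+    # A sketch of the tensor side, assuming channels-first images and the hypothetical RGBA enum
+    # value from _image.py. Whether to simply drop the alpha channel, as done here, or to
+    # composite it is an open design question:
+    #
+    #     if old_color_space == ColorSpace.RGBA:
+    #         image = image[..., :3, :, :]  # drop the alpha channel
+    #         old_color_space = ColorSpace.RGB  # then fall through to the existing conversions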
+
     return image


@@ -83,6 +90,7 @@ def _grayscale_to_rgb_pil(grayscale: PIL.Image.Image) -> PIL.Image.Image:

 def convert_image_color_space_pil(
     image: PIL.Image.Image, old_color_space: ColorSpace, new_color_space: ColorSpace
 ) -> PIL.Image.Image:
+    # The new color_space should only be RGB or Grayscale.
     if new_color_space == old_color_space:
         return image.copy()
diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py
index fd0507cca4d..c1e5a702ceb 100644
--- a/torchvision/prototype/transforms/functional/_misc.py
+++ b/torchvision/prototype/transforms/functional/_misc.py
@@ -39,4 +39,5 @@ def gaussian_blur_image_tensor(


 def gaussian_blur_image_pil(img: PIL.Image, kernel_size: List[int], sigma: Optional[List[float]] = None) -> PIL.Image:
+    # It would really help to remove to_tensor from here (cc @vfdev-5); see the sketch below.
     return to_pil_image(gaussian_blur_image_tensor(to_tensor(img), kernel_size=kernel_size, sigma=sigma))
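+    # A rough sketch of what this could look like with the non-normalizing converters, assuming
+    # gaussian_blur_image_tensor keeps handling integer dtypes internally:
+    #
+    #     t_img = pil_to_tensor(img)
+    #     output = gaussian_blur_image_tensor(t_img, kernel_size=kernel_size, sigma=sigma)
+    #     return to_pil_image(output)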