From 969635e50fca8d484780b1eef2845e10a5d3a573 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 28 Feb 2022 21:00:42 +0000
Subject: [PATCH] Review new Transforms API

---
 torchvision/prototype/features/_bounding_box.py   |  1 +
 torchvision/prototype/features/_encoded.py        |  1 +
 torchvision/prototype/features/_image.py          |  5 +++++
 torchvision/prototype/transforms/__init__.py      | 11 +++++++++++
 torchvision/prototype/transforms/_augment.py      | 10 ++++++++++
 .../prototype/transforms/_auto_augment.py         | 11 +++++++++++
 torchvision/prototype/transforms/_container.py    |  1 +
 torchvision/prototype/transforms/_meta.py         |  8 ++++++--
 torchvision/prototype/transforms/_transform.py    |  4 ++++
 .../prototype/transforms/functional/__init__.py   | 16 ++++++++++++++++
 .../prototype/transforms/functional/_geometry.py  |  4 ++++
 .../prototype/transforms/functional/_meta.py      | 10 +++++++++-
 .../prototype/transforms/functional/_misc.py      |  1 +
 13 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/torchvision/prototype/features/_bounding_box.py b/torchvision/prototype/features/_bounding_box.py
index 5b60d7ee55c..85d81efcb1f 100644
--- a/torchvision/prototype/features/_bounding_box.py
+++ b/torchvision/prototype/features/_bounding_box.py
@@ -39,6 +39,7 @@ def __new__(
     def to_format(self, format: Union[str, BoundingBoxFormat]) -> BoundingBox:
         # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we
         # promote this out of the prototype state
+        # Do we still need this?
         # import at runtime to avoid cyclic imports
         from torchvision.prototype.transforms.functional import convert_bounding_box_format

diff --git a/torchvision/prototype/features/_encoded.py b/torchvision/prototype/features/_encoded.py
index 276aeec2529..bcc4180022f 100644
--- a/torchvision/prototype/features/_encoded.py
+++ b/torchvision/prototype/features/_encoded.py
@@ -41,6 +41,7 @@ def image_size(self) -> Tuple[int, int]:
     def decode(self) -> Image:
         # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we
         # promote this out of the prototype state
+        # Do we still need this?
         # import at runtime to avoid cyclic imports
         from torchvision.prototype.transforms.functional import decode_image_with_pil

diff --git a/torchvision/prototype/features/_image.py b/torchvision/prototype/features/_image.py
index 5ecc4cbedb7..4b606550e0a 100644
--- a/torchvision/prototype/features/_image.py
+++ b/torchvision/prototype/features/_image.py
@@ -17,6 +17,9 @@ class ColorSpace(StrEnum):
     OTHER = 0
     GRAYSCALE = 1
     RGB = 3
+    # On io, we currently support alpha transparency for grayscale and RGB images. We also support palette for PNG.
+    # Our enum must know about these. We should also allow users to strip out the alpha transparency, which is
+    # currently not supported by any color transform.
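+    # A rough sketch of what the extended enum could look like. The new names and values below are
+    # placeholders, not a settled API:
+    #
+    #     class ColorSpace(StrEnum):
+    #         OTHER = 0
+    #         GRAYSCALE = 1
+    #         GRAYSCALE_ALPHA = 2
+    #         RGB = 3
+    #         RGBA = 4
+    #         PALETTE = 5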


 class Image(_Feature):
@@ -78,9 +81,11 @@ def guess_color_space(data: torch.Tensor) -> ColorSpace:
     def show(self) -> None:
         # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we
         # promote this out of the prototype state
+        # Do we still need this?
         to_pil_image(make_grid(self.view(-1, *self.shape[-3:]))).show()

     def draw_bounding_box(self, bounding_box: BoundingBox, **kwargs: Any) -> Image:
         # TODO: this is useful for developing and debugging but we should remove or at least revisit this before we
         # promote this out of the prototype state
+        # Do we still need this?
         return Image.new_like(self, draw_bounding_boxes(self, bounding_box.to_format("xyxy").view(-1, 4), **kwargs))

diff --git a/torchvision/prototype/transforms/__init__.py b/torchvision/prototype/transforms/__init__.py
index 98ad7ae0d74..d3607c2fad1 100644
--- a/torchvision/prototype/transforms/__init__.py
+++ b/torchvision/prototype/transforms/__init__.py
@@ -12,3 +12,14 @@
 from ._misc import Identity, Normalize, ToDtype, Lambda
 from ._presets import CocoEval, ImageNetEval, VocEval, Kinect400Eval, RaftEval
 from ._type_conversion import DecodeImage, LabelToOneHot
+
+# What are the migration plans for classes without new API equivalents? There are two categories:
+# 1. Deprecated classes that have equivalents on the new API (_legacy.py?):
+#    - Grayscale, RandomGrayscale: use ConvertImageColorSpace
+# 2. Classes without equivalents on the new API:
+#    - Pad, RandomCrop, RandomHorizontalFlip, RandomVerticalFlip, RandomPerspective, FiveCrop, TenCrop, ColorJitter,
+#      RandomRotation, RandomAffine, GaussianBlur, RandomInvert, RandomPosterize, RandomSolarize, RandomAdjustSharpness,
+#      RandomAutocontrast, RandomEqualize, LinearTransformation (must be added)
+#    - PILToTensor, ToPILImage (_legacy.py?)
+#    - ToTensor (deprecate? cc @vfdev-5)
+# We need a plan for handling both categories on the new API.
diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py
index 5862c6a06dc..943deba23f2 100644
--- a/torchvision/prototype/transforms/_augment.py
+++ b/torchvision/prototype/transforms/_augment.py
@@ -34,6 +34,10 @@ def __init__(
         if p < 0 or p > 1:
             raise ValueError("Random erasing probability should be between 0 and 1")
         # TODO: deprecate p in favor of wrapping the transform in a RandomApply
+        # The above approach is very composable but will lead to verbose code. Instead, we can create a base class
+        # that inherits from Transform (say RandomTransform) that receives `p` in its constructor and by default
+        # implements the `p` random check on forward (see the sketch below). This is likely to happen during the
+        # final clean-ups, so perhaps update the comment accordingly OR create an issue to track this discussion.
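+        # A minimal sketch of that idea. RandomTransform is a placeholder name, and the sketch
+        # assumes the base Transform.forward(*inputs) signature from _transform.py:
+        #
+        #     class RandomTransform(Transform):
+        #         def __init__(self, p: float = 0.5) -> None:
+        #             super().__init__()
+        #             self.p = p
+        #
+        #         def forward(self, *inputs: Any) -> Any:
+        #             sample = inputs if len(inputs) > 1 else inputs[0]
+        #             if float(torch.rand(1)) >= self.p:
+        #                 return sample  # skip the transform entirely
+        #             return super().forward(*inputs)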
         self.p = p
         self.scale = scale
         self.ratio = ratio
@@ -84,6 +88,8 @@ def _get_params(self, sample: Any) -> Dict[str, Any]:
                 break
         else:
             i, j, h, w, v = 0, 0, img_h, img_w, image
+            # FYI: now that we are not JIT-scriptable, we can probably avoid copy-pasting the image onto itself in
+            # this scenario. Perhaps a simple clone would do.

         return dict(zip("ijhwv", (i, j, h, w, v)))

@@ -95,6 +101,7 @@ def _transform(self, input: Any, params: Dict[str, Any]) -> Any:
             return features.Image.new_like(input, output)
         elif isinstance(input, torch.Tensor):
             return F.erase_image_tensor(input, **params)
+        # FYI: we plan to add support for PIL as part of Batteries Included.
         else:
             return input

@@ -112,6 +119,8 @@ def __init__(self, *, alpha: float) -> None:
         self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha]))

     def _get_params(self, sample: Any) -> Dict[str, Any]:
+        # Question: Shall we enforce here the existence of labels in the sample? If yes, this way of validating the
+        # input won't work if get_params() becomes public and the user is able to provide the params in forward.
         return dict(lam=float(self._dist.sample(())))

     def _transform(self, input: Any, params: Dict[str, Any]) -> Any:
@@ -134,6 +143,7 @@ def __init__(self, *, alpha: float) -> None:
         self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha]))

     def _get_params(self, sample: Any) -> Dict[str, Any]:
+        # Question: Same as above for labels.
         lam = float(self._dist.sample(()))

         image = query_image(sample)
diff --git a/torchvision/prototype/transforms/_auto_augment.py b/torchvision/prototype/transforms/_auto_augment.py
index 892162fa296..107591a841c 100644
--- a/torchvision/prototype/transforms/_auto_augment.py
+++ b/torchvision/prototype/transforms/_auto_augment.py
@@ -35,6 +35,9 @@ def __init__(
         self.fill = fill

     def _get_random_item(self, dct: Dict[K, V]) -> Tuple[K, V]:
+        # Question: This looks like a utility method that doesn't depend on self and could be moved out of this
+        # class to simplify its structure (for someone who tries to understand the class API in order to implement
+        # a new method). Thoughts?
         keys = tuple(dct.keys())
         key = keys[int(torch.randint(len(keys), ()))]
         return key, dct[key]
@@ -46,6 +49,9 @@ def _check_unsupported(self, input: Any) -> None:
     def _extract_image(
         self, sample: Any
     ) -> Tuple[Tuple[Any, ...], Union[PIL.Image.Image, torch.Tensor, features.Image]]:
+        # How about providing the unsupported types as a parameter to this method to avoid hardcoding them? That
+        # would allow us to remove the separate _check_unsupported method. This method could also be moved out of
+        # the class to simplify its structure.
         def fn(
             id: Tuple[Any, ...], input: Any
         ) -> Optional[Tuple[Tuple[Any, ...], Union[PIL.Image.Image, torch.Tensor, features.Image]]]:
@@ -67,6 +73,7 @@ def fn(
     def _parse_fill(
         self, image: Union[PIL.Image.Image, torch.Tensor, features.Image], num_channels: int
     ) -> Optional[List[float]]:
+        # Question: How do you feel about turning this into a util as well?
         fill = self.fill

         if isinstance(image, PIL.Image.Image) or fill is None:
@@ -204,6 +211,8 @@ class AutoAugment(_AutoAugmentBase):
         "ShearY": (lambda num_bins, image_size: torch.linspace(0.0, 0.3, num_bins), True),
         "TranslateX": (lambda num_bins, image_size: torch.linspace(0.0, 150.0 / 331.0 * image_size[1], num_bins), True),
         "TranslateY": (lambda num_bins, image_size: torch.linspace(0.0, 150.0 / 331.0 * image_size[0], num_bins), True),
+        # How do you feel about explicitly passing height and width here instead of image_size? I did it in an
+        # earlier PR but removed it due to the verbosity. Thoughts?
         "Rotate": (lambda num_bins, image_size: torch.linspace(0.0, 30.0, num_bins), True),
         "Brightness": (lambda num_bins, image_size: torch.linspace(0.0, 0.9, num_bins), True),
         "Color": (lambda num_bins, image_size: torch.linspace(0.0, 0.9, num_bins), True),
@@ -222,6 +231,8 @@
     }

     def __init__(self, policy: AutoAugmentPolicy = AutoAugmentPolicy.IMAGENET, **kwargs: Any) -> None:
+        # I think using kwargs in the constructor makes it hard for users to understand which parameters to
+        # provide. I recommend explicitly passing interpolation and fill in all the AA constructors (sketch below).
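+        # For example, something along these lines. This is a sketch, not a final signature; the defaults
+        # mirror the current AutoAugment transform on main, and it assumes the base class keeps its
+        # interpolation/fill arguments:
+        #
+        #     def __init__(
+        #         self,
+        #         policy: AutoAugmentPolicy = AutoAugmentPolicy.IMAGENET,
+        #         interpolation: InterpolationMode = InterpolationMode.NEAREST,
+        #         fill: Optional[List[float]] = None,
+        #     ) -> None:
+        #         super().__init__(interpolation=interpolation, fill=fill)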
         super().__init__(**kwargs)
         self.policy = policy
         self._policies = self._get_policies(policy)
diff --git a/torchvision/prototype/transforms/_container.py b/torchvision/prototype/transforms/_container.py
index bd20d0c701a..dfe6fb4def2 100644
--- a/torchvision/prototype/transforms/_container.py
+++ b/torchvision/prototype/transforms/_container.py
@@ -38,6 +38,7 @@ def extra_repr(self) -> str:

 class RandomChoice(Transform):
     def __init__(self, *transforms: Transform) -> None:
+        # This should optionally receive a list of probabilities and sample the transforms proportionally (see the sketch below).
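+        # A sketch of the suggestion; `p` is a hypothetical argument name:
+        #
+        #     def __init__(self, *transforms: Transform, p: Optional[List[float]] = None) -> None:
+        #         super().__init__()
+        #         self.transforms = transforms
+        #         self.p = p
+        #
+        #     def forward(self, *inputs: Any) -> Any:
+        #         if self.p is None:
+        #             idx = int(torch.randint(len(self.transforms), ()))
+        #         else:
+        #             idx = int(torch.multinomial(torch.tensor(self.p), 1))
+        #         return self.transforms[idx](*inputs)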
         super().__init__()
         self.transforms = transforms
         for idx, transform in enumerate(transforms):
diff --git a/torchvision/prototype/transforms/_meta.py b/torchvision/prototype/transforms/_meta.py
index 3675e1d8ada..207ef0b8242 100644
--- a/torchvision/prototype/transforms/_meta.py
+++ b/torchvision/prototype/transforms/_meta.py
@@ -4,7 +4,7 @@
 import torch
 from torchvision.prototype import features
 from torchvision.prototype.transforms import Transform, functional as F
-from torchvision.transforms.functional import convert_image_dtype
+from torchvision.transforms.functional import convert_image_dtype  # We should have our own alias for this on the new API


 class ConvertBoundingBoxFormat(Transform):
@@ -23,6 +23,10 @@ def _transform(self, input: Any, params: Dict[str, Any]) -> Any:


 class ConvertImageDtype(Transform):
+    # Question: Why do we have both this and a ToDtype transform? Is this due to BC? Ideally we would move people
+    # off methods that do an implicit normalization of the values (like this one, or to_tensor). cc @vfdev-5
+    # If that's the case, we should move this to _legacy and add deprecation warnings from day one to push people
+    # towards the new methods.
     def __init__(self, dtype: torch.dtype = torch.float32) -> None:
         super().__init__()
         self.dtype = dtype
@@ -59,7 +63,7 @@ def _transform(self, input: Any, params: Dict[str, Any]) -> Any:
             return features.Image.new_like(input, output, color_space=self.color_space)
         elif isinstance(input, torch.Tensor):
             if self.old_color_space is None:
-                raise RuntimeError("")
+                raise RuntimeError("")  # Add a better exception message

             return F.convert_image_color_space_tensor(
                 input, old_color_space=self.old_color_space, new_color_space=self.color_space
diff --git a/torchvision/prototype/transforms/_transform.py b/torchvision/prototype/transforms/_transform.py
index 923b90c6777..e364bdd8dc4 100644
--- a/torchvision/prototype/transforms/_transform.py
+++ b/torchvision/prototype/transforms/_transform.py
@@ -12,6 +12,10 @@ def __init__(self) -> None:
         super().__init__()
         _log_api_usage_once(self)

+    # FYI: I spoke with @NicolasHug offline and provided a valid use-case where it would be useful to make this
+    # public, as you originally intended. If we do end up exposing it publicly, we should rename it to avoid
+    # conflicts with the previous static get_params(), which worked completely differently. We don't have to do
+    # anything now, but we should discuss this again after the feedback.
     def _get_params(self, sample: Any) -> Dict[str, Any]:
         return dict()
diff --git a/torchvision/prototype/transforms/functional/__init__.py b/torchvision/prototype/transforms/functional/__init__.py
index e3fe60a7919..ec149362e3e 100644
--- a/torchvision/prototype/transforms/functional/__init__.py
+++ b/torchvision/prototype/transforms/functional/__init__.py
@@ -1,3 +1,9 @@
+# We should create an issue that lists the steps that need to be performed to roll the API out to main TorchVision.
+# I created a similar one for the models, see here: https://github.com/pytorch/vision/issues/4679
+# One of the key things we need to take care of is that all the kernels below will need logging. This is because
+# there will be no high-level kernel (like `F` on main) and we would instead need to put the tracking directly in
+# each low-level kernel, which will now be public (on main, functional_pil|tensor are private).
+
 from torchvision.transforms import InterpolationMode  # usort: skip
 from ._meta import (
     convert_bounding_box_format,
@@ -63,3 +69,13 @@
 )
 from ._misc import normalize_image_tensor, gaussian_blur_image_tensor
 from ._type_conversion import decode_image_with_pil, decode_video_with_av, label_to_one_hot
+
+# What are the migration plans for public methods without new API equivalents? There are two categories:
+# 1. Deprecated methods that have equivalents on the new API (_legacy.py?):
+#    - get_image_size, get_image_num_channels: use get_dimensions_image_tensor|pil
+#    - to_grayscale, rgb_to_grayscale: use convert_image_color_space_tensor|pil
+# 2. Methods without equivalents on the new API:
+#    - five_crop, ten_crop (must be added)
+#    - pil_to_tensor, to_pil_image (_legacy.py?)
+#    - to_tensor (deprecate? cc @vfdev-5)
+# We need a plan for handling both categories on the new API.
diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 080fe5da891..3c6f07d807d 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -66,6 +66,7 @@ def resize_segmentation_mask(


 # TODO: handle max_size
+# Is there an issue tracking this TODO?
 def resize_bounding_box(bounding_box: torch.Tensor, size: List[int], image_size: Tuple[int, int]) -> torch.Tensor:
     old_height, old_width = image_size
     new_height, new_width = size
@@ -220,6 +221,8 @@ def perspective_image_tensor(
     interpolation: InterpolationMode = InterpolationMode.BILINEAR,
     fill: Optional[List[float]] = None,
 ) -> torch.Tensor:
+    # We should go ahead and update the _FT API to accept InterpolationMode. It's currently considered private and
+    # there are no BC guarantees. This would let us stop calling `.value` in many of these places.
     return _FT.perspective(img, perspective_coeffs, interpolation=interpolation.value, fill=fill)


@@ -229,6 +232,7 @@ def perspective_image_pil(
     interpolation: InterpolationMode = InterpolationMode.BICUBIC,
     fill: Optional[List[float]] = None,
 ) -> PIL.Image.Image:
+    # Same thing here. We should move the pil_modes_mapping conversion to the `_FP` side.
     return _FP.perspective(img, perspective_coeffs, interpolation=pil_modes_mapping[interpolation], fill=fill)
diff --git a/torchvision/prototype/transforms/functional/_meta.py b/torchvision/prototype/transforms/functional/_meta.py
index 6ecb5aff257..3ba277cdfa5 100644
--- a/torchvision/prototype/transforms/functional/_meta.py
+++ b/torchvision/prototype/transforms/functional/_meta.py
@@ -58,12 +58,13 @@ def convert_bounding_box_format(


 def _grayscale_to_rgb_tensor(grayscale: torch.Tensor) -> torch.Tensor:
-    return grayscale.expand(3, 1, 1)
+    return grayscale.expand(3, 1, 1)  # This approach assumes a single image, not a batch


 def convert_image_color_space_tensor(
     image: torch.Tensor, old_color_space: ColorSpace, new_color_space: ColorSpace
 ) -> torch.Tensor:
+    # The new color_space should only be RGB or Grayscale.
     if new_color_space == old_color_space:
         return image.clone()

@@ -73,6 +74,12 @@
     if new_color_space == ColorSpace.GRAYSCALE:
         image = _FT.rgb_to_grayscale(image)

+    # We need a way to strip off alpha transparencies (see the sketch below):
+    # - RGBA => RGB
+    # - RGBA => Grayscale
+    # - Gray-Alpha => RGB
+    # - Gray-Alpha => Gray
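+    # A sketch of the tensor side, assuming channels-first images and the hypothetical RGBA enum
+    # value from _image.py. Whether to simply drop the alpha channel, as done here, or to
+    # composite it is an open design question:
+    #
+    #     if old_color_space == ColorSpace.RGBA:
+    #         image = image[..., :3, :, :]  # drop the alpha channel
+    #         old_color_space = ColorSpace.RGB  # then fall through to the existing conversions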
+
     return image


@@ -83,6 +90,7 @@ def _grayscale_to_rgb_pil(grayscale: PIL.Image.Image) -> PIL.Image.Image:

 def convert_image_color_space_pil(
     image: PIL.Image.Image, old_color_space: ColorSpace, new_color_space: ColorSpace
 ) -> PIL.Image.Image:
+    # The new color_space should only be RGB or Grayscale.
     if new_color_space == old_color_space:
         return image.copy()
diff --git a/torchvision/prototype/transforms/functional/_misc.py b/torchvision/prototype/transforms/functional/_misc.py
index fd0507cca4d..c1e5a702ceb 100644
--- a/torchvision/prototype/transforms/functional/_misc.py
+++ b/torchvision/prototype/transforms/functional/_misc.py
@@ -39,4 +39,5 @@ def gaussian_blur_image_tensor(


 def gaussian_blur_image_pil(img: PIL.Image, kernel_size: List[int], sigma: Optional[List[float]] = None) -> PIL.Image:
+    # It would really help to remove to_tensor from here (cc @vfdev-5); see the sketch below.
     return to_pil_image(gaussian_blur_image_tensor(to_tensor(img), kernel_size=kernel_size, sigma=sigma))
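+    # A rough sketch of what this could look like with the non-normalizing converters, assuming
+    # gaussian_blur_image_tensor keeps handling integer dtypes internally:
+    #
+    #     t_img = pil_to_tensor(img)
+    #     output = gaussian_blur_image_tensor(t_img, kernel_size=kernel_size, sigma=sigma)
+    #     return to_pil_image(output)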