From 409df6856f589d5c2b3d52378fadbd9b920f94a7 Mon Sep 17 00:00:00 2001 From: mantasu Date: Wed, 7 Feb 2024 10:50:38 +0000 Subject: [PATCH 1/7] Clarify default TVTensor shapes --- torchvision/tv_tensors/_bounding_boxes.py | 2 +- torchvision/tv_tensors/_image.py | 2 +- torchvision/tv_tensors/_mask.py | 2 +- torchvision/tv_tensors/_video.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torchvision/tv_tensors/_bounding_boxes.py b/torchvision/tv_tensors/_bounding_boxes.py index 56e77c2a85e..62997040c66 100644 --- a/torchvision/tv_tensors/_bounding_boxes.py +++ b/torchvision/tv_tensors/_bounding_boxes.py @@ -25,7 +25,7 @@ class BoundingBoxFormat(Enum): class BoundingBoxes(TVTensor): - """:class:`torch.Tensor` subclass for bounding boxes. + """:class:`torch.Tensor` subclass for bounding boxes with a shape of ``(N, 4)``. .. note:: There should be only one :class:`~torchvision.tv_tensors.BoundingBoxes` diff --git a/torchvision/tv_tensors/_image.py b/torchvision/tv_tensors/_image.py index c2f82c8d0df..c920f98fe78 100644 --- a/torchvision/tv_tensors/_image.py +++ b/torchvision/tv_tensors/_image.py @@ -9,7 +9,7 @@ class Image(TVTensor): - """:class:`torch.Tensor` subclass for images. + """:class:`torch.Tensor` subclass for images with a minimum shape of ``(C x H x W)``. .. note:: diff --git a/torchvision/tv_tensors/_mask.py b/torchvision/tv_tensors/_mask.py index a8f6f4d62cb..82ccb38a4a9 100644 --- a/torchvision/tv_tensors/_mask.py +++ b/torchvision/tv_tensors/_mask.py @@ -9,7 +9,7 @@ class Mask(TVTensor): - """:class:`torch.Tensor` subclass for segmentation and detection masks. + """:class:`torch.Tensor` subclass for segmentation and detection masks with a minimum shape of ``(H x W)``. Args: data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as diff --git a/torchvision/tv_tensors/_video.py b/torchvision/tv_tensors/_video.py index a0466b001ee..1f0c3317092 100644 --- a/torchvision/tv_tensors/_video.py +++ b/torchvision/tv_tensors/_video.py @@ -8,7 +8,7 @@ class Video(TVTensor): - """:class:`torch.Tensor` subclass for videos. + """:class:`torch.Tensor` subclass for videos with a minimum shape of ``(T x C x H x W)``. Args: data (tensor-like): Any data that can be turned into a tensor with :func:`torch.as_tensor`. From 2cacb30218d493905b13c8bc76e5e2db1aebf112 Mon Sep 17 00:00:00 2001 From: mantasu Date: Wed, 7 Feb 2024 11:13:26 +0000 Subject: [PATCH 2/7] Allow to_image to handle image paths --- torchvision/transforms/v2/_type_conversion.py | 8 ++++---- torchvision/transforms/v2/functional/_type_conversion.py | 9 +++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index 7c7439b1d02..3ed72ca070f 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -25,16 +25,16 @@ def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Ten class ToImage(Transform): - """Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.tv_tensors.Image` - ; this does not scale values. + """Convert a tensor, ndarray, PIL Image, or string representing image path to + :class:`~torchvision.tv_tensors.Image`; this does not scale values. This transform does not support torchscript. 
""" - _transformed_types = (is_pure_tensor, PIL.Image.Image, np.ndarray) + _transformed_types = (is_pure_tensor, PIL.Image.Image, np.ndarray, str) def _transform( - self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray], params: Dict[str, Any] + self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray, str], params: Dict[str, Any] ) -> tv_tensors.Image: return F.to_image(inpt) diff --git a/torchvision/transforms/v2/functional/_type_conversion.py b/torchvision/transforms/v2/functional/_type_conversion.py index c5a731fe143..7087666c81b 100644 --- a/torchvision/transforms/v2/functional/_type_conversion.py +++ b/torchvision/transforms/v2/functional/_type_conversion.py @@ -6,9 +6,11 @@ from torchvision import tv_tensors from torchvision.transforms import functional as _F +from ....io import read_image + @torch.jit.unused -def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray]) -> tv_tensors.Image: +def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray, str]) -> tv_tensors.Image: """See :class:`~torchvision.transforms.v2.ToImage` for details.""" if isinstance(inpt, np.ndarray): output = torch.from_numpy(np.atleast_3d(inpt)).permute((2, 0, 1)).contiguous() @@ -16,9 +18,12 @@ def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray]) -> tv_tenso output = pil_to_tensor(inpt) elif isinstance(inpt, torch.Tensor): output = inpt + elif isinstance(inpt, str): + output = read_image(inpt) else: raise TypeError( - f"Input can either be a pure Tensor, a numpy array, or a PIL image, but got {type(inpt)} instead." + f"Input can either be a pure Tensor, a numpy array, a PIL image, " + f"or a string representing image path, but got {type(inpt)} instead." ) return tv_tensors.Image(output) From 1fbcfc525aabd36ae981d92462d452a6ee75f1d7 Mon Sep 17 00:00:00 2001 From: mantasu Date: Wed, 7 Feb 2024 11:59:57 +0000 Subject: [PATCH 3/7] Add test_image_file for TestToImage --- test/test_transforms_v2.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 458f83f01c3..6d22e59d0ba 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -8,6 +8,7 @@ import random import re import sys +import tempfile from copy import deepcopy from pathlib import Path from unittest import mock @@ -1106,9 +1107,11 @@ def test_kernel_image(self, param, value, dtype, device): make_image(dtype=dtype, device=device), **{param: value}, check_scripted_vs_eager=not (param in {"shear", "fill"} and isinstance(value, (int, float))), - check_cuda_vs_cpu=dict(atol=1, rtol=0) - if dtype is torch.uint8 and param == "interpolation" and value is transforms.InterpolationMode.BILINEAR - else True, + check_cuda_vs_cpu=( + dict(atol=1, rtol=0) + if dtype is torch.uint8 and param == "interpolation" and value is transforms.InterpolationMode.BILINEAR + else True + ), ) @param_value_parametrization( @@ -5182,13 +5185,28 @@ def test_functional_and_transform(self, make_input, fn): if isinstance(input, torch.Tensor): assert output.data_ptr() == input.data_ptr() + @pytest.mark.parametrize("fn", [F.to_image, transform_cls_to_functional(transforms.ToImage)]) + def test_image_file(self, fn): + # Non-regression test for https://github.com/pytorch/vision/issues/8261 + img_np = np.random.randint(0, 256, (10, 10, 3), dtype=np.uint8) + temp_file = tempfile.NamedTemporaryFile(suffix=".jpg", delete=True) + PIL.Image.fromarray(img_np).save(temp_file.name) + + output = fn(temp_file.name) + assert 
isinstance(output, tv_tensors.Image) + assert F.get_size(output) == list(img_np.shape[:2]) + + temp_file.close() + def test_2d_np_array(self): # Non-regression test for https://github.com/pytorch/vision/issues/8255 input = np.random.rand(10, 10) assert F.to_image(input).shape == (1, 10, 10) def test_functional_error(self): - with pytest.raises(TypeError, match="Input can either be a pure Tensor, a numpy array, or a PIL image"): + with pytest.raises( + TypeError, match="Input can either be a pure Tensor, a numpy array, a PIL image, or a string path" + ): F.to_image(object()) From 4c1bde21688e22b3b781d313bb45827dfdfbb724 Mon Sep 17 00:00:00 2001 From: mantasu Date: Wed, 7 Feb 2024 14:03:02 +0000 Subject: [PATCH 4/7] Replace relative import with absolute --- torchvision/transforms/v2/functional/_type_conversion.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchvision/transforms/v2/functional/_type_conversion.py b/torchvision/transforms/v2/functional/_type_conversion.py index 7087666c81b..46392bf8f1e 100644 --- a/torchvision/transforms/v2/functional/_type_conversion.py +++ b/torchvision/transforms/v2/functional/_type_conversion.py @@ -4,10 +4,9 @@ import PIL.Image import torch from torchvision import tv_tensors +from torchvision.io import read_image from torchvision.transforms import functional as _F -from ....io import read_image - @torch.jit.unused def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray, str]) -> tv_tensors.Image: From e16b71ca877ad2bf26a8c39f1cd9d190217777d8 Mon Sep 17 00:00:00 2001 From: mantasu Date: Tue, 5 Mar 2024 14:11:21 +0000 Subject: [PATCH 5/7] Revert "Add test_image_file for TestToImage" (see #8261) This reverts commit 1fbcfc525aabd36ae981d92462d452a6ee75f1d7. --- test/test_transforms_v2.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 731d855d51a..0fb3ee6c11f 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -8,7 +8,6 @@ import random import re import sys -import tempfile from copy import deepcopy from pathlib import Path from unittest import mock @@ -1107,11 +1106,9 @@ def test_kernel_image(self, param, value, dtype, device): make_image(dtype=dtype, device=device), **{param: value}, check_scripted_vs_eager=not (param in {"shear", "fill"} and isinstance(value, (int, float))), - check_cuda_vs_cpu=( - dict(atol=1, rtol=0) - if dtype is torch.uint8 and param == "interpolation" and value is transforms.InterpolationMode.BILINEAR - else True - ), + check_cuda_vs_cpu=dict(atol=1, rtol=0) + if dtype is torch.uint8 and param == "interpolation" and value is transforms.InterpolationMode.BILINEAR + else True, ) @param_value_parametrization( @@ -5202,28 +5199,13 @@ def test_functional_and_transform(self, make_input, fn): if isinstance(input, torch.Tensor): assert output.data_ptr() == input.data_ptr() - @pytest.mark.parametrize("fn", [F.to_image, transform_cls_to_functional(transforms.ToImage)]) - def test_image_file(self, fn): - # Non-regression test for https://github.com/pytorch/vision/issues/8261 - img_np = np.random.randint(0, 256, (10, 10, 3), dtype=np.uint8) - temp_file = tempfile.NamedTemporaryFile(suffix=".jpg", delete=True) - PIL.Image.fromarray(img_np).save(temp_file.name) - - output = fn(temp_file.name) - assert isinstance(output, tv_tensors.Image) - assert F.get_size(output) == list(img_np.shape[:2]) - - temp_file.close() - def test_2d_np_array(self): # 
Non-regression test for https://github.com/pytorch/vision/issues/8255 input = np.random.rand(10, 10) assert F.to_image(input).shape == (1, 10, 10) def test_functional_error(self): - with pytest.raises( - TypeError, match="Input can either be a pure Tensor, a numpy array, a PIL image, or a string path" - ): + with pytest.raises(TypeError, match="Input can either be a pure Tensor, a numpy array, or a PIL image"): F.to_image(object()) From 9324a4f725fbeb036905e0bd526a92b3de0a7958 Mon Sep 17 00:00:00 2001 From: mantasu Date: Tue, 5 Mar 2024 14:16:41 +0000 Subject: [PATCH 6/7] Revert "Allow to_image to handle image paths" (see #8261) This reverts commit 2cacb30218d493905b13c8bc76e5e2db1aebf112. --- torchvision/transforms/v2/_type_conversion.py | 8 ++++---- torchvision/transforms/v2/functional/_type_conversion.py | 7 ++----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index 3ed72ca070f..7c7439b1d02 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -25,16 +25,16 @@ def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Ten class ToImage(Transform): - """Convert a tensor, ndarray, PIL Image, or string representing image path to - :class:`~torchvision.tv_tensors.Image`; this does not scale values. + """Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.tv_tensors.Image` + ; this does not scale values. This transform does not support torchscript. """ - _transformed_types = (is_pure_tensor, PIL.Image.Image, np.ndarray, str) + _transformed_types = (is_pure_tensor, PIL.Image.Image, np.ndarray) def _transform( - self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray, str], params: Dict[str, Any] + self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray], params: Dict[str, Any] ) -> tv_tensors.Image: return F.to_image(inpt) diff --git a/torchvision/transforms/v2/functional/_type_conversion.py b/torchvision/transforms/v2/functional/_type_conversion.py index 46392bf8f1e..089a751e05d 100644 --- a/torchvision/transforms/v2/functional/_type_conversion.py +++ b/torchvision/transforms/v2/functional/_type_conversion.py @@ -9,7 +9,7 @@ @torch.jit.unused -def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray, str]) -> tv_tensors.Image: +def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray]) -> tv_tensors.Image: """See :class:`~torchvision.transforms.v2.ToImage` for details.""" if isinstance(inpt, np.ndarray): output = torch.from_numpy(np.atleast_3d(inpt)).permute((2, 0, 1)).contiguous() @@ -17,12 +17,9 @@ def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray, str]) -> tv_ output = pil_to_tensor(inpt) elif isinstance(inpt, torch.Tensor): output = inpt - elif isinstance(inpt, str): - output = read_image(inpt) else: raise TypeError( - f"Input can either be a pure Tensor, a numpy array, a PIL image, " - f"or a string representing image path, but got {type(inpt)} instead." + f"Input can either be a pure Tensor, a numpy array, or a PIL image, but got {type(inpt)} instead." 
) return tv_tensors.Image(output) From 1cfe17df3300ba9057507fc442a481e731d52272 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 19 Apr 2024 12:56:07 +0100 Subject: [PATCH 7/7] minor changes --- torchvision/transforms/v2/functional/_type_conversion.py | 1 - torchvision/tv_tensors/_bounding_boxes.py | 2 +- torchvision/tv_tensors/_image.py | 2 +- torchvision/tv_tensors/_mask.py | 2 +- torchvision/tv_tensors/_video.py | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/torchvision/transforms/v2/functional/_type_conversion.py b/torchvision/transforms/v2/functional/_type_conversion.py index 089a751e05d..c5a731fe143 100644 --- a/torchvision/transforms/v2/functional/_type_conversion.py +++ b/torchvision/transforms/v2/functional/_type_conversion.py @@ -4,7 +4,6 @@ import PIL.Image import torch from torchvision import tv_tensors -from torchvision.io import read_image from torchvision.transforms import functional as _F diff --git a/torchvision/tv_tensors/_bounding_boxes.py b/torchvision/tv_tensors/_bounding_boxes.py index 62997040c66..ea02fa3dc7b 100644 --- a/torchvision/tv_tensors/_bounding_boxes.py +++ b/torchvision/tv_tensors/_bounding_boxes.py @@ -25,7 +25,7 @@ class BoundingBoxFormat(Enum): class BoundingBoxes(TVTensor): - """:class:`torch.Tensor` subclass for bounding boxes with a shape of ``(N, 4)``. + """:class:`torch.Tensor` subclass for bounding boxes with shape ``[N, 4]``. .. note:: There should be only one :class:`~torchvision.tv_tensors.BoundingBoxes` diff --git a/torchvision/tv_tensors/_image.py b/torchvision/tv_tensors/_image.py index c920f98fe78..2a0a2ec7209 100644 --- a/torchvision/tv_tensors/_image.py +++ b/torchvision/tv_tensors/_image.py @@ -9,7 +9,7 @@ class Image(TVTensor): - """:class:`torch.Tensor` subclass for images with a minimum shape of ``(C x H x W)``. + """:class:`torch.Tensor` subclass for images with shape ``[..., C, H, W]``. .. note:: diff --git a/torchvision/tv_tensors/_mask.py b/torchvision/tv_tensors/_mask.py index 82ccb38a4a9..ef9d96159fb 100644 --- a/torchvision/tv_tensors/_mask.py +++ b/torchvision/tv_tensors/_mask.py @@ -9,7 +9,7 @@ class Mask(TVTensor): - """:class:`torch.Tensor` subclass for segmentation and detection masks with a minimum shape of ``(H x W)``. + """:class:`torch.Tensor` subclass for segmentation and detection masks with shape ``[..., H, W]``. Args: data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as diff --git a/torchvision/tv_tensors/_video.py b/torchvision/tv_tensors/_video.py index 1f0c3317092..aa923e781ef 100644 --- a/torchvision/tv_tensors/_video.py +++ b/torchvision/tv_tensors/_video.py @@ -8,7 +8,7 @@ class Video(TVTensor): - """:class:`torch.Tensor` subclass for videos with a minimum shape of ``(T x C x H x W)``. + """:class:`torch.Tensor` subclass for videos with shape ``[..., T, C, H, W]``. Args: data (tensor-like): Any data that can be turned into a tensor with :func:`torch.as_tensor`.
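
For reference, below is a minimal usage sketch of the conventions the series ends up with after the reverts (``Image`` is ``[..., C, H, W]``, ``Mask`` is ``[..., H, W]``, ``Video`` is ``[..., T, C, H, W]``, ``BoundingBoxes`` is ``[N, 4]``, and ``to_image`` accepts only a tensor, ndarray, or PIL image). This is an illustrative sketch, not part of the patches, and assumes a torchvision build that ships the ``tv_tensors`` and ``transforms.v2`` namespaces:

import numpy as np
import torch
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F

# Image: shape [..., C, H, W]
img = tv_tensors.Image(torch.rand(3, 32, 32))

# BoundingBoxes: shape [N, 4]; format and canvas_size are required metadata
boxes = tv_tensors.BoundingBoxes(
    torch.tensor([[0, 0, 10, 10], [5, 5, 20, 20]]),
    format="XYXY",
    canvas_size=(32, 32),
)

# Mask: shape [..., H, W]
mask = tv_tensors.Mask(torch.zeros(32, 32, dtype=torch.uint8))

# Video: shape [..., T, C, H, W]
video = tv_tensors.Video(torch.rand(8, 3, 32, 32))

# to_image takes a pure tensor, a numpy array, or a PIL image (string paths
# are not accepted after the revert); an HWC ndarray becomes a CHW
# tv_tensors.Image without value scaling.
img_from_np = F.to_image(np.random.randint(0, 256, (10, 10, 3), dtype=np.uint8))

print(img.shape, boxes.shape, mask.shape, video.shape, img_from_np.shape)
# torch.Size([3, 32, 32]) torch.Size([2, 4]) torch.Size([32, 32])
# torch.Size([8, 3, 32, 32]) torch.Size([3, 10, 10])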