
Commit fad04f4

Merge branch 'main' into revamp-prototype-features-transforms
2 parents 8079e44 + ac1f0ff

13 files changed: +144, -48 lines

README.rst

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,8 @@ supported Python versions.
 +==========================+==========================+=================================+
 | ``main`` / ``nightly``   | ``main`` / ``nightly``   | ``>=3.7``, ``<=3.9``            |
 +--------------------------+--------------------------+---------------------------------+
+| ``1.10.2``               | ``0.11.3``               | ``>=3.6``, ``<=3.9``            |
++--------------------------+--------------------------+---------------------------------+
 | ``1.10.1``               | ``0.11.2``               | ``>=3.6``, ``<=3.9``            |
 +--------------------------+--------------------------+---------------------------------+
 | ``1.10.0``               | ``0.11.1``               | ``>=3.6``, ``<=3.9``            |
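
The new row records the pairing for this release: torchvision 0.11.3 is built against torch 1.10.2. Mixing versions across rows is a common source of import-time errors, so this table is the reference to check when pinning both packages.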

setup.py

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ def write_version_file():
     pytorch_dep += "==" + os.getenv("PYTORCH_VERSION")
 
 requirements = [
+    "typing_extensions",
     "numpy",
     "requests",
     pytorch_dep,
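
typing_extensions becomes a runtime dependency here because torchvision/transforms/functional_pil.py (below) now imports Literal from it; Literal only joined the standard typing module in Python 3.8, and torchvision still supports older interpreters.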
Binary file (939 Bytes) not shown.

test/test_models.py

Lines changed: 30 additions & 0 deletions
@@ -8,6 +8,7 @@
 import warnings
 from collections import OrderedDict
 from tempfile import TemporaryDirectory
+from typing import Any
 
 import pytest
 import torch
@@ -514,6 +515,35 @@ def test_generalizedrcnn_transform_repr():
     assert t.__repr__() == expected_string
 
 
+test_vit_conv_stem_configs = [
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=64),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=128),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=1, out_channels=128),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=256),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=1, out_channels=256),
+    models.vision_transformer.ConvStemConfig(kernel_size=3, stride=2, out_channels=512),
+]
+
+
+def vitc_b_16(**kwargs: Any):
+    return models.VisionTransformer(
+        image_size=224,
+        patch_size=16,
+        num_layers=12,
+        num_heads=12,
+        hidden_dim=768,
+        mlp_dim=3072,
+        conv_stem_configs=test_vit_conv_stem_configs,
+        **kwargs,
+    )
+
+
+@pytest.mark.parametrize("model_fn", [vitc_b_16])
+@pytest.mark.parametrize("dev", cpu_and_gpu())
+def test_vitc_models(model_fn, dev):
+    test_classification_model(model_fn, dev)
+
+
 @pytest.mark.parametrize("model_fn", get_models_from_module(models))
 @pytest.mark.parametrize("dev", cpu_and_gpu())
 def test_classification_model(model_fn, dev):
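
The new test wires a ViT-C variant (ViT-B/16 with a convolutional stem, per the paper referenced in vision_transformer.py below) through the shared classification-model test. A minimal standalone sketch of what it exercises, assuming a torchvision build containing this commit; the batch size of 2 is arbitrary:

import torch
from torchvision import models
from torchvision.models.vision_transformer import ConvStemConfig

# Six conv layers with strides 2, 2, 1, 2, 1, 2 (cumulative stride 16 = patch_size)
stem = [
    ConvStemConfig(kernel_size=3, stride=2, out_channels=64),
    ConvStemConfig(kernel_size=3, stride=2, out_channels=128),
    ConvStemConfig(kernel_size=3, stride=1, out_channels=128),
    ConvStemConfig(kernel_size=3, stride=2, out_channels=256),
    ConvStemConfig(kernel_size=3, stride=1, out_channels=256),
    ConvStemConfig(kernel_size=3, stride=2, out_channels=512),
]
model = models.VisionTransformer(
    image_size=224, patch_size=16, num_layers=12, num_heads=12,
    hidden_dim=768, mlp_dim=3072, conv_stem_configs=stem,
).eval()
with torch.no_grad():
    logits = model(torch.rand(2, 3, 224, 224))
assert logits.shape == (2, 1000)  # default num_classes=1000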

test/test_utils.py

Lines changed: 25 additions & 12 deletions
@@ -317,29 +317,42 @@ def test_draw_keypoints_errors():
         utils.draw_keypoints(image=img, keypoints=invalid_keypoints)
 
 
-def test_flow_to_image():
+@pytest.mark.parametrize("batch", (True, False))
+def test_flow_to_image(batch):
     h, w = 100, 100
     flow = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
     flow = torch.stack(flow[::-1], dim=0).float()
     flow[0] -= h / 2
     flow[1] -= w / 2
+
+    if batch:
+        flow = torch.stack([flow, flow])
+
     img = utils.flow_to_image(flow)
+    assert img.shape == ((2, 3, h, w) if batch else (3, h, w))
+
     path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "expected_flow.pt")
     expected_img = torch.load(path, map_location="cpu")
-    assert_equal(expected_img, img)
 
+    if batch:
+        expected_img = torch.stack([expected_img, expected_img])
+
+    assert_equal(expected_img, img)
 
-def test_flow_to_image_errors():
-    wrong_flow1 = torch.full((3, 10, 10), 0, dtype=torch.float)
-    wrong_flow2 = torch.full((2, 10), 0, dtype=torch.float)
-    wrong_flow3 = torch.full((2, 10, 30), 0, dtype=torch.int)
 
-    with pytest.raises(ValueError, match="Input flow should have shape"):
-        utils.flow_to_image(flow=wrong_flow1)
-    with pytest.raises(ValueError, match="Input flow should have shape"):
-        utils.flow_to_image(flow=wrong_flow2)
-    with pytest.raises(ValueError, match="Flow should be of dtype torch.float"):
-        utils.flow_to_image(flow=wrong_flow3)
+@pytest.mark.parametrize(
+    "input_flow, match",
+    (
+        (torch.full((3, 10, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((5, 3, 10, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((2, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((5, 2, 10), 0, dtype=torch.float), "Input flow should have shape"),
+        (torch.full((2, 10, 30), 0, dtype=torch.int), "Flow should be of dtype torch.float"),
+    ),
+)
+def test_flow_to_image_errors(input_flow, match):
+    with pytest.raises(ValueError, match=match):
+        utils.flow_to_image(flow=input_flow)
 
 
 if __name__ == "__main__":
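
The rewritten test covers the new batch dimension in utils.flow_to_image: the function now accepts either a single (2, H, W) flow field or a batch of shape (N, 2, H, W), returning a uint8 RGB image of shape (3, H, W) or (N, 3, H, W) respectively. A minimal usage sketch, assuming a build with this batch support; the random values are arbitrary:

import torch
from torchvision.utils import flow_to_image

flow = torch.randn(2, 100, 100)        # one flow field: (2, H, W), float
batched = torch.stack([flow, flow])    # batch of two: (N, 2, H, W)

img = flow_to_image(flow)
imgs = flow_to_image(batched)
assert img.shape == (3, 100, 100)      # single RGB visualization
assert imgs.shape == (2, 3, 100, 100)  # one visualization per batch element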

torchvision/datasets/hmdb51.py

Lines changed: 3 additions & 3 deletions
@@ -11,7 +11,7 @@
 
 class HMDB51(VisionDataset):
     """
-    `HMDB51 <http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/>`_
+    `HMDB51 <https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/>`_
     dataset.
 
     HMDB51 is an action recognition video dataset.
@@ -47,9 +47,9 @@ class HMDB51(VisionDataset):
         - label (int): class of the video clip
     """
 
-    data_url = "http://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/hmdb51_org.rar"
+    data_url = "https://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/hmdb51_org.rar"
     splits = {
-        "url": "http://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/test_train_splits.rar",
+        "url": "https://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/test_train_splits.rar",
         "md5": "15e67781e70dcfbdce2d7dbb9b3344b5",
     }
     TRAIN_TAG = 1

torchvision/datasets/stl10.py

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,5 @@
 import os.path
-from typing import Any, Callable, Optional, Tuple
+from typing import Any, Callable, Optional, Tuple, cast
 
 import numpy as np
 from PIL import Image
@@ -65,10 +65,12 @@ def __init__(
         self.labels: Optional[np.ndarray]
         if self.split == "train":
             self.data, self.labels = self.__loadfile(self.train_list[0][0], self.train_list[1][0])
+            self.labels = cast(np.ndarray, self.labels)
             self.__load_folds(folds)
 
         elif self.split == "train+unlabeled":
             self.data, self.labels = self.__loadfile(self.train_list[0][0], self.train_list[1][0])
+            self.labels = cast(np.ndarray, self.labels)
             self.__load_folds(folds)
             unlabeled_data, _ = self.__loadfile(self.train_list[2][0])
             self.data = np.concatenate((self.data, unlabeled_data))
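
The cast calls exist purely for the type checker: __loadfile returns labels as Optional[np.ndarray], and cast narrows that to np.ndarray before __load_folds consumes it. At runtime, cast(T, x) simply returns x. A short illustration of the pattern:

from typing import Optional, cast

import numpy as np

labels: Optional[np.ndarray] = np.zeros(5, dtype=np.int64)
narrowed = cast(np.ndarray, labels)  # no runtime effect; silences Optional warnings
print(narrowed.shape)  # (5,)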

torchvision/models/segmentation/deeplabv3.py

Lines changed: 3 additions & 3 deletions
@@ -6,7 +6,7 @@
 
 from .. import mobilenetv3
 from .. import resnet
-from ..feature_extraction import create_feature_extractor
+from .._utils import IntermediateLayerGetter
 from ._utils import _SimpleSegmentationModel, _load_weights
 from .fcn import FCNHead
 
@@ -121,7 +121,7 @@ def _deeplabv3_resnet(
     return_layers = {"layer4": "out"}
     if aux:
         return_layers["layer3"] = "aux"
-    backbone = create_feature_extractor(backbone, return_layers)
+    backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)
 
     aux_classifier = FCNHead(1024, num_classes) if aux else None
     classifier = DeepLabHead(2048, num_classes)
@@ -144,7 +144,7 @@ def _deeplabv3_mobilenetv3(
     return_layers = {str(out_pos): "out"}
     if aux:
         return_layers[str(aux_pos)] = "aux"
-    backbone = create_feature_extractor(backbone, return_layers)
+    backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)
 
     aux_classifier = FCNHead(aux_inplanes, num_classes) if aux else None
     classifier = DeepLabHead(out_inplanes, num_classes)
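
This swaps the FX-tracing-based create_feature_extractor back for the older IntermediateLayerGetter; the fcn.py and lraspp.py diffs below make the same substitution. IntermediateLayerGetter keeps the backbone's top-level children up to the last requested layer and returns the named intermediate activations in an OrderedDict, with no symbolic tracing involved. A minimal sketch of its behavior with a plain ResNet-50 (note it lives in the private module torchvision.models._utils):

import torch
from torchvision.models import resnet50
from torchvision.models._utils import IntermediateLayerGetter

backbone = resnet50()
getter = IntermediateLayerGetter(backbone, return_layers={"layer3": "aux", "layer4": "out"})

features = getter(torch.rand(1, 3, 224, 224))  # OrderedDict of tensors
print(features["aux"].shape)  # torch.Size([1, 1024, 14, 14])
print(features["out"].shape)  # torch.Size([1, 2048, 7, 7])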

torchvision/models/segmentation/fcn.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 from torch import nn
 
 from .. import resnet
-from ..feature_extraction import create_feature_extractor
+from .._utils import IntermediateLayerGetter
 from ._utils import _SimpleSegmentationModel, _load_weights
 
 
@@ -57,7 +57,7 @@ def _fcn_resnet(
     return_layers = {"layer4": "out"}
     if aux:
         return_layers["layer3"] = "aux"
-    backbone = create_feature_extractor(backbone, return_layers)
+    backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)
 
     aux_classifier = FCNHead(1024, num_classes) if aux else None
     classifier = FCNHead(2048, num_classes)

torchvision/models/segmentation/lraspp.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@
 
 from ...utils import _log_api_usage_once
 from .. import mobilenetv3
-from ..feature_extraction import create_feature_extractor
+from .._utils import IntermediateLayerGetter
 from ._utils import _load_weights
 
 
@@ -90,7 +90,7 @@ def _lraspp_mobilenetv3(backbone: mobilenetv3.MobileNetV3, num_classes: int) ->
     high_pos = stage_indices[-1]  # use C5 which has output_stride = 16
     low_channels = backbone[low_pos].out_channels
     high_channels = backbone[high_pos].out_channels
-    backbone = create_feature_extractor(backbone, {str(low_pos): "low", str(high_pos): "high"})
+    backbone = IntermediateLayerGetter(backbone, return_layers={str(low_pos): "low", str(high_pos): "high"})
 
     return LRASPP(backbone, low_channels, high_channels, num_classes)

torchvision/models/vision_transformer.py

Lines changed: 47 additions & 9 deletions
@@ -1,12 +1,13 @@
 import math
 from collections import OrderedDict
 from functools import partial
-from typing import Any, Callable, Optional
+from typing import Any, Callable, List, NamedTuple, Optional
 
 import torch
 import torch.nn as nn
 
 from .._internally_replaced_utils import load_state_dict_from_url
+from ..ops.misc import ConvNormActivation
 from ..utils import _log_api_usage_once
 
 __all__ = [
@@ -25,6 +26,14 @@
 }
 
 
+class ConvStemConfig(NamedTuple):
+    out_channels: int
+    kernel_size: int
+    stride: int
+    norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d
+    activation_layer: Callable[..., nn.Module] = nn.ReLU
+
+
 class MLPBlock(nn.Sequential):
     """Transformer MLP block."""
 
@@ -134,6 +143,7 @@ def __init__(
         num_classes: int = 1000,
         representation_size: Optional[int] = None,
         norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
+        conv_stem_configs: Optional[List[ConvStemConfig]] = None,
     ):
         super().__init__()
         _log_api_usage_once(self)
@@ -148,11 +158,31 @@ def __init__(
         self.representation_size = representation_size
         self.norm_layer = norm_layer
 
-        input_channels = 3
-
-        # The conv_proj is a more efficient version of reshaping, permuting
-        # and projecting the input
-        self.conv_proj = nn.Conv2d(input_channels, hidden_dim, kernel_size=patch_size, stride=patch_size)
+        if conv_stem_configs is not None:
+            # As per https://arxiv.org/abs/2106.14881
+            seq_proj = nn.Sequential()
+            prev_channels = 3
+            for i, conv_stem_layer_config in enumerate(conv_stem_configs):
+                seq_proj.add_module(
+                    f"conv_bn_relu_{i}",
+                    ConvNormActivation(
+                        in_channels=prev_channels,
+                        out_channels=conv_stem_layer_config.out_channels,
+                        kernel_size=conv_stem_layer_config.kernel_size,
+                        stride=conv_stem_layer_config.stride,
+                        norm_layer=conv_stem_layer_config.norm_layer,
+                        activation_layer=conv_stem_layer_config.activation_layer,
+                    ),
+                )
+                prev_channels = conv_stem_layer_config.out_channels
+            seq_proj.add_module(
+                "conv_last", nn.Conv2d(in_channels=prev_channels, out_channels=hidden_dim, kernel_size=1)
+            )
+            self.conv_proj: nn.Module = seq_proj
+        else:
+            self.conv_proj = nn.Conv2d(
+                in_channels=3, out_channels=hidden_dim, kernel_size=patch_size, stride=patch_size
+            )
 
         seq_length = (image_size // patch_size) ** 2
 
@@ -184,9 +214,17 @@ def __init__(
         self._init_weights()
 
     def _init_weights(self):
-        fan_in = self.conv_proj.in_channels * self.conv_proj.kernel_size[0] * self.conv_proj.kernel_size[1]
-        nn.init.trunc_normal_(self.conv_proj.weight, std=math.sqrt(1 / fan_in))
-        nn.init.zeros_(self.conv_proj.bias)
+        if isinstance(self.conv_proj, nn.Conv2d):
+            # Init the patchify stem
+            fan_in = self.conv_proj.in_channels * self.conv_proj.kernel_size[0] * self.conv_proj.kernel_size[1]
+            nn.init.trunc_normal_(self.conv_proj.weight, std=math.sqrt(1 / fan_in))
+            nn.init.zeros_(self.conv_proj.bias)
+        else:
+            # Init the last 1x1 conv of the conv stem
+            nn.init.normal_(
+                self.conv_proj.conv_last.weight, mean=0.0, std=math.sqrt(2.0 / self.conv_proj.conv_last.out_channels)
+            )
+            nn.init.zeros_(self.conv_proj.conv_last.bias)
 
         if hasattr(self.heads, "pre_logits"):
             fan_in = self.heads.pre_logits.in_features
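
One constraint implicit in the new stem path: seq_length is still computed as (image_size // patch_size) ** 2, so the conv stem's cumulative stride must equal patch_size or the positional embedding will not match the number of tokens the stem produces. The six-layer config in test_models.py above satisfies this; a quick check of the arithmetic:

import math

strides = [2, 2, 1, 2, 1, 2]  # strides of test_vit_conv_stem_configs above
assert math.prod(strides) == 16  # equals patch_size, so 224 // 16 = 14 tokens per side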

torchvision/transforms/functional_pil.py

Lines changed: 3 additions & 2 deletions
@@ -4,6 +4,7 @@
 import numpy as np
 import torch
 from PIL import Image, ImageOps, ImageEnhance
+from typing_extensions import Literal
 
 try:
     import accimage
@@ -130,7 +131,7 @@ def pad(
     img: Image.Image,
     padding: Union[int, List[int], Tuple[int, ...]],
     fill: Optional[Union[float, List[float], Tuple[float, ...]]] = 0,
-    padding_mode: str = "constant",
+    padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant",
 ) -> Image.Image:
 
     if not _is_pil_image(img):
@@ -189,7 +190,7 @@ def pad(
     if img.mode == "P":
         palette = img.getpalette()
         img = np.asarray(img)
-        img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right)), padding_mode)
+        img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right)), mode=padding_mode)
         img = Image.fromarray(img)
         img.putpalette(palette)
         return img
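
Two small tightenings here: padding_mode is narrowed from str to a Literal of the four modes this function supports, so a static checker can flag typos before they surface as runtime ValueErrors, and mode is passed to np.pad by keyword for readability. A sketch of what the Literal annotation buys (pad_modes_demo is a hypothetical helper, not part of torchvision):

from typing_extensions import Literal

PaddingMode = Literal["constant", "edge", "reflect", "symmetric"]

def pad_modes_demo(padding_mode: PaddingMode) -> str:
    return padding_mode

pad_modes_demo("reflect")  # accepted
pad_modes_demo("refelct")  # flagged by mypy as an invalid Literal value (still runs)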
