Commit d80c4eb

First rough implementation of ViTDet.

1 parent 48e5ad7
3 files changed: +483 −69 lines

3 files changed

+483
-69
lines changed

torchvision/models/detection/backbone_utils.py

Lines changed: 98 additions & 2 deletions
@@ -3,9 +3,14 @@

 from torch import nn, Tensor
 from torchvision.ops import misc as misc_nn_ops
-from torchvision.ops.feature_pyramid_network import ExtraFPNBlock, FeaturePyramidNetwork, LastLevelMaxPool
+from torchvision.ops.feature_pyramid_network import (
+    ExtraFPNBlock,
+    FeaturePyramidNetwork,
+    LastLevelMaxPool,
+    SimpleFeaturePyramidNetwork,
+)

-from .. import mobilenet, resnet
+from .. import mobilenet, resnet, vision_transformer
 from .._api import _get_enum_from_fn, WeightsEnum
 from .._utils import handle_legacy_interface, IntermediateLayerGetter

@@ -59,6 +64,46 @@ def forward(self, x: Tensor) -> Dict[str, Tensor]:
         return x


+class BackboneWithSimpleFPN(nn.Module):
+    """
+    Adds a Simple FPN on top of a model.
+
+    Args:
+        backbone (nn.Module): the backbone module, expected to return a single feature map.
+        in_channels (int): number of channels of the provided feature map.
+        out_channels (int): number of channels in the SFPN.
+        extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
+            be performed. It is expected to take the fpn features, the original
+            features and the names of the features as input, and return
+            a new list of feature maps and their corresponding names. By
+            default a ``LastLevelMaxPool`` is used.
+    """
+
+    def __init__(
+        self,
+        backbone: nn.Module,
+        in_channels: int,
+        out_channels: int,
+        extra_blocks: Optional[ExtraFPNBlock] = None,
+    ) -> None:
+        super().__init__()
+
+        if extra_blocks is None:
+            extra_blocks = LastLevelMaxPool()
+
+        self.backbone = backbone
+        self.fpn = SimpleFeaturePyramidNetwork(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            extra_blocks=extra_blocks,
+        )
+        self.out_channels = out_channels
+
+    def forward(self, x: Tensor) -> List[Tensor]:
+        x = self.backbone(x)
+        x = self.fpn(x)
+        return x
+
+
 @handle_legacy_interface(
     weights=(
         "pretrained",
@@ -240,3 +285,54 @@ def _mobilenet_extractor(
     )
     m.out_channels = out_channels  # type: ignore[assignment]
     return m
+
+
+def vit_sfpn_backbone(
+    *,
+    backbone_name: str,
+    weights: Optional[WeightsEnum],
+    extra_blocks: Optional[ExtraFPNBlock] = None,
+) -> BackboneWithSimpleFPN:
+    """
+    Constructs a specified ViT backbone with SFPN on top.
+
+    Examples::
+
+        >>> from torchvision.models.detection.backbone_utils import vit_sfpn_backbone
+        >>> backbone = vit_sfpn_backbone(backbone_name='vit_b_16', weights=ViT_B_16_Weights.DEFAULT)
+        >>> # get some dummy image
+        >>> x = torch.rand(1, 3, 64, 64)
+        >>> # compute the output
+        >>> output = backbone(x)
+        >>> print([(k, v.shape) for k, v in output.items()])
+        >>> # returns
+        >>> #   TODO
+
+    Args:
+        backbone_name (string): ViT architecture. Possible values are 'vit_b_16', 'vit_b_32', 'vit_l_16',
+            'vit_l_32' or 'vit_h_14'.
+        weights (WeightsEnum, optional): The pretrained weights for the model.
+        extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
+            be performed. It is expected to take the sfpn features, the original
+            features and the names of the original features as input, and return
+            a new list of feature maps and their corresponding names. By
+            default a ``LastLevelMaxPool`` is used.
+    """
+    backbone = vision_transformer.__dict__[backbone_name](weights=weights, include_head=False)
+    return _vit_sfpn_extractor(backbone, extra_blocks)
+
+
+def _vit_sfpn_extractor(
+    backbone: vision_transformer.VisionTransformer,
+    extra_blocks: Optional[ExtraFPNBlock] = None,
+) -> BackboneWithSimpleFPN:
+    # TODO: set trainable layers?
+
+    if extra_blocks is None:
+        extra_blocks = LastLevelMaxPool()
+
+    in_channels = backbone.hidden_dim
+    out_channels = 256
+    return BackboneWithSimpleFPN(
+        backbone, in_channels, out_channels, extra_blocks=extra_blocks
+    )
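
Since the expected output in the docstring above is still a TODO, here is a hedged smoke test for the new builder. The 256-channel maps follow from the out_channels = 256 set in _vit_sfpn_extractor; the exact output keys ('0'..'3' plus 'pool' from the default LastLevelMaxPool) are assumptions based on torchvision's usual FPN conventions, since SimpleFeaturePyramidNetwork is defined outside this diff.

import torch
from torchvision.models.detection.backbone_utils import vit_sfpn_backbone

# Hypothetical smoke test against this branch (weights=None, so nothing is
# downloaded). vit_b_16 is a patch-16 model with a 224x224 default image
# size, so a 224x224 input matches its position embeddings and yields a
# 768-channel stride-16 (14x14) map for the SFPN to expand.
backbone = vit_sfpn_backbone(backbone_name="vit_b_16", weights=None)
x = torch.rand(1, 3, 224, 224)
out = backbone(x)
# Assumed result: an ordered dict of 256-channel maps, keyed "0".."3" for
# the pyramid levels plus "pool" for the LastLevelMaxPool extra block.
for name, feat in out.items():
    print(name, feat.shape)
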

torchvision/models/detection/mask_rcnn.py

Lines changed: 95 additions & 1 deletion
@@ -10,8 +10,9 @@
 from .._meta import _COCO_CATEGORIES
 from .._utils import _ovewrite_value_param, handle_legacy_interface
 from ..resnet import resnet50, ResNet50_Weights
+from ..vision_transformer import ViT_B_16_Weights, vit_b_16
 from ._utils import overwrite_eps
-from .backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
+from .backbone_utils import _resnet_fpn_extractor, _vit_sfpn_extractor, _validate_trainable_layers
 from .faster_rcnn import _default_anchorgen, FasterRCNN, FastRCNNConvFCHead, RPNHead

@@ -585,3 +586,96 @@ def maskrcnn_resnet50_fpn_v2(
         model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))

     return model
+
+
+def maskrcnn_vit_b_16_sfpn(
+    *,
+    weights: Optional[WeightsEnum] = None,  # TODO: Change this to MaskRCNN_ViT_B_16_SFPN_V1_Weights
+    progress: bool = True,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[ViT_B_16_Weights] = ViT_B_16_Weights.IMAGENET1K_V1,
+    **kwargs: Any,
+) -> MaskRCNN:
+    """Mask R-CNN model with a ViT-B/16 backbone from the `ViT <https://arxiv.org/abs/2010.11929>`_ paper.
+
+    .. betastatus:: detection module
+
+    The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
+    image, and should be in ``0-1`` range. Different images can have different sizes.
+
+    The behavior of the model changes depending on whether it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors and the targets (a list of dictionaries),
+    containing:
+
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
+        - masks (``UInt8Tensor[N, H, W]``): the segmentation binary masks for each instance
+
+    The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
+    losses for both the RPN and the R-CNN, and the mask loss.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
+    follows, where ``N`` is the number of detected instances:
+
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (``Int64Tensor[N]``): the predicted labels for each instance
+        - scores (``Tensor[N]``): the scores of each instance
+        - masks (``UInt8Tensor[N, 1, H, W]``): the predicted masks for each instance, in ``0-1`` range. In order to
+          obtain the final segmentation masks, the soft masks can be thresholded, generally
+          with a value of 0.5 (``mask >= 0.5``)
+
+    For more details on the output and on how to plot the masks, you may refer to :ref:`instance_seg_output`.
+
+    Mask R-CNN is exportable to ONNX for a fixed batch size with input images of fixed size.
+
+    Example::
+
+        >>> model = torchvision.models.detection.maskrcnn_vit_b_16_sfpn(weights=MaskRCNN_ViT_B_16_SFPN_V1_Weights.DEFAULT)
+        >>> model.eval()
+        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> predictions = model(x)
+        >>>
+        >>> # optionally, if you want to export the model to ONNX:
+        >>> torch.onnx.export(model, x, "mask_rcnn.onnx", opset_version=11)
+
+    Args:
+        weights (:class:`~torchvision.models.detection.MaskRCNN_ViT_B_16_SFPN_V1_Weights`, optional): The
+            pretrained weights to use. See
+            :class:`~torchvision.models.detection.MaskRCNN_ViT_B_16_SFPN_V1_Weights` below for
+            more details, and possible values. By default, no pre-trained
+            weights are used.
+        progress (bool, optional): If True, displays a progress bar of the
+            download to stderr. Default is True.
+        num_classes (int, optional): number of output classes of the model (including the background)
+        weights_backbone (:class:`~torchvision.models.ViT_B_16_Weights`, optional): The
+            pretrained weights for the backbone.
+        **kwargs: parameters passed to the ``torchvision.models.detection.mask_rcnn.MaskRCNN``
+            base class. Please refer to the `source code
+            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/mask_rcnn.py>`_
+            for more details about this class.
+
+    .. autoclass:: torchvision.models.detection.MaskRCNN_ViT_B_16_SFPN_V1_Weights
+        :members:
+    """
+    # TODO: Add ViTDet weights.
+    # weights = MaskRCNN_ViT_B_16_SFPN_V1_Weights.verify(weights)
+    weights_backbone = ViT_B_16_Weights.verify(weights_backbone)
+
+    if weights is not None:
+        weights_backbone = None
+        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
+    elif num_classes is None:
+        num_classes = 91
+
+    backbone = vit_b_16(weights=weights_backbone, progress=progress, include_head=False)
+    backbone = _vit_sfpn_extractor(backbone)
+    model = MaskRCNN(backbone, num_classes=num_classes, **kwargs)
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress))
+
+    return model
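
A sketch of exercising the training path against this branch. The fixed 224x224 transform size is an assumption: vit_b_16 expects inputs matching its 224x224 image_size, while MaskRCNN's default GeneralizedRCNNTransform resizes to min_size=800, so the sketch pins min_size and max_size instead.

import torch
from torchvision.models.detection.mask_rcnn import maskrcnn_vit_b_16_sfpn

# Sketch only: weights_backbone=None avoids any download. min_size/max_size
# are forwarded through MaskRCNN's **kwargs to GeneralizedRCNNTransform so
# resized inputs stay 224x224 (a multiple of 32, so batching adds no
# padding), matching the ViT-B/16 position embeddings.
model = maskrcnn_vit_b_16_sfpn(weights_backbone=None, num_classes=3, min_size=224, max_size=224)
model.train()
images = [torch.rand(3, 224, 224)]
targets = [{
    "boxes": torch.tensor([[20.0, 30.0, 120.0, 140.0]]),   # one ground-truth box
    "labels": torch.tensor([1]),
    "masks": torch.zeros(1, 224, 224, dtype=torch.uint8),  # its binary mask
}]
loss_dict = model(images, targets)  # RPN, box, and mask losses
print(sorted(loss_dict))
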
