diff --git a/docs/source/models.rst b/docs/source/models.rst index 56dce6a76dc..e39b38faff3 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -431,6 +431,7 @@ Faster R-CNN MobileNetV3-Large FPN 32.8 - - Faster R-CNN MobileNetV3-Large 320 FPN 22.8 - - RetinaNet ResNet-50 FPN 36.4 - - SSD300 VGG16 25.1 - - +SSD512 ResNet-50 30.2 - - SSDlite320 MobileNetV3-Large 21.3 - - Mask R-CNN ResNet-50 FPN 37.9 34.6 - ====================================== ======= ======== =========== @@ -491,6 +492,7 @@ Faster R-CNN MobileNetV3-Large FPN 0.1020 0.0415 Faster R-CNN MobileNetV3-Large 320 FPN 0.0978 0.0376 0.6 RetinaNet ResNet-50 FPN 0.2514 0.0939 4.1 SSD300 VGG16 0.2093 0.0744 1.5 +SSD512 ResNet-50 0.2316 0.0772 3.0 SSDlite320 MobileNetV3-Large 0.1773 0.0906 1.5 Mask R-CNN ResNet-50 FPN 0.2728 0.0903 5.4 Keypoint R-CNN ResNet-50 FPN 0.3789 0.1242 6.8 @@ -515,6 +517,7 @@ SSD --- .. autofunction:: torchvision.models.detection.ssd300_vgg16 +.. autofunction:: torchvision.models.detection.ssd512_resnet50 SSDlite diff --git a/references/detection/README.md b/references/detection/README.md index ea5be6ea791..be105e02761 100644 --- a/references/detection/README.md +++ b/references/detection/README.md @@ -56,6 +56,14 @@ python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ --weight-decay 0.0005 --data-augmentation ssd ``` +### SSD512 ResNet-50 +``` +python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ + --dataset coco --model ssd512_resnet50 --epochs 120\ + --lr-steps 80 110 --aspect-ratio-group-factor 3 --lr 0.002 --batch-size 4\ + --weight-decay 0.0005 --data-augmentation ssd +``` + ### SSDlite320 MobileNetV3-Large ``` python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ diff --git a/test/expect/ModelTester.test_ssd512_resnet50_expect.pkl b/test/expect/ModelTester.test_ssd512_resnet50_expect.pkl new file mode 100644 index 00000000000..da8cf577652 Binary files /dev/null and 
class SSDFeatureExtractorResNet(nn.Module):
    """Adapts a torchvision ResNet into an SSD512 multi-feature-map backbone.

    The ResNet stem and its four residual stages produce the first (highest
    resolution) feature map; five extra convolutional blocks are appended to
    produce progressively smaller maps, for six feature maps in total.

    Args:
        backbone (resnet.ResNet): a Bottleneck-based ResNet (resnet50 or
            deeper) — the channel probe below reads ``bn3``, which only
            exists on Bottleneck blocks.
    """
    def __init__(self, backbone: resnet.ResNet):
        super().__init__()

        self.features = nn.Sequential(
            backbone.conv1,
            backbone.bn1,
            backbone.relu,
            backbone.maxpool,
            backbone.layer1,
            backbone.layer2,
            backbone.layer3,
            backbone.layer4,
        )

        # Patch the strides of layer4's first block (including its downsample
        # conv) to 1 so the backbone's final map keeps a valid spatial size
        # for the 512x512 input before the extra down-sampling blocks run.
        for m in self.features[-1][0].modules():
            if hasattr(m, 'stride'):
                m.stride = 1

        # Channel count of the backbone output, read off the last block's bn3.
        backbone_out_channels = self.features[-1][-1].bn3.num_features
        extra = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(backbone_out_channels, 256, kernel_size=1, bias=False),
                nn.BatchNorm2d(256),
                nn.ReLU(inplace=True),
                nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2, bias=False),
                nn.BatchNorm2d(512),
                nn.ReLU(inplace=True),
            ),
            nn.Sequential(
                nn.Conv2d(512, 256, kernel_size=1, bias=False),
                nn.BatchNorm2d(256),
                nn.ReLU(inplace=True),
                nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2, bias=False),
                nn.BatchNorm2d(512),
                nn.ReLU(inplace=True),
            ),
            nn.Sequential(
                nn.Conv2d(512, 128, kernel_size=1, bias=False),
                nn.BatchNorm2d(128),
                nn.ReLU(inplace=True),
                nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2, bias=False),
                nn.BatchNorm2d(256),
                nn.ReLU(inplace=True),
            ),
            nn.Sequential(
                nn.Conv2d(256, 128, kernel_size=1, bias=False),
                nn.BatchNorm2d(128),
                nn.ReLU(inplace=True),
                # no padding/stride: shrinks the map by 2 instead of halving it
                nn.Conv2d(128, 256, kernel_size=3, bias=False),
                nn.BatchNorm2d(256),
                nn.ReLU(inplace=True),
            ),
            nn.Sequential(
                nn.Conv2d(256, 128, kernel_size=1, bias=False),
                nn.BatchNorm2d(128),
                nn.ReLU(inplace=True),
                # 2x2 conv collapses the remaining map to the final 1x1-ish size
                nn.Conv2d(128, 256, kernel_size=2, bias=False),
                nn.BatchNorm2d(256),
                nn.ReLU(inplace=True),
            )
        ])
        _xavier_init(extra)
        self.extra = extra

    def forward(self, x: Tensor) -> Dict[str, Tensor]:
        """Return an ordered dict of six feature maps keyed '0'..'5'."""
        x = self.features(x)
        output = [x]

        # Each extra block consumes the previous map and adds a smaller one.
        for block in self.extra:
            x = block(x)
            output.append(x)

        return OrderedDict([(str(i), v) for i, v in enumerate(output)])


def _resnet_extractor(backbone_name: str, pretrained: bool, trainable_layers: int):
    """Build an SSD feature extractor from a named torchvision ResNet.

    Args:
        backbone_name (str): key into ``torchvision.models.resnet`` (e.g. "resnet50").
        pretrained (bool): load ImageNet weights for the backbone.
        trainable_layers (int): number of trainable (not frozen) stages,
            counted from the last one; must be in [0, 5].
    """
    backbone = resnet.__dict__[backbone_name](pretrained=pretrained)

    assert 0 <= trainable_layers <= 5
    # Freeze everything except the requested number of trailing stages.
    layers_to_train = ['layer4', 'layer3', 'layer2', 'layer1', 'conv1'][:trainable_layers]
    if trainable_layers == 5:
        # When the whole backbone trains, the stem BatchNorm trains too.
        layers_to_train.append('bn1')
    for name, parameter in backbone.named_parameters():
        if not any(name.startswith(layer) for layer in layers_to_train):
            parameter.requires_grad_(False)

    return SSDFeatureExtractorResNet(backbone)


def ssd512_resnet50(pretrained: bool = False, progress: bool = True, num_classes: int = 91,
                    pretrained_backbone: bool = True, trainable_backbone_layers: Optional[int] = None, **kwargs: Any):
    """
    Constructs an SSD model with input size 512x512 and a ResNet50 backbone. See `SSD` for more details.

    Example:

        >>> model = torchvision.models.detection.ssd512_resnet50(pretrained=True)
        >>> model.eval()
        >>> x = [torch.rand(3, 512, 512), torch.rand(3, 750, 600)]
        >>> predictions = model(x)

    Args:
        pretrained (bool): If True, returns a model pre-trained on COCO train2017
        progress (bool): If True, displays a progress bar of the download to stderr
        num_classes (int): number of output classes of the model (including the background)
        pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
        trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block.
            Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
    """
    if "size" in kwargs:
        warnings.warn("The size of the model is already fixed; ignoring the argument.")
        # Actually drop it: 'size' is bound positionally in the SSD() call
        # below, so forwarding it via **kwargs would raise a TypeError
        # ("got multiple values for argument 'size'") instead of ignoring it.
        kwargs.pop("size")

    trainable_backbone_layers = _validate_trainable_layers(
        pretrained or pretrained_backbone, trainable_backbone_layers, 5, 5)

    if pretrained:
        # No point in loading ImageNet weights that the COCO checkpoint overwrites.
        pretrained_backbone = False

    backbone = _resnet_extractor("resnet50", pretrained_backbone, trainable_backbone_layers)
    # One aspect-ratio set per feature map (6 maps); 7 scales bound the 6 box sizes.
    anchor_generator = DefaultBoxGenerator([[2], [2, 3], [2, 3], [2, 3], [2], [2]],
                                           scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05])
    model = SSD(backbone, anchor_generator, (512, 512), num_classes, **kwargs)
    if pretrained:
        weights_name = 'ssd512_resnet50_coco'
        if model_urls.get(weights_name, None) is None:
            raise ValueError("No checkpoint is available for model {}".format(weights_name))
        state_dict = load_state_dict_from_url(model_urls[weights_name], progress=progress)
        model.load_state_dict(state_dict)
    return model