1
1
import math
2
2
import warnings
3
3
from collections import OrderedDict
4
+ from functools import partial
4
5
from typing import Dict , List , Tuple , Optional
5
6
6
7
import torch
26
27
class FCOSHead (nn .Module ):
27
28
"""
28
29
A regression and classification head for use in FCOS.
30
+
29
31
Args:
30
32
in_channels (int): number of channels of the input feature
31
33
num_anchors (int): number of anchors to be predicted
@@ -117,6 +119,7 @@ def forward(self, x):
117
119
class FCOSClassificationHead (nn .Module ):
118
120
"""
119
121
A classification head for use in FCOS.
122
+
120
123
Args:
121
124
in_channels (int): number of channels of the input feature
122
125
num_anchors (int): number of anchors to be predicted
@@ -131,7 +134,7 @@ def __init__(self, in_channels, num_anchors, num_classes, num_convs=4, prior_pro
131
134
self .num_anchors = num_anchors
132
135
133
136
if norm_layer is None :
134
- norm_layer = lambda channels : nn .GroupNorm ( 32 , channels )
137
+ norm_layer = partial ( nn .GroupNorm , 32 )
135
138
136
139
conv = []
137
140
for _ in range (num_convs ):
@@ -149,8 +152,7 @@ def __init__(self, in_channels, num_anchors, num_classes, num_convs=4, prior_pro
149
152
torch .nn .init .normal_ (self .cls_logits .weight , std = 0.01 )
150
153
torch .nn .init .constant_ (self .cls_logits .bias , - math .log ((1 - prior_probability ) / prior_probability ))
151
154
152
- def forward (self , x ):
153
- # type: (List[Tensor]) -> Tensor
155
+ def forward (self , x : List [Tensor ]) -> Tensor :
154
156
all_cls_logits = []
155
157
156
158
for features in x :
@@ -171,6 +173,7 @@ def forward(self, x):
171
173
class FCOSRegressionHead (nn .Module ):
172
174
"""
173
175
A regression head for use in FCOS.
176
+
174
177
Args:
175
178
in_channels (int): number of channels of the input feature
176
179
num_anchors (int): number of anchors to be predicted
@@ -181,7 +184,7 @@ def __init__(self, in_channels, num_anchors, num_convs=4, norm_layer=None):
181
184
super ().__init__ ()
182
185
183
186
if norm_layer is None :
184
- norm_layer = lambda channels : nn .GroupNorm ( 32 , channels )
187
+ norm_layer = partial ( nn .GroupNorm , 32 )
185
188
186
189
conv = []
187
190
for _ in range (num_convs ):
@@ -201,8 +204,7 @@ def __init__(self, in_channels, num_anchors, num_convs=4, norm_layer=None):
201
204
torch .nn .init .normal_ (layer .weight , std = 0.01 )
202
205
torch .nn .init .zeros_ (layer .bias )
203
206
204
- def forward (self , x ):
205
- # type: (List[Tensor]) -> Tensor
207
+ def forward (self , x : List [Tensor ]) -> Tensor :
206
208
all_bbox_regression = []
207
209
all_bbox_ctrness = []
208
210
@@ -230,23 +232,29 @@ def forward(self, x):
230
232
class FCOS (nn .Module ):
231
233
"""
232
234
Implements FCOS.
235
+
233
236
The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
234
237
image, and should be in 0-1 range. Different images can have different sizes.
238
+
235
239
The behavior of the model changes depending on whether it is in training or evaluation mode.
240
+
236
241
During training, the model expects both the input tensors and the targets (a list of dictionaries),
237
242
containing:
238
243
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
239
244
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
240
245
- labels (Int64Tensor[N]): the class label for each ground-truth box
246
+
241
247
The model returns a Dict[str, Tensor] during training, containing the classification and regression
242
248
losses.
249
+
243
250
During inference, the model requires only the input tensors, and returns the post-processed
244
251
predictions as a List[Dict[str, Tensor]], one for each input image. The fields of the Dict are as
245
252
follows:
246
253
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
247
254
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
248
255
- labels (Int64Tensor[N]): the predicted labels for each image
249
256
- scores (Tensor[N]): the scores for each prediction
257
+
250
258
Args:
251
259
backbone (nn.Module): the network used to compute the features for the model.
252
260
It should contain an out_channels attribute, which indicates the number of output
@@ -272,7 +280,9 @@ class FCOS(nn.Module):
272
280
nms_thresh (float): NMS threshold used for postprocessing the detections.
273
281
detections_per_img (int): Number of best detections to keep after NMS.
274
282
topk_candidates (int): Number of best detections to keep before NMS.
283
+
275
284
Example:
285
+
276
286
>>> import torch
277
287
>>> import torchvision
278
288
>>> from torchvision.models.detection import FCOS
@@ -364,15 +374,23 @@ def __init__(
364
374
self ._has_warned = False
365
375
366
376
@torch .jit .unused
367
- def eager_outputs (self , losses , detections ):
368
- # type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
377
+ def eager_outputs (
378
+ self ,
379
+ losses : Dict [str , Tensor ],
380
+ detections : List [Dict [str , Tensor ]]
381
+ ) -> Tuple [Dict [str , Tensor ], List [Dict [str , Tensor ]]]:
369
382
if self .training :
370
383
return losses
371
384
372
385
return detections
373
386
374
- def compute_loss (self , targets , head_outputs , anchors , num_anchors_per_level ):
375
- # type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[int]) -> Dict[str, Tensor]
387
+ def compute_loss (
388
+ self ,
389
+ targets : List [Dict [str , Tensor ]],
390
+ head_outputs : Dict [str , Tensor ],
391
+ anchors : List [Tensor ],
392
+ num_anchors_per_level : List [int ],
393
+ ) -> Dict [str , Tensor ]:
376
394
matched_idxs = []
377
395
for anchors_per_image , targets_per_image in zip (anchors , targets ):
378
396
if targets_per_image ["boxes" ].numel () == 0 :
@@ -417,8 +435,12 @@ def compute_loss(self, targets, head_outputs, anchors, num_anchors_per_level):
417
435
418
436
return self .head .compute_loss (targets , head_outputs , anchors , matched_idxs , self .box_coder )
419
437
420
- def postprocess_detections (self , head_outputs , anchors , image_shapes ):
421
- # type: (Dict[str, List[Tensor]], List[List[Tensor]], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
438
+ def postprocess_detections (
439
+ self ,
440
+ head_outputs : Dict [str , List [Tensor ]],
441
+ anchors : List [List [Tensor ]],
442
+ image_shapes : List [Tuple [int , int ]]
443
+ ) -> List [Dict [str , Tensor ]]:
422
444
class_logits = head_outputs ["cls_logits" ]
423
445
box_regression = head_outputs ["bbox_regression" ]
424
446
box_ctrness = head_outputs ["bbox_ctrness" ]
@@ -484,12 +506,16 @@ def postprocess_detections(self, head_outputs, anchors, image_shapes):
484
506
485
507
return detections
486
508
487
- def forward (self , images , targets = None ):
488
- # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
509
+ def forward (
510
+ self ,
511
+ images : List [Tensor ],
512
+ targets : Optional [List [Dict [str , Tensor ]]] = None ,
513
+ ) -> Tuple [Dict [str , Tensor ], List [Dict [str , Tensor ]]]:
489
514
"""
490
515
Args:
491
516
images (list[Tensor]): images to be processed
492
517
targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)
518
+
493
519
Returns:
494
520
result (list[dict[str, Tensor]] or dict[str, Tensor]): the output from the model.
495
521
During training, it returns a dict[Tensor] which contains the losses.
@@ -570,14 +596,15 @@ def forward(self, images, targets=None):
570
596
571
597
if torch .jit .is_scripting ():
572
598
if not self ._has_warned :
573
- warnings .warn ("RetinaNet always returns a (Losses, Detections) tuple in scripting" )
599
+ warnings .warn ("FCOS always returns a (Losses, Detections) tuple in scripting" )
574
600
self ._has_warned = True
575
601
return losses , detections
576
602
return self .eager_outputs (losses , detections )
577
603
578
604
579
605
model_urls = {
580
- "fcos_resnet50_fpn_coco" : "" ,
606
+ "fcos_resnet50_fpn_coco" :
607
+ "https://github.com/o295/checkpoints/releases/download/coco/fcos_resnet50_fpn_coco-46080c1a.pth" ,
581
608
}
582
609
583
610
@@ -587,16 +614,20 @@ def fcos_resnet50_fpn(
587
614
"""
588
615
Constructs an FCOS model with a ResNet-50-FPN backbone.
589
616
Reference: `"FCOS: Fully Convolutional One-Stage Object Detection" <https://arxiv.org/abs/1904.01355>`_.
617
+
590
618
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
591
619
image, and should be in ``0-1`` range. Different images can have different sizes.
620
+
592
621
The behavior of the model changes depending on whether it is in training or evaluation mode.
622
+
593
623
During training, the model expects both the input tensors and the targets (a list of dictionaries),
594
624
containing:
595
625
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
596
626
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
597
627
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
598
628
The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
599
629
losses.
630
+
600
631
During inference, the model requires only the input tensors, and returns the post-processed
601
632
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
602
633
follows, where ``N`` is the number of detections:
@@ -605,11 +636,14 @@ def fcos_resnet50_fpn(
605
636
- labels (``Int64Tensor[N]``): the predicted labels for each detection
606
637
- scores (``Tensor[N]``): the scores of each detection
607
638
For more details on the output, you may refer to :ref:`instance_seg_output`.
608
- Example::
639
+
640
+ Example:
641
+
609
642
>>> model = torchvision.models.detection.fcos_resnet50_fpn(pretrained=True)
610
643
>>> model.eval()
611
644
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
612
645
>>> predictions = model(x)
646
+
613
647
Args:
614
648
pretrained (bool): If True, returns a model pre-trained on COCO train2017
615
649
progress (bool): If True, displays a progress bar of the download to stderr
0 commit comments