Update S3D weights (#6537)

datumbox · web-flow · commit 9b432d074a1c · 2022-09-05T14:18:47.000+01:00
* S3D weight deployment

* Update accuracies.

* Address review comments.
diff --git a/references/video_classification/README.md b/references/video_classification/README.md
@@ -81,6 +81,7 @@ Video resnet models:
 ```
 # number of frames per clip
 --clip_len 16 \ 
+--frame-rate 15 \
 # allow for temporal jittering
 --clips_per_video 5 \
 --batch-size 24 \
@@ -97,6 +98,21 @@ Video resnet models:
 --val-crop-size 112 112
 ```
 
+### S3D
+
+The S3D model was trained similarly to the above, but with the following changes to the default configuration:
+```
+--batch-size=12 --lr 0.2 --clip-len 64 --clips-per-video 5 --sync-bn \
+--train-resize-size 256 256 --train-crop-size 224 224 --val-resize-size 256 256 --val-crop-size 224 224
+```
+
+We used 64 GPUs to train the architecture.
+
+To estimate the validation statistics of the model, we ran the reference script with the following configuration:
+```
+--batch-size=16 --test-only --clip-len 128 --clips-per-video 1
+```
+
 ### Additional video modelling resources
 
 - [Video Model Zoo](https://github.com/facebookresearch/VMZ)
diff --git a/torchvision/models/video/s3d.py b/torchvision/models/video/s3d.py
@@ -104,7 +104,7 @@ class S3D(nn.Module):
     def __init__(
         self,
         num_classes: int = 400,
-        dropout: float = 0.0,
+        dropout: float = 0.2,
         norm_layer: Optional[Callable[..., torch.nn.Module]] = None,
     ) -> None:
         super().__init__()
@@ -153,28 +153,26 @@ def forward(self, x):
 
 class S3D_Weights(WeightsEnum):
     KINETICS400_V1 = Weights(
-        url="https://download.pytorch.org/models/s3d-1bd8ae63.pth",
+        url="https://download.pytorch.org/models/s3d-d76dad2f.pth",
         transforms=partial(
             VideoClassification,
             crop_size=(224, 224),
             resize_size=(256, 256),
-            mean=(0.5, 0.5, 0.5),
-            std=(0.5, 0.5, 0.5),
         ),
         meta={
             "min_size": (224, 224),
             "min_temporal_size": 14,
             "categories": _KINETICS400_CATEGORIES,
-            "recipe": "https://github.com/pytorch/vision/pull/6412#issuecomment-1219687434",
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/video_classification#s3d",
             "_docs": (
-                "The weights are ported from a community repository. The accuracies are estimated on clip-level "
+                "The weights aim to approximate the accuracy of the paper. The accuracies are estimated on clip-level "
                 "with parameters `frame_rate=15`, `clips_per_video=1`, and `clip_len=128`."
             ),
             "num_params": 8320048,
             "_metrics": {
                 "Kinetics-400": {
-                    "acc@1": 67.315,
-                    "acc@5": 87.593,
+                    "acc@1": 68.368,
+                    "acc@5": 88.050,
                 }
             },
         },