Commit 4d0980d

Merge branch 'main' into gdrive-virus-scan
2 parents 46a5512 + b7c59a0 commit 4d0980d

File tree

8 files changed: +163 additions, -48 deletions

README.rst

Lines changed: 7 additions & 0 deletions
@@ -185,3 +185,10 @@ Disclaimer on Datasets
 This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license.
 
 If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community!
+
+Pre-trained Model License
+=========================
+
+The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case.
+
+More specifically, SWAG models are released under the CC-BY-NC 4.0 license. See `SWAG LICENSE <https://github.com/facebookresearch/SWAG/blob/main/LICENSE>`_ for additional details.

references/classification/train.py

Lines changed: 24 additions & 6 deletions
@@ -229,12 +229,18 @@ def main(args):
 
     criterion = nn.CrossEntropyLoss(label_smoothing=args.label_smoothing)
 
-    if args.norm_weight_decay is None:
-        parameters = [p for p in model.parameters() if p.requires_grad]
-    else:
-        param_groups = torchvision.ops._utils.split_normalization_params(model)
-        wd_groups = [args.norm_weight_decay, args.weight_decay]
-        parameters = [{"params": p, "weight_decay": w} for p, w in zip(param_groups, wd_groups) if p]
+    custom_keys_weight_decay = []
+    if args.bias_weight_decay is not None:
+        custom_keys_weight_decay.append(("bias", args.bias_weight_decay))
+    if args.transformer_embedding_decay is not None:
+        for key in ["class_token", "position_embedding", "relative_position_bias"]:
+            custom_keys_weight_decay.append((key, args.transformer_embedding_decay))
+    parameters = utils.set_weight_decay(
+        model,
+        args.weight_decay,
+        norm_weight_decay=args.norm_weight_decay,
+        custom_keys_weight_decay=custom_keys_weight_decay if len(custom_keys_weight_decay) > 0 else None,
+    )
 
     opt_name = args.opt.lower()
     if opt_name.startswith("sgd"):
@@ -393,6 +399,18 @@ def get_args_parser(add_help=True):
         type=float,
         help="weight decay for Normalization layers (default: None, same value as --wd)",
     )
+    parser.add_argument(
+        "--bias-weight-decay",
+        default=None,
+        type=float,
+        help="weight decay for bias parameters of all layers (default: None, same value as --wd)",
+    )
+    parser.add_argument(
+        "--transformer-embedding-decay",
+        default=None,
+        type=float,
+        help="weight decay for embedding parameters for vision transformer models (default: None, same value as --wd)",
+    )
     parser.add_argument(
         "--label-smoothing", default=0.0, type=float, help="label smoothing (default: 0.0)", dest="label_smoothing"
    )

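For illustration, a minimal sketch of what the new flags produce, assuming a hypothetical toy module in place of a real ViT (`utils` here is references/classification/utils.py, where `set_weight_decay` is added below):

    import torch
    import utils  # references/classification/utils.py

    # Hypothetical stand-in for a ViT: one Linear (weight + bias) plus a
    # class_token parameter, so both custom keys below have a match.
    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(4, 4)
            self.class_token = torch.nn.Parameter(torch.zeros(1, 1, 4))

    # Roughly what --wd 1e-4 --bias-weight-decay 0 --transformer-embedding-decay 0
    # would build for this toy model (keys that match no parameter simply
    # produce empty groups, which are dropped).
    parameters = utils.set_weight_decay(
        Toy(),
        1e-4,
        norm_weight_decay=None,
        custom_keys_weight_decay=[("bias", 0.0), ("class_token", 0.0)],
    )
    for group in parameters:
        print(len(group["params"]), group["weight_decay"])
    # 1 0.0001   (linear.weight)
    # 1 0.0      (linear.bias)
    # 1 0.0      (class_token)
    optimizer = torch.optim.SGD(parameters, lr=0.1, momentum=0.9)
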
references/classification/utils.py

Lines changed: 63 additions & 0 deletions
@@ -5,6 +5,7 @@
 import os
 import time
 from collections import defaultdict, deque, OrderedDict
+from typing import List, Optional, Tuple
 
 import torch
 import torch.distributed as dist
@@ -400,3 +401,65 @@ def reduce_across_processes(val):
     dist.barrier()
     dist.all_reduce(t)
     return t
+
+
+def set_weight_decay(
+    model: torch.nn.Module,
+    weight_decay: float,
+    norm_weight_decay: Optional[float] = None,
+    norm_classes: Optional[List[type]] = None,
+    custom_keys_weight_decay: Optional[List[Tuple[str, float]]] = None,
+):
+    if not norm_classes:
+        norm_classes = [
+            torch.nn.modules.batchnorm._BatchNorm,
+            torch.nn.LayerNorm,
+            torch.nn.GroupNorm,
+            torch.nn.modules.instancenorm._InstanceNorm,
+            torch.nn.LocalResponseNorm,
+        ]
+    norm_classes = tuple(norm_classes)
+
+    params = {
+        "other": [],
+        "norm": [],
+    }
+    params_weight_decay = {
+        "other": weight_decay,
+        "norm": norm_weight_decay,
+    }
+    custom_keys = []
+    if custom_keys_weight_decay is not None:
+        for key, weight_decay in custom_keys_weight_decay:
+            params[key] = []
+            params_weight_decay[key] = weight_decay
+            custom_keys.append(key)
+
+    def _add_params(module, prefix=""):
+        for name, p in module.named_parameters(recurse=False):
+            if not p.requires_grad:
+                continue
+            is_custom_key = False
+            for key in custom_keys:
+                target_name = f"{prefix}.{name}" if prefix != "" and "." in key else name
+                if key == target_name:
+                    params[key].append(p)
+                    is_custom_key = True
+                    break
+            if not is_custom_key:
+                if norm_weight_decay is not None and isinstance(module, norm_classes):
+                    params["norm"].append(p)
+                else:
+                    params["other"].append(p)
+
+        for child_name, child_module in module.named_children():
+            child_prefix = f"{prefix}.{child_name}" if prefix != "" else child_name
+            _add_params(child_module, prefix=child_prefix)
+
+    _add_params(model)
+
+    param_groups = []
+    for key in params:
+        if len(params[key]) > 0:
+            param_groups.append({"params": params[key], "weight_decay": params_weight_decay[key]})
+    return param_groups

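A note on the matching rule inside `_add_params`: a bare key such as "bias" is compared against each parameter's leaf name, while a dotted key is compared against its full "child.path.name". A small sketch, assuming this module is importable as `utils`:

    import torch
    import utils  # references/classification/utils.py

    model = torch.nn.Sequential(
        torch.nn.Linear(8, 8),   # parameters "0.weight", "0.bias"
        torch.nn.LayerNorm(8),   # parameters "1.weight", "1.bias"
    )
    groups = utils.set_weight_decay(
        model,
        1e-4,                       # decay for the default "other" group
        norm_weight_decay=0.0,      # LayerNorm parameters land in "norm"
        custom_keys_weight_decay=[("0.bias", 0.0)],  # dotted: full path only
    )
    for g in groups:
        print(len(g["params"]), g["weight_decay"])
    # 1 0.0001   ("other": 0.weight)
    # 2 0.0      ("norm": 1.weight and 1.bias)
    # 1 0.0      ("0.bias")
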
test/test_extended_models.py

Lines changed: 2 additions & 1 deletion
@@ -115,7 +115,8 @@ def test_schema_meta_validation(model_fn):
                 incorrect_params.append(w)
         else:
             if w.meta.get("num_params") != weights_enum.DEFAULT.meta.get("num_params"):
-                incorrect_params.append(w)
+                if w.meta.get("num_params") != sum(p.numel() for p in model_fn(weights=w).parameters()):
+                    incorrect_params.append(w)
         if not w.name.isupper():
             bad_names.append(w)

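The fallback matters for weights whose parameter count legitimately differs from `DEFAULT` (for instance, the SWAG ViT checkpoints below are built for a larger image_size, which presumably changes the position-embedding shape); such weights now pass as long as the meta value matches the real count. The counting idiom itself, on a hypothetical two-layer model:

    import torch

    # Same idiom as the test: total elements across all parameter tensors.
    model = torch.nn.Sequential(torch.nn.Linear(10, 5), torch.nn.Linear(5, 2))
    print(sum(p.numel() for p in model.parameters()))  # (10*5 + 5) + (5*2 + 2) = 67
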
test/test_video_reader.py

Lines changed: 1 addition & 1 deletion
@@ -1225,7 +1225,7 @@ def test_invalid_file(self):
 
     @pytest.mark.parametrize("test_video", test_videos.keys())
     @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
-    @pytest.mark.parametrize("start_offset", [0, 1000])
+    @pytest.mark.parametrize("start_offset", [0, 500])
     @pytest.mark.parametrize("end_offset", [3000, None])
     def test_audio_present_pts(self, test_video, backend, start_offset, end_offset):
         """Test if audio frames are returned with pts unit."""

torchvision/io/_video_opt.py

Lines changed: 10 additions & 30 deletions
@@ -423,16 +423,6 @@ def _probe_video_from_memory(
     return info
 
 
-def _convert_to_sec(
-    start_pts: Union[float, Fraction], end_pts: Union[float, Fraction], pts_unit: str, time_base: Fraction
-) -> Tuple[Union[float, Fraction], Union[float, Fraction], str]:
-    if pts_unit == "pts":
-        start_pts = float(start_pts * time_base)
-        end_pts = float(end_pts * time_base)
-        pts_unit = "sec"
-    return start_pts, end_pts, pts_unit
-
-
 def _read_video(
     filename: str,
     start_pts: Union[float, Fraction] = 0,
@@ -452,38 +442,28 @@ def _read_video(
 
     has_video = info.has_video
     has_audio = info.has_audio
-    video_pts_range = (0, -1)
-    video_timebase = default_timebase
-    audio_pts_range = (0, -1)
-    audio_timebase = default_timebase
-    time_base = default_timebase
-
-    if has_video:
-        video_timebase = Fraction(info.video_timebase.numerator, info.video_timebase.denominator)
-        time_base = video_timebase
-
-    if has_audio:
-        audio_timebase = Fraction(info.audio_timebase.numerator, info.audio_timebase.denominator)
-        time_base = time_base if time_base else audio_timebase
-
-    # video_timebase is the default time_base
-    start_pts_sec, end_pts_sec, pts_unit = _convert_to_sec(start_pts, end_pts, pts_unit, time_base)
 
     def get_pts(time_base):
-        start_offset = start_pts_sec
-        end_offset = end_pts_sec
+        start_offset = start_pts
+        end_offset = end_pts
         if pts_unit == "sec":
-            start_offset = int(math.floor(start_pts_sec * (1 / time_base)))
+            start_offset = int(math.floor(start_pts * (1 / time_base)))
             if end_offset != float("inf"):
-                end_offset = int(math.ceil(end_pts_sec * (1 / time_base)))
+                end_offset = int(math.ceil(end_pts * (1 / time_base)))
         if end_offset == float("inf"):
             end_offset = -1
         return start_offset, end_offset
 
+    video_pts_range = (0, -1)
+    video_timebase = default_timebase
     if has_video:
+        video_timebase = Fraction(info.video_timebase.numerator, info.video_timebase.denominator)
         video_pts_range = get_pts(video_timebase)
 
+    audio_pts_range = (0, -1)
+    audio_timebase = default_timebase
     if has_audio:
+        audio_timebase = Fraction(info.audio_timebase.numerator, info.audio_timebase.denominator)
         audio_pts_range = get_pts(audio_timebase)
 
     vframes, aframes, info = _read_video_from_file(

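After the refactor, `get_pts` converts a timestamp in seconds into integer stream pts by dividing by the stream's time base, flooring the start and ceiling the end so the requested window is never shortened, with -1 as the sentinel for "to the end of the stream". A standalone sketch of that arithmetic (`to_pts_window` is a hypothetical helper mirroring `get_pts`, shown with a typical 1/90000 MPEG time base):

    import math
    from fractions import Fraction

    def to_pts_window(start_sec, end_sec, time_base):
        # Floor the start, ceil the end; -1 means "until the end of the stream".
        start_pts = int(math.floor(start_sec * (1 / time_base)))
        if end_sec == float("inf"):
            return start_pts, -1
        return start_pts, int(math.ceil(end_sec * (1 / time_base)))

    print(to_pts_window(1.0, 2.5, Fraction(1, 90000)))           # (90000, 225000)
    print(to_pts_window(0.0, float("inf"), Fraction(1, 90000)))  # (0, -1)
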
torchvision/io/video.py

Lines changed: 0 additions & 7 deletions
@@ -287,13 +287,6 @@ def read_video(
         with av.open(filename, metadata_errors="ignore") as container:
             if container.streams.audio:
                 audio_timebase = container.streams.audio[0].time_base
-            time_base = _video_opt.default_timebase
-            if container.streams.video:
-                time_base = container.streams.video[0].time_base
-            elif container.streams.audio:
-                time_base = container.streams.audio[0].time_base
-            # video_timebase is the default time_base
-            start_pts, end_pts, pts_unit = _video_opt._convert_to_sec(start_pts, end_pts, pts_unit, time_base)
             if container.streams.video:
                 video_frames = _read_from_stream(
                     container,

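With the up-front conversion removed here, the public entry point is unchanged: callers still pass pts_unit to read_video, and each stream reader applies its own time base. A usage sketch (the file path is a placeholder):

    import torchvision

    # Decode video and audio frames between two timestamps given in seconds.
    vframes, aframes, info = torchvision.io.read_video(
        "video.mp4", start_pts=1.0, end_pts=2.5, pts_unit="sec"
    )
    print(vframes.shape, aframes.shape, info)  # (T, H, W, C), (K, L), fps metadata
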
torchvision/models/vision_transformer.py

Lines changed: 56 additions & 3 deletions
@@ -1,7 +1,7 @@
 import math
 from collections import OrderedDict
 from functools import partial
-from typing import Any, Callable, List, NamedTuple, Optional
+from typing import Any, Callable, List, NamedTuple, Optional, Sequence
 
 import torch
 import torch.nn as nn
@@ -284,10 +284,21 @@ def _vision_transformer(
     progress: bool,
     **kwargs: Any,
 ) -> VisionTransformer:
-    image_size = kwargs.pop("image_size", 224)
-
     if weights is not None:
         _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
+        if isinstance(weights.meta["size"], int):
+            _ovewrite_named_param(kwargs, "image_size", weights.meta["size"])
+        elif isinstance(weights.meta["size"], Sequence):
+            if len(weights.meta["size"]) != 2 or weights.meta["size"][0] != weights.meta["size"][1]:
+                raise ValueError(
+                    f'size: {weights.meta["size"]} is not valid! Currently we only support a 2-dimensional square and width = height'
+                )
+            _ovewrite_named_param(kwargs, "image_size", weights.meta["size"][0])
+        else:
+            raise ValueError(
+                f'weights.meta["size"]: {weights.meta["size"]} is not valid, the type should be either an int or a Sequence[int]'
+            )
+    image_size = kwargs.pop("image_size", 224)
 
     model = VisionTransformer(
         image_size=image_size,
@@ -313,6 +324,14 @@ def _vision_transformer(
     "interpolation": InterpolationMode.BILINEAR,
 }
 
+_COMMON_SWAG_META = {
+    **_COMMON_META,
+    "publication_year": 2022,
+    "recipe": "https://github.com/facebookresearch/SWAG",
+    "license": "https://github.com/facebookresearch/SWAG/blob/main/LICENSE",
+    "interpolation": InterpolationMode.BICUBIC,
+}
+
 
 class ViT_B_16_Weights(WeightsEnum):
     IMAGENET1K_V1 = Weights(
@@ -328,6 +347,23 @@ class ViT_B_16_Weights(WeightsEnum):
             "acc@5": 95.318,
         },
     )
+    IMAGENET1K_SWAG_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_b_16_swag-9ac1b537.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=384,
+            resize_size=384,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "num_params": 86859496,
+            "size": (384, 384),
+            "min_size": (384, 384),
+            "acc@1": 85.304,
+            "acc@5": 97.650,
+        },
+    )
     DEFAULT = IMAGENET1K_V1
 
 
@@ -362,6 +398,23 @@ class ViT_L_16_Weights(WeightsEnum):
             "acc@5": 94.638,
         },
     )
+    IMAGENET1K_SWAG_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_l_16_swag-4f3808c9.pth",
+        transforms=partial(
+            ImageClassification,
+            crop_size=512,
+            resize_size=512,
+            interpolation=InterpolationMode.BICUBIC,
+        ),
+        meta={
+            **_COMMON_SWAG_META,
+            "num_params": 305174504,
+            "size": (512, 512),
+            "min_size": (512, 512),
+            "acc@1": 88.064,
+            "acc@5": 98.512,
+        },
+    )
     DEFAULT = IMAGENET1K_V1

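Putting the new enum entries to use — a minimal sketch, assuming the multi-weight API in this tree (`vit_b_16` accepts weights=, and `_vision_transformer` now derives image_size from meta["size"], so this builds a 384x384 model):

    from torchvision.models import vit_b_16, ViT_B_16_Weights

    # Load the new SWAG checkpoint; note its CC-BY-NC 4.0 license
    # (see the README change above).
    weights = ViT_B_16_Weights.IMAGENET1K_SWAG_V1
    model = vit_b_16(weights=weights).eval()
    preprocess = weights.transforms()  # 384px resize/crop, bicubic interpolation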