
Commit 81aa1b0

Haonan Sun authored and facebook-github-bot committed

Revert D33426965: [fbsync] Adding pretrained ViT weights (#5085)
Differential Revision: D33426965
Original commit changeset: 753ce1d1318d
Original Phabricator Diff: D33426965
fbshipit-source-id: db9a9f51c5365b2dd9c002aa681da0be33b3cb7d
1 parent 7a1a7e6 commit 81aa1b0

File tree

2 files changed: +14, -116 lines


references/classification/README.md

Lines changed: 0 additions & 54 deletions

@@ -143,60 +143,6 @@ torchrun --nproc_per_node=8 train.py\
 ```
 Here `$MODEL` is one of `regnet_x_32gf`, `regnet_y_16gf` and `regnet_y_32gf`.

-### Vision Transformer
-
-#### vit_b_16
-```
-torchrun --nproc_per_node=8 train.py\
-    --model vit_b_16 --epochs 300 --batch-size 512 --opt adamw --lr 0.003 --wd 0.3\
-    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\
-    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra\
-    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema
-```
-
-Note that the above command corresponds to training on a single node with 8 GPUs.
-For generatring the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs),
-and `--batch_size 64`.
-
-#### vit_b_32
-```
-torchrun --nproc_per_node=8 train.py\
-    --model vit_b_32 --epochs 300 --batch-size 512 --opt adamw --lr 0.003 --wd 0.3\
-    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\
-    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment imagenet\
-    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema
-```
-
-Note that the above command corresponds to training on a single node with 8 GPUs.
-For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs),
-and `--batch_size 256`.
-
-#### vit_l_16
-```
-torchrun --nproc_per_node=8 train.py\
-    --model vit_l_16 --epochs 600 --batch-size 128 --lr 0.5 --lr-scheduler cosineannealinglr\
-    --lr-warmup-method linear --lr-warmup-epochs 5 --label-smoothing 0.1 --mixup-alpha 0.2\
-    --auto-augment ta_wide --random-erase 0.1 --weight-decay 0.00002 --norm-weight-decay 0.0\
-    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema --val-resize-size 232
-```
-
-Note that the above command corresponds to training on a single node with 8 GPUs.
-For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs),
-and `--batch_size 64`.
-
-#### vit_l_32
-```
-torchrun --nproc_per_node=8 train.py\
-    --model vit_l_32 --epochs 300 --batch-size 512 --opt adamw --lr 0.003 --wd 0.3\
-    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\
-    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra\
-    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema
-```
-
-Note that the above command corresponds to training on a single node with 8 GPUs.
-For generatring the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs),
-and `--batch_size 64`.
-
 ## Mixed precision training
 Automatic Mixed Precision (AMP) training on GPU for Pytorch can be enabled with the [torch.cuda.amp](https://pytorch.org/docs/stable/amp.html?highlight=amp#module-torch.cuda.amp).
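
The per-GPU `--batch_size` values quoted in the removed notes keep the effective global batch size equal to that of the single-node commands; for example, for `vit_b_16`, 8 nodes x 8 GPUs x 64 images per GPU is 4096 images per step, the same as 8 GPUs x 512 on one node. A minimal sanity check of that arithmetic (illustrative only, not part of the commit):

```
# Illustrative only: the removed README notes scale the per-GPU batch size down as the
# GPU count grows, which keeps the global batch size of each recipe unchanged.
recipes = {
    "vit_b_16": {"single": (8, 512), "multi": (8 * 8, 64)},   # (gpus, per-gpu batch)
    "vit_b_32": {"single": (8, 512), "multi": (2 * 8, 256)},
    "vit_l_16": {"single": (8, 128), "multi": (2 * 8, 64)},
    "vit_l_32": {"single": (8, 512), "multi": (8 * 8, 64)},
}
for name, cfg in recipes.items():
    single = cfg["single"][0] * cfg["single"][1]
    multi = cfg["multi"][0] * cfg["multi"][1]
    assert single == multi, name
    print(f"{name}: global batch size {single}")
```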

torchvision/prototype/models/vision_transformer.py

Lines changed: 14 additions & 62 deletions

@@ -10,14 +10,12 @@
 import torch
 import torch.nn as nn
 from torch import Tensor
-from torchvision.prototype.transforms import ImageNetEval
-from torchvision.transforms.functional import InterpolationMode

 from ...utils import _log_api_usage_once
-from ._api import WeightsEnum, Weights
-from ._meta import _IMAGENET_CATEGORIES
+from ._api import WeightsEnum
 from ._utils import handle_legacy_interface

+
 __all__ = [
     "VisionTransformer",
     "ViT_B_16_Weights",
@@ -235,70 +233,24 @@ def forward(self, x: torch.Tensor):
         return x


-_COMMON_META = {
-    "categories": _IMAGENET_CATEGORIES,
-    "interpolation": InterpolationMode.BILINEAR,
-}
-
-
 class ViT_B_16_Weights(WeightsEnum):
-    ImageNet1K_V1 = Weights(
-        url="https://download.pytorch.org/models/vit_b_16-c867db91.pth",
-        transforms=partial(ImageNetEval, crop_size=224),
-        meta={
-            **_COMMON_META,
-            "size": (224, 224),
-            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_16",
-            "acc@1": 81.072,
-            "acc@5": 95.318,
-        },
-    )
-    default = ImageNet1K_V1
+    # If a default model is added here the corresponding changes need to be done in vit_b_16
+    pass


 class ViT_B_32_Weights(WeightsEnum):
-    ImageNet1K_V1 = Weights(
-        url="https://download.pytorch.org/models/vit_b_32-d86f8d99.pth",
-        transforms=partial(ImageNetEval, crop_size=224),
-        meta={
-            **_COMMON_META,
-            "size": (224, 224),
-            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_32",
-            "acc@1": 75.912,
-            "acc@5": 92.466,
-        },
-    )
-    default = ImageNet1K_V1
+    # If a default model is added here the corresponding changes need to be done in vit_b_32
+    pass


 class ViT_L_16_Weights(WeightsEnum):
-    ImageNet1K_V1 = Weights(
-        url="https://download.pytorch.org/models/vit_l_16-852ce7e3.pth",
-        transforms=partial(ImageNetEval, crop_size=224, resize_size=242),
-        meta={
-            **_COMMON_META,
-            "size": (224, 224),
-            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_16",
-            "acc@1": 79.662,
-            "acc@5": 94.638,
-        },
-    )
-    default = ImageNet1K_V1
+    # If a default model is added here the corresponding changes need to be done in vit_l_16
+    pass


 class ViT_L_32_Weights(WeightsEnum):
-    ImageNet1K_V1 = Weights(
-        url="https://download.pytorch.org/models/vit_l_32-c7638314.pth",
-        transforms=partial(ImageNetEval, crop_size=224),
-        meta={
-            **_COMMON_META,
-            "size": (224, 224),
-            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_32",
-            "acc@1": 76.972,
-            "acc@5": 93.07,
-        },
-    )
-    default = ImageNet1K_V1
+    # If a default model is added here the corresponding changes need to be done in vit_l_32
+    pass


 def _vision_transformer(
@@ -329,7 +281,7 @@ def _vision_transformer(
     return model


-@handle_legacy_interface(weights=("pretrained", ViT_B_16_Weights.ImageNet1K_V1))
+@handle_legacy_interface(weights=("pretrained", None))
 def vit_b_16(*, weights: Optional[ViT_B_16_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
     """
     Constructs a vit_b_16 architecture from
@@ -354,7 +306,7 @@ def vit_b_16(*, weights: Optional[ViT_B_16_Weights] = None, progress: bool = Tru
     )


-@handle_legacy_interface(weights=("pretrained", ViT_B_32_Weights.ImageNet1K_V1))
+@handle_legacy_interface(weights=("pretrained", None))
 def vit_b_32(*, weights: Optional[ViT_B_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
     """
     Constructs a vit_b_32 architecture from
@@ -379,7 +331,7 @@ def vit_b_32(*, weights: Optional[ViT_B_32_Weights] = None, progress: bool = Tru
     )


-@handle_legacy_interface(weights=("pretrained", ViT_L_16_Weights.ImageNet1K_V1))
+@handle_legacy_interface(weights=("pretrained", None))
 def vit_l_16(*, weights: Optional[ViT_L_16_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
     """
     Constructs a vit_l_16 architecture from
@@ -404,7 +356,7 @@ def vit_l_16(*, weights: Optional[ViT_L_16_Weights] = None, progress: bool = Tru
     )


-@handle_legacy_interface(weights=("pretrained", ViT_L_32_Weights.ImageNet1K_V1))
+@handle_legacy_interface(weights=("pretrained", None))
 def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
     """
     Constructs a vit_l_32 architecture from
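
After this revert the four ViT weight enums are empty, so the legacy `pretrained` flag no longer resolves to a checkpoint. A minimal usage sketch of the resulting prototype API (assuming the `torchvision.prototype.models` import path at this commit; the snippet is not part of the diff):

```
# Sketch of the post-revert behaviour; assumes the prototype import path at this commit.
from torchvision.prototype.models import ViT_B_16_Weights, vit_b_16

model = vit_b_16(weights=None)      # random initialization; nothing is downloaded
print(list(ViT_B_16_Weights))       # [] -- the enum has no members after the revert
# The legacy call vit_b_16(pretrained=True) likewise has no checkpoint to map to,
# because the handle_legacy_interface default for "pretrained" is now None.
```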

0 commit comments
