From 4a2e7121d28447551a5d621b4feb4ff97314a3d9 Mon Sep 17 00:00:00 2001
From: sallysyw
Date: Fri, 10 Dec 2021 19:24:30 +0000
Subject: [PATCH 1/5] Adding pretrained ViT weights

---
 .../prototype/models/vision_transformer.py | 46 +++++++++++++++----
 1 file changed, 38 insertions(+), 8 deletions(-)

diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py
index 83c8160afc4..0031a1c47f1 100644
--- a/torchvision/prototype/models/vision_transformer.py
+++ b/torchvision/prototype/models/vision_transformer.py
@@ -10,12 +10,14 @@
 import torch
 import torch.nn as nn
 from torch import Tensor
+from torchvision.prototype.transforms import ImageNetEval
+from torchvision.transforms.functional import InterpolationMode
 
 from ...utils import _log_api_usage_once
-from ._api import WeightsEnum
+from ._api import WeightsEnum, Weights
+from ._meta import _IMAGENET_CATEGORIES
 from ._utils import handle_legacy_interface
 
-
 __all__ = [
     "VisionTransformer",
     "ViT_B_16_Weights",
@@ -233,14 +235,33 @@ def forward(self, x: torch.Tensor):
         return x
 
 
+_COMMON_META = {
+    "categories": _IMAGENET_CATEGORIES,
+    "interpolation": InterpolationMode.BILINEAR,
+}
+
+
 class ViT_B_16_Weights(WeightsEnum):
-    # If a default model is added here the corresponding changes need to be done in vit_b_16
-    pass
+    ImageNet1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_b_16-0413b9bf.pth",
+        transforms=partial(ImageNetEval, crop_size=224),
+        meta={**_COMMON_META, "size": (224, 224), "acc@1": 80.004, "acc@5": 94.642},
+    )
+    default = ImageNet1K_V1
 
 
 class ViT_B_32_Weights(WeightsEnum):
-    # If a default model is added here the corresponding changes need to be done in vit_b_32
-    pass
+    ImageNet1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_b_32-65f3bea4.pth",
+        transforms=partial(ImageNetEval, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "size": (224, 224),
+            "acc@1": 75.622,
+            "acc@5": 92.19,
+        },
+    )
+    default = ImageNet1K_V1
 
 
 class ViT_L_16_Weights(WeightsEnum):
@@ -249,8 +270,17 @@
 
 
 class ViT_L_32_Weights(WeightsEnum):
-    # If a default model is added here the corresponding changes need to be done in vit_l_32
-    pass
+    ImageNet1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_l_32-cd2ba208.pth",
+        transforms=partial(ImageNetEval, crop_size=224),
+        meta={
+            **_COMMON_META,
+            "size": (224, 224),
+            "acc@1": 74.268,
+            "acc@5": 90.890,
+        },
+    )
+    default = ImageNet1K_V1
 
 
 def _vision_transformer(

From 64a28b7842fe4c5201ef052b538f174c193aabdd Mon Sep 17 00:00:00 2001
From: sallysyw
Date: Fri, 10 Dec 2021 22:17:54 +0000
Subject: [PATCH 2/5] Adding recipe as part of meta

---
 torchvision/prototype/models/vision_transformer.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py
index 0031a1c47f1..cd32f8ab951 100644
--- a/torchvision/prototype/models/vision_transformer.py
+++ b/torchvision/prototype/models/vision_transformer.py
@@ -245,7 +245,13 @@ class ViT_B_16_Weights(WeightsEnum):
     ImageNet1K_V1 = Weights(
         url="https://download.pytorch.org/models/vit_b_16-0413b9bf.pth",
         transforms=partial(ImageNetEval, crop_size=224),
-        meta={**_COMMON_META, "size": (224, 224), "acc@1": 80.004, "acc@5": 94.642},
+        meta={
+            **_COMMON_META,
+            "size": (224, 224),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#base-models",
+            "acc@1": 80.004,
+            "acc@5": 94.642,
+        },
     )
     default = ImageNet1K_V1
 
@@ -257,6 +263,7 @@ class ViT_B_32_Weights(WeightsEnum):
         meta={
             **_COMMON_META,
             "size": (224, 224),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#base-models",
             "acc@1": 75.622,
             "acc@5": 92.19,
         },
@@ -276,8 +283,9 @@ class ViT_L_32_Weights(WeightsEnum):
         meta={
             **_COMMON_META,
             "size": (224, 224),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#large-models-1",
             "acc@1": 74.268,
-            "acc@5": 90.890,
+            "acc@5": 90.89,
         },
     )
     default = ImageNet1K_V1

From 9c496252b12c924393497fb2450b529c780b06b1 Mon Sep 17 00:00:00 2001
From: sallysyw
Date: Fri, 31 Dec 2021 02:47:42 +0000
Subject: [PATCH 3/5] update checkpoints using best ema results

---
 .../prototype/models/vision_transformer.py | 32 ++++++++++++-------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py
index c81f8872d57..8d0d06eaff7 100644
--- a/torchvision/prototype/models/vision_transformer.py
+++ b/torchvision/prototype/models/vision_transformer.py
@@ -243,14 +243,14 @@ def forward(self, x: torch.Tensor):
 
 class ViT_B_16_Weights(WeightsEnum):
     ImageNet1K_V1 = Weights(
-        url="https://download.pytorch.org/models/vit_b_16-0413b9bf.pth",
+        url="https://download.pytorch.org/models/vit_b_16-c867db91.pth",
         transforms=partial(ImageNetEval, crop_size=224),
         meta={
             **_COMMON_META,
             "size": (224, 224),
             "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#base-models",
-            "acc@1": 80.004,
-            "acc@5": 94.642,
+            "acc@1": 81.072,
+            "acc@5": 95.318,
         },
     )
     default = ImageNet1K_V1
@@ -258,34 +258,44 @@
 
 class ViT_B_32_Weights(WeightsEnum):
     ImageNet1K_V1 = Weights(
-        url="https://download.pytorch.org/models/vit_b_32-65f3bea4.pth",
+        url="https://download.pytorch.org/models/vit_b_32-d86f8d99.pth",
         transforms=partial(ImageNetEval, crop_size=224),
         meta={
             **_COMMON_META,
             "size": (224, 224),
             "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#base-models",
-            "acc@1": 75.622,
-            "acc@5": 92.19,
+            "acc@1": 75.912,
+            "acc@5": 92.466,
         },
     )
     default = ImageNet1K_V1
 
 
 class ViT_L_16_Weights(WeightsEnum):
-    # If a default model is added here the corresponding changes need to be done in vit_l_16
-    pass
+    ImageNet1K_V1 = Weights(
+        url="https://download.pytorch.org/models/vit_l_16-852ce7e3.pth",
+        transforms=partial(ImageNetEval, crop_size=224, resize_size=242),
+        meta={
+            **_COMMON_META,
+            "size": (224, 224),
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#large-models-1",
+            "acc@1": 79.662,
+            "acc@5": 94.638,
+        },
+    )
+    default = ImageNet1K_V1
 
 
 class ViT_L_32_Weights(WeightsEnum):
     ImageNet1K_V1 = Weights(
-        url="https://download.pytorch.org/models/vit_l_32-cd2ba208.pth",
+        url="https://download.pytorch.org/models/vit_l_32-c7638314.pth",
         transforms=partial(ImageNetEval, crop_size=224),
         meta={
             **_COMMON_META,
             "size": (224, 224),
             "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#large-models-1",
-            "acc@1": 74.268,
-            "acc@5": 90.89,
+            "acc@1": 76.972,
+            "acc@5": 93.07,
         },
     )
     default = ImageNet1K_V1

From 0b500bb3fa73d2b4cd8f8cf46eb3c02c42dd0318 Mon Sep 17 00:00:00 2001
From: sallysyw
Date: Tue, 4 Jan 2022 03:37:44 +0000
Subject: [PATCH 4/5] Fix handle_legacy_interface and update recipe url

---
 .../prototype/models/vision_transformer.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/torchvision/prototype/models/vision_transformer.py b/torchvision/prototype/models/vision_transformer.py
index 8d0d06eaff7..a3b0ec8e7e9 100644
--- a/torchvision/prototype/models/vision_transformer.py
+++ b/torchvision/prototype/models/vision_transformer.py
@@ -248,7 +248,7 @@ class ViT_B_16_Weights(WeightsEnum):
         meta={
             **_COMMON_META,
             "size": (224, 224),
-            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#base-models",
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_16",
             "acc@1": 81.072,
             "acc@5": 95.318,
         },
@@ -263,7 +263,7 @@ class ViT_B_32_Weights(WeightsEnum):
         meta={
             **_COMMON_META,
             "size": (224, 224),
-            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#base-models",
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_32",
             "acc@1": 75.912,
             "acc@5": 92.466,
         },
@@ -278,7 +278,7 @@ class ViT_L_16_Weights(WeightsEnum):
         meta={
             **_COMMON_META,
             "size": (224, 224),
-            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#large-models-1",
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_16",
             "acc@1": 79.662,
             "acc@5": 94.638,
         },
@@ -293,7 +293,7 @@ class ViT_L_32_Weights(WeightsEnum):
         meta={
             **_COMMON_META,
             "size": (224, 224),
-            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#large-models-1",
+            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_32",
             "acc@1": 76.972,
             "acc@5": 93.07,
         },
@@ -329,7 +329,7 @@ def _vision_transformer(
     return model
 
 
-@handle_legacy_interface(weights=("pretrained", None))
+@handle_legacy_interface(weights=("pretrained", ViT_B_16_Weights.ImageNet1K_V1))
 def vit_b_16(*, weights: Optional[ViT_B_16_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
     """
     Constructs a vit_b_16 architecture from
@@ -354,7 +354,7 @@ def vit_b_16(*, weights: Optional[ViT_B_16_Weights] = None, progress: bool = Tru
     )
 
 
-@handle_legacy_interface(weights=("pretrained", None))
+@handle_legacy_interface(weights=("pretrained", ViT_B_32_Weights.ImageNet1K_V1))
 def vit_b_32(*, weights: Optional[ViT_B_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
     """
     Constructs a vit_b_32 architecture from
@@ -379,7 +379,7 @@ def vit_b_32(*, weights: Optional[ViT_B_32_Weights] = None, progress: bool = Tru
     )
 
 
-@handle_legacy_interface(weights=("pretrained", None))
+@handle_legacy_interface(weights=("pretrained", ViT_L_16_Weights.ImageNet1K_V1))
 def vit_l_16(*, weights: Optional[ViT_L_16_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
     """
     Constructs a vit_l_16 architecture from
@@ -404,7 +404,7 @@ def vit_l_16(*, weights: Optional[ViT_L_16_Weights] = None, progress: bool = Tru
     )
 
 
-@handle_legacy_interface(weights=("pretrained", None))
+@handle_legacy_interface(weights=("pretrained", ViT_L_32_Weights.ImageNet1K_V1))
 def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
     """
     Constructs a vit_l_32 architecture from

From 3aee34eedd208d8a2232b0949b917aa1faf1c54d Mon Sep 17 00:00:00 2001
From: sallysyw
Date: Wed, 5 Jan 2022 02:14:54 +0000
Subject: [PATCH 5/5] Update README

---
 references/classification/README.md | 54 +++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/references/classification/README.md b/references/classification/README.md
index a73fde3679f..48b20a30242 100644
--- a/references/classification/README.md
+++ b/references/classification/README.md
@@ -143,6 +143,60 @@ torchrun --nproc_per_node=8 train.py\
 ```
 Here `$MODEL` is one of `regnet_x_32gf`, `regnet_y_16gf` and `regnet_y_32gf`.
 
+### Vision Transformer
+
+#### vit_b_16
+```
+torchrun --nproc_per_node=8 train.py\
+    --model vit_b_16 --epochs 300 --batch-size 512 --opt adamw --lr 0.003 --wd 0.3\
+    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\
+    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra\
+    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema
+```
+
+Note that the above command corresponds to training on a single node with 8 GPUs.
+For generating the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs),
+and `--batch_size 64`.
+
+#### vit_b_32
+```
+torchrun --nproc_per_node=8 train.py\
+    --model vit_b_32 --epochs 300 --batch-size 512 --opt adamw --lr 0.003 --wd 0.3\
+    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\
+    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment imagenet\
+    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema
+```
+
+Note that the above command corresponds to training on a single node with 8 GPUs.
+For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs),
+and `--batch_size 256`.
+
+#### vit_l_16
+```
+torchrun --nproc_per_node=8 train.py\
+    --model vit_l_16 --epochs 600 --batch-size 128 --lr 0.5 --lr-scheduler cosineannealinglr\
+    --lr-warmup-method linear --lr-warmup-epochs 5 --label-smoothing 0.1 --mixup-alpha 0.2\
+    --auto-augment ta_wide --random-erase 0.1 --weight-decay 0.00002 --norm-weight-decay 0.0\
+    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema --val-resize-size 232
+```
+
+Note that the above command corresponds to training on a single node with 8 GPUs.
+For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs),
+and `--batch_size 64`.
+
+#### vit_l_32
+```
+torchrun --nproc_per_node=8 train.py\
+    --model vit_l_32 --epochs 300 --batch-size 512 --opt adamw --lr 0.003 --wd 0.3\
+    --lr-scheduler cosineannealinglr --lr-warmup-method linear --lr-warmup-epochs 30\
+    --lr-warmup-decay 0.033 --amp --label-smoothing 0.11 --mixup-alpha 0.2 --auto-augment ra\
+    --clip-grad-norm 1 --ra-sampler --cutmix-alpha 1.0 --model-ema
+```
+
+Note that the above command corresponds to training on a single node with 8 GPUs.
+For generating the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs),
+and `--batch_size 64`.
+
 ## Mixed precision training
 
 Automatic Mixed Precision (AMP) training on GPU for Pytorch can be enabled with the [torch.cuda.amp](https://pytorch.org/docs/stable/amp.html?highlight=amp#module-torch.cuda.amp).
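For reviewers, here is a minimal usage sketch of the API these patches introduce. It is an illustration rather than part of the diff, and it assumes the prototype `torchvision.prototype.models` namespace exports the builders and that each `Weights` entry exposes its `transforms` and `meta` fields as the enum definitions above suggest:

```python
import torch
from torchvision.prototype.models import vit_b_16, ViT_B_16_Weights

# New-style API: pick an explicit entry from the weights enum added in PATCH 1/5.
weights = ViT_B_16_Weights.ImageNet1K_V1
model = vit_b_16(weights=weights).eval()

# Each entry carries its own evaluation preset (ImageNetEval with crop_size=224).
preprocess = weights.transforms()

# Stand-in input; in practice this would be a decoded image.
batch = preprocess(torch.rand(3, 256, 256)).unsqueeze(0)

with torch.no_grad():
    scores = model(batch).softmax(dim=-1)

# meta carries the ImageNet categories plus the reported acc@1 / acc@5 values.
print(weights.meta["categories"][scores.argmax(dim=-1).item()])
print(weights.meta["acc@1"], weights.meta["acc@5"])

# Legacy call path: after PATCH 4/5, pretrained=True should resolve to
# ViT_B_16_Weights.ImageNet1K_V1 via handle_legacy_interface.
legacy_model = vit_b_16(pretrained=True)
```

The same pattern applies to `vit_b_32`, `vit_l_16` and `vit_l_32` with their respective weight enums.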
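The `ImageNetEval` preset itself is not shown in this diff. As a rough sketch of the preprocessing these checkpoints expect, the snippet below assumes the usual resize / center-crop / ImageNet-normalization recipe, bilinear interpolation as recorded in `_COMMON_META`, and a default resize size of 256 (the `vit_l_16` entry overrides it with `resize_size=242`); these defaults are assumptions, not taken from the patch:

```python
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

# Approximate stand-in for ImageNetEval(crop_size=224), applied to a PIL image.
# The 256-pixel resize and the ImageNet mean/std below are assumed defaults.
approx_eval_preset = transforms.Compose(
    [
        transforms.Resize(256, interpolation=InterpolationMode.BILINEAR),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
```

The acc@1/acc@5 numbers recorded in the meta dictionaries were measured with the actual preset, so an approximation like this may not reproduce them exactly.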