
Commit 03d1133

Adding vit_h_14 architecture (#5210)
* adding vit_h_14
* prototype and docs
* bug fix
* adding curl check
1 parent abc6c77 · commit 03d1133

5 files changed: +49 / -0 lines
docs/source/models.rst

Lines changed: 2 additions & 0 deletions

@@ -88,6 +88,7 @@ You can construct a model with random weights by calling its constructor:
     vit_b_32 = models.vit_b_32()
     vit_l_16 = models.vit_l_16()
     vit_l_32 = models.vit_l_32()
+    vit_h_14 = models.vit_h_14()
 
 We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`.
 These can be constructed by passing ``pretrained=True``:

@@ -460,6 +461,7 @@ VisionTransformer
     vit_b_32
     vit_l_16
     vit_l_32
+    vit_h_14
 
 Quantized Models
 ----------------
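As the updated docs show, the new variant is constructed with random weights like every other ViT model. A minimal usage sketch (the 1000-class output and the 224x224 default input size are assumptions carried over from torchvision's VisionTransformer defaults, not stated in this diff):

    import torch
    from torchvision import models

    model = models.vit_h_14()  # random weights; no checkpoint exists for ViT-H/14
    model.eval()

    x = torch.randn(1, 3, 224, 224)  # assumes the default image_size of 224
    with torch.no_grad():
        out = model(x)
    print(out.shape)  # torch.Size([1, 1000])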

hubconf.py

Lines changed: 1 addition & 0 deletions

@@ -63,4 +63,5 @@
     vit_b_32,
     vit_l_16,
     vit_l_32,
+    vit_h_14,
 )
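With vit_h_14 exported from hubconf.py, the model should also be reachable through torch.hub. A hedged sketch (fetches the pytorch/vision repo over the network):

    import torch

    # pretrained must stay False: no checkpoint URL is registered for vit_h_14
    model = torch.hub.load("pytorch/vision", "vit_h_14", pretrained=False)
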
(binary file, 939 Bytes, not shown)

torchvision/models/vision_transformer.py

Lines changed: 23 additions & 0 deletions

@@ -15,6 +15,7 @@
     "vit_b_32",
     "vit_l_16",
     "vit_l_32",
+    "vit_h_14",
 ]
 
 model_urls = {

@@ -260,6 +261,8 @@ def _vision_transformer(
     )
 
     if pretrained:
+        if arch not in model_urls:
+            raise ValueError(f"No checkpoint is available for model type '{arch}'!")
         state_dict = load_state_dict_from_url(model_urls[arch], progress=progress)
         model.load_state_dict(state_dict)
 
@@ -354,6 +357,26 @@ def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) ->
     )
 
 
+def vit_h_14(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer:
+    """
+    Constructs a vit_h_14 architecture from
+    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" <https://arxiv.org/abs/2010.11929>`_.
+
+    NOTE: Pretrained weights are not available for this model.
+    """
+    return _vision_transformer(
+        arch="vit_h_14",
+        patch_size=14,
+        num_layers=32,
+        num_heads=16,
+        hidden_dim=1280,
+        mlp_dim=5120,
+        pretrained=pretrained,
+        progress=progress,
+        **kwargs,
+    )
+
+
 def interpolate_embeddings(
     image_size: int,
     patch_size: int,
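The hyperparameters match the paper's ViT-Huge configuration: 32 encoder layers, 16 attention heads, hidden size 1280, MLP size 5120, and 14x14 patches. A quick sanity check of the resulting token sequence, assuming the default 224x224 input:

    image_size, patch_size, hidden_dim = 224, 14, 1280
    num_patches = (image_size // patch_size) ** 2  # 16 * 16 = 256 patches
    seq_len = num_patches + 1                      # plus one class token
    print(seq_len, hidden_dim)                     # 257 1280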

torchvision/prototype/models/vision_transformer.py

Lines changed: 23 additions & 0 deletions

@@ -19,10 +19,12 @@
     "ViT_B_32_Weights",
     "ViT_L_16_Weights",
     "ViT_L_32_Weights",
+    "ViT_H_14_Weights",
     "vit_b_16",
     "vit_b_32",
     "vit_l_16",
     "vit_l_32",
+    "vit_h_14",
 ]
 
 

@@ -99,6 +101,11 @@ class ViT_L_32_Weights(WeightsEnum):
     default = ImageNet1K_V1
 
 
+class ViT_H_14_Weights(WeightsEnum):
+    # Weights are not available yet.
+    pass
+
+
 def _vision_transformer(
     patch_size: int,
     num_layers: int,

@@ -192,3 +199,19 @@ def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = Tru
         progress=progress,
         **kwargs,
     )
+
+
+@handle_legacy_interface(weights=("pretrained", None))
+def vit_h_14(*, weights: Optional[ViT_H_14_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
+    weights = ViT_H_14_Weights.verify(weights)
+
+    return _vision_transformer(
+        patch_size=14,
+        num_layers=32,
+        num_heads=16,
+        hidden_dim=1280,
+        mlp_dim=5120,
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
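In the prototype multi-weight API, the legacy pretrained flag is mapped onto the weights argument by handle_legacy_interface; since ViT_H_14_Weights defines no entries yet, None (random initialization) is the only meaningful value. A hedged sketch against the prototype namespace:

    from torchvision.prototype.models import ViT_H_14_Weights, vit_h_14

    print(list(ViT_H_14_Weights))  # [] -- no checkpoints registered yet
    model = vit_h_14(weights=None)  # random initialization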
