Dropouts are more specific and aggressive in the paper; thanks for letting me know, @hilach70.

2026-02-27 06:30:21 +00:00 · 2020-10-14 05:48:27 -07:00
5 changed files with 20 additions and 37 deletions
--- a/README.md
+++ b/README.md
@@ -4,8 +4,6 @@

 Implementation of <a href="https://openreview.net/pdf?id=YicbFdNTTy">Vision Transformer</a>, a simple way to achieve SOTA in vision classification with only a single transformer encoder, in Pytorch. Significance is further explained in <a href="https://www.youtube.com/watch?v=TrdevFK_am4">Yannic Kilcher's</a> video. There's really not much to code here, but may as well lay it out for everyone so we expedite the attention revolution.

-For a Pytorch implementation with pretrained models, please see Ross Wightman's repository <a href="https://github.com/rwightman/pytorch-image-models">here</a>
-
 ## Install

 ```bash
@@ -128,23 +126,13 @@ Other sparse attention frameworks I would highly recommend is <a href="https://g
 ## Citations

 ```bibtex
-@misc{dosovitskiy2020image,
-    title   = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
-    author  = {Alexey Dosovitskiy and Lucas Beyer and Alexander Kolesnikov and Dirk Weissenborn and Xiaohua Zhai and Thomas Unterthiner and Mostafa Dehghani and Matthias Minderer and Georg Heigold and Sylvain Gelly and Jakob Uszkoreit and Neil Houlsby},
-    year    = {2020},
-    eprint  = {2010.11929},
-    archivePrefix = {arXiv},
-    primaryClass = {cs.CV}
-}
-```
-
-```bibtex
-@misc{vaswani2017attention,
-    title   = {Attention Is All You Need},
-    author  = {Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
-    year    = {2017},
-    eprint  = {1706.03762},
-    archivePrefix = {arXiv},
-    primaryClass = {cs.CL}
+@inproceedings{
+    anonymous2021an,
+    title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
+    author={Anonymous},
+    booktitle={Submitted to International Conference on Learning Representations},
+    year={2021},
+    url={https://openreview.net/forum?id=YicbFdNTTy},
+    note={under review}
 }
 ```
--- a/examples/cats_and_dogs.ipynb
+++ b/examples/cats_and_dogs.ipynb
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '0.2.7',
+  version = '0.2.2',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  author = 'Phil Wang',
--- a/vit_pytorch/efficient.py
+++ b/vit_pytorch/efficient.py
@@ -1,5 +1,5 @@
 import torch
-from einops import rearrange, repeat
+from einops import rearrange
 from torch import nn

 class ViT(nn.Module):
@@ -30,11 +30,10 @@ class ViT(nn.Module):

        x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
        x = self.patch_to_embedding(x)
-        b, n, _ = x.shape

-        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
+        cls_tokens = self.cls_token.expand(img.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
-        x += self.pos_embedding[:, :(n + 1)]
+        x += self.pos_embedding
        x = self.transformer(x)

        x = self.to_cls_token(x[:, 0])
--- a/vit_pytorch/vit_pytorch.py
+++ b/vit_pytorch/vit_pytorch.py
@@ -1,10 +1,8 @@
 import torch
 import torch.nn.functional as F
-from einops import rearrange, repeat
+from einops import rearrange
 from torch import nn

-MIN_NUM_PATCHES = 16
-
 class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
@@ -47,17 +45,16 @@ class Attention(nn.Module):

    def forward(self, x, mask = None):
        b, n, _, h = *x.shape, self.heads
-        qkv = self.to_qkv(x).chunk(3, dim = -1)
-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
+        qkv = self.to_qkv(x)
+        q, k, v = rearrange(qkv, 'b n (qkv h d) -> qkv b h n d', qkv = 3, h = h)

        dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale
-        mask_value = -torch.finfo(dots.dtype).max

        if mask is not None:
            mask = F.pad(mask.flatten(1), (1, 0), value = True)
            assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
            mask = mask[:, None, :] * mask[:, :, None]
-            dots.masked_fill_(~mask, mask_value)
+            dots.masked_fill_(~mask, float('-inf'))
            del mask

        attn = dots.softmax(dim=-1)
@@ -88,7 +85,6 @@ class ViT(nn.Module):
        assert image_size % patch_size == 0, 'image dimensions must be divisible by the patch size'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = channels * patch_size ** 2
-        assert num_patches > MIN_NUM_PATCHES, f'your number of patches ({num_patches}) is way too small for attention to be effective. try decreasing your patch size'

        self.patch_size = patch_size

@@ -106,7 +102,8 @@ class ViT(nn.Module):
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
-            nn.Linear(mlp_dim, num_classes)
+            nn.Linear(mlp_dim, num_classes),
+            nn.Dropout(dropout)
        )

    def forward(self, img, mask = None):
@@ -114,11 +111,10 @@ class ViT(nn.Module):

        x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
        x = self.patch_to_embedding(x)
-        b, n, _ = x.shape

-        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
+        cls_tokens = self.cls_token.expand(img.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
-        x += self.pos_embedding[:, :(n + 1)]
+        x += self.pos_embedding
        x = self.dropout(x)

        x = self.transformer(x, mask)