1.4.2

add a simple vit with qknorm, since authors seem to be promoting the technique on twitter
fix linear head in simple vit, thanks to @atkos
2025-12-30 08:02:29 +00:00 · 2023-08-14 07:59:35 -07:00 · 2023-08-14 07:58:45 -07:00 · 2023-08-10 14:36:21 -07:00 · 2023-08-09 07:53:38 -07:00
9 changed files with 179 additions and 54 deletions
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '1.2.9',
+  version = '1.4.2',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  long_description_content_type = 'text/markdown',
--- a/vit_pytorch/simple_vit.py
+++ b/vit_pytorch/simple_vit.py
@@ -64,6 +64,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -74,7 +75,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

 class SimpleViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
@@ -101,12 +102,10 @@ class SimpleViT(nn.Module):

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

-        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
        self.pool = "mean"
+        self.to_latent = nn.Identity()
+
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        device = img.device
--- a/vit_pytorch/simple_vit_1d.py
+++ b/vit_pytorch/simple_vit_1d.py
@@ -62,6 +62,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -72,7 +73,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

 class SimpleViT(nn.Module):
    def __init__(self, *, seq_len, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
@@ -93,10 +94,7 @@ class SimpleViT(nn.Module):
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, series):
        *_, n, dtype = *series.shape, series.dtype
--- a/vit_pytorch/simple_vit_3d.py
+++ b/vit_pytorch/simple_vit_3d.py
@@ -77,6 +77,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -87,7 +88,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

 class SimpleViT(nn.Module):
    def __init__(self, *, image_size, image_patch_size, frames, frame_patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
@@ -111,10 +112,7 @@ class SimpleViT(nn.Module):
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, video):
        *_, h, w, dtype = *video.shape, video.dtype
--- a/vit_pytorch/simple_vit_with_patch_dropout.py
+++ b/vit_pytorch/simple_vit_with_patch_dropout.py
@@ -87,6 +87,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -97,7 +98,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

 class SimpleViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, patch_dropout = 0.5):
@@ -122,10 +123,7 @@ class SimpleViT(nn.Module):
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        *_, h, w, dtype = *img.shape, img.dtype
--- a/vit_pytorch/simple_vit_with_qk_norm.py
+++ b/vit_pytorch/simple_vit_with_qk_norm.py
@@ -0,0 +1,141 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from einops import rearrange
+from einops.layers.torch import Rearrange
+
+# helpers
+
+def pair(t):  # normalize a size argument: tuples pass through unchanged, anything else becomes (t, t)
+    return t if isinstance(t, tuple) else (t, t)
+
+def posemb_sincos_2d(h, w, dim, temperature: int = 10000, dtype = torch.float32):  # fixed 2d sin-cos position table with one row per cell of an h x w grid
+    y, x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")  # row / column index of every grid cell
+    assert (dim % 4) == 0, "feature dimension must be multiple of 4 for sincos emb"
+    omega = torch.arange(dim // 4) / (dim // 4 - 1)  # dim // 4 evenly spaced exponents from 0 to 1; NOTE(review): dim == 4 makes this denominator 0 - confirm whether that case must be supported
+    omega = 1.0 / (temperature ** omega)  # per-channel frequencies, from 1 down to 1 / temperature
+
+    y = y.flatten()[:, None] * omega[None, :]  # (h * w, dim // 4) phases for the row coordinate
+    x = x.flatten()[:, None] * omega[None, :]  # (h * w, dim // 4) phases for the column coordinate
+    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim=1)  # four quarters side by side -> (h * w, dim)
+    return pe.type(dtype)
+
+# they use a query-key normalization that is equivalent to rms norm (no mean-centering, learned gamma), from vit 22B paper
+
+# in a recent tweet, the authors seem to claim more stable training at higher learning rates
+# unsure whether this has taken off within Brain, or whether it has some hidden drawback
+
+class RMSNorm(nn.Module):  # l2-normalizes the last axis, then rescales by sqrt(dim) and a learned per-head gain
+    def __init__(self, heads, dim):
+        super().__init__()
+        self.scale = dim ** 0.5
+        self.gamma = nn.Parameter(torch.ones(heads, 1, dim) / self.scale)  # initialized to 1 / sqrt(dim), so scale * gamma starts at 1
+
+    def forward(self, x):
+        normed = F.normalize(x, dim = -1)  # unit l2 norm along the last axis
+        return normed * self.scale * self.gamma  # gamma is (heads, 1, dim) and broadcasts; presumably x is (batch, heads, n, dim) - confirm against caller
+
+# classes
+
+class FeedForward(nn.Module):  # pre-norm MLP: LayerNorm -> Linear -> GELU -> Linear, mapping dim back to dim
+    def __init__(self, dim, hidden_dim):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.LayerNorm(dim),  # normalization lives inside the block rather than in a wrapper
+            nn.Linear(dim, hidden_dim),
+            nn.GELU(),
+            nn.Linear(hidden_dim, dim),
+        )
+    def forward(self, x):
+        return self.net(x)
+
+class Attention(nn.Module):  # multi-head self-attention with RMSNorm applied to queries and keys
+    def __init__(self, dim, heads = 8, dim_head = 64):
+        super().__init__()
+        inner_dim = dim_head *  heads
+        self.heads = heads
+        self.norm = nn.LayerNorm(dim)  # pre-norm, applied to the input at the top of forward
+
+        self.attend = nn.Softmax(dim = -1)
+
+        self.q_norm = RMSNorm(heads, dim_head)  # queries and keys get separate learned gains
+        self.k_norm = RMSNorm(heads, dim_head)
+
+        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)  # one fused projection producing q, k and v
+        self.to_out = nn.Linear(inner_dim, dim, bias = False)
+
+    def forward(self, x):
+        x = self.norm(x)
+
+        qkv = self.to_qkv(x).chunk(3, dim = -1)  # split the fused projection into q, k, v
+        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)  # separate heads: (b, n, h * d) -> (b, h, n, d)
+
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+
+        dots = torch.matmul(q, k.transpose(-1, -2))  # no explicit 1 / sqrt(d) factor here; q and k were already normalized above
+
+        attn = self.attend(dots)  # softmax over the last (key) axis
+
+        out = torch.matmul(attn, v)
+        out = rearrange(out, 'b h n d -> b n (h d)')  # merge heads back into the feature axis
+        return self.to_out(out)
+
+class Transformer(nn.Module):  # stack of `depth` (attention, feedforward) residual layers followed by one final LayerNorm
+    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim)  # applied once, after the last layer
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(nn.ModuleList([
+                Attention(dim, heads = heads, dim_head = dim_head),
+                FeedForward(dim, mlp_dim)
+            ]))
+    def forward(self, x):
+        for attn, ff in self.layers:
+            x = attn(x) + x  # residual connection; each sub-block normalizes its own input
+            x = ff(x) + x
+        return self.norm(x)
+
+class SimpleViT(nn.Module):  # image classifier: patch embedding + fixed 2d sin-cos positions + transformer + mean pool + linear head
+    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
+        super().__init__()
+        image_height, image_width = pair(image_size)  # a single int is treated as a square size
+        patch_height, patch_width = pair(patch_size)
+
+        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'
+
+        patch_dim = channels * patch_height * patch_width  # flattened size of one patch
+
+        self.to_patch_embedding = nn.Sequential(
+            Rearrange("b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1 = patch_height, p2 = patch_width),  # cut the image into a sequence of flattened patches
+            nn.LayerNorm(patch_dim),
+            nn.Linear(patch_dim, dim),
+            nn.LayerNorm(dim),
+        )
+
+        self.pos_embedding = posemb_sincos_2d(  # plain tensor (not a parameter or buffer); moved to the input's device in forward
+            h = image_height // patch_height,
+            w = image_width // patch_width,
+            dim = dim,
+        )
+
+        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)
+
+        self.pool = "mean"
+        self.to_latent = nn.Identity()
+        # the transformer already ends in a LayerNorm, so the head is only the projection to num_classes logits
+        self.linear_head = nn.Linear(dim, num_classes)
+
+    def forward(self, img):
+        device = img.device
+
+        x = self.to_patch_embedding(img)
+        x += self.pos_embedding.to(device, dtype=x.dtype)  # broadcasts over the batch axis
+
+        x = self.transformer(x)
+        x = x.mean(dim = 1)  # mean pool over the patch axis
+
+        x = self.to_latent(x)
+        return self.linear_head(x)  # class logits, last axis of size num_classes
--- a/vit_pytorch/vit.py
+++ b/vit_pytorch/vit.py
@@ -11,24 +11,18 @@ def pair(t):

 # classes

-class PreNorm(nn.Module):
-    def __init__(self, dim, fn):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-        self.fn = fn
-    def forward(self, x, **kwargs):
-        return self.fn(self.norm(x), **kwargs)
-
 class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
+            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
+
    def forward(self, x):
        return self.net(x)

@@ -41,6 +35,8 @@ class Attention(nn.Module):
        self.heads = heads
        self.scale = dim_head ** -0.5

+        self.norm = nn.LayerNorm(dim)
+
        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

@@ -52,6 +48,8 @@ class Attention(nn.Module):
        ) if project_out else nn.Identity()

    def forward(self, x):
+        x = self.norm(x)
+
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

@@ -67,17 +65,20 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
-                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
-                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
+                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
+                FeedForward(dim, mlp_dim, dropout = dropout)
            ]))
+
    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+
+        return self.norm(x)

 class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
@@ -107,10 +108,7 @@ class ViT(nn.Module):
        self.pool = pool
        self.to_latent = nn.Identity()

-        self.mlp_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.mlp_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        x = self.to_patch_embedding(img)
--- a/vit_pytorch/vit_with_patch_merger.py
+++ b/vit_pytorch/vit_with_patch_merger.py
@@ -32,18 +32,11 @@ class PatchMerger(nn.Module):

 # classes

-class PreNorm(nn.Module):
-    def __init__(self, dim, fn):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-        self.fn = fn
-    def forward(self, x, **kwargs):
-        return self.fn(self.norm(x), **kwargs)
-
 class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
+            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
@@ -62,6 +55,7 @@ class Attention(nn.Module):
        self.heads = heads
        self.scale = dim_head ** -0.5

+        self.norm = nn.LayerNorm(dim)
        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

@@ -73,6 +67,7 @@ class Attention(nn.Module):
        ) if project_out else nn.Identity()

    def forward(self, x):
+        x = self.norm(x)
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

@@ -88,6 +83,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0., patch_merge_layer = None, patch_merge_num_tokens = 8):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])

        self.patch_merge_layer_index = default(patch_merge_layer, depth // 2) - 1 # default to mid-way through transformer, as shown in paper
@@ -95,8 +91,8 @@ class Transformer(nn.Module):

        for _ in range(depth):
            self.layers.append(nn.ModuleList([
-                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
-                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
+                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
+                FeedForward(dim, mlp_dim, dropout = dropout)
            ]))
    def forward(self, x):
        for index, (attn, ff) in enumerate(self.layers):
@@ -106,7 +102,7 @@ class Transformer(nn.Module):
            if index == self.patch_merge_layer_index:
                x = self.patch_merger(x)

-        return x
+        return self.norm(x)

 class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, patch_merge_layer = None, patch_merge_num_tokens = 8, channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
@@ -133,7 +129,6 @@ class ViT(nn.Module):

        self.mlp_head = nn.Sequential(
            Reduce('b n d -> b d', 'mean'),
-            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

--- a/vit_pytorch/vivit.py
+++ b/vit_pytorch/vivit.py
@@ -70,6 +70,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -80,7 +81,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

 class ViT(nn.Module):
    def __init__(
@@ -137,10 +138,7 @@ class ViT(nn.Module):
        self.pool = pool
        self.to_latent = nn.Identity()

-        self.mlp_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.mlp_head = nn.Linear(dim, num_classes)

    def forward(self, video):
        x = self.to_patch_embedding(video)
Author	SHA1	Message	Date
Phil Wang	4264efd906	1.4.2	2023-08-14 07:59:35 -07:00
Phil Wang	b194359301	add a simple vit with qknorm, since authors seem to be promoting the technique on twitter	2023-08-14 07:58:45 -07:00
lucidrains	950c901b80	fix linear head in simple vit, thanks to @atkos	2023-08-10 14:36:21 -07:00
Phil Wang	3e5d1be6f0	address https://github.com/lucidrains/vit-pytorch/pull/274	2023-08-09 07:53:38 -07:00