1.4.2

add a simple vit with qknorm, since authors seem to be promoting the technique on twitter
fix linear head in simple vit, thanks to @atkos
2026-05-09 09:02:31 +00:00 · 2023-08-14 07:59:35 -07:00 · 2023-08-14 07:58:45 -07:00 · 2023-08-10 14:36:21 -07:00 · 2023-08-09 07:53:38 -07:00 · 2023-07-25 10:38:55 -07:00
11 changed files with 289 additions and 61 deletions
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 - [Usage](#usage)
 - [Parameters](#parameters)
 - [Simple ViT](#simple-vit)
- [NaViT](#na-vit)
+- [NaViT](#navit)
 - [Distillation](#distillation)
 - [Deep ViT](#deep-vit)
 - [CaiT](#cait)
@@ -142,7 +142,7 @@ preds = v(img) # (1, 1000)

 ## NaViT

-<img src="./images/na_vit.png" width="450px"></img>
+<img src="./images/navit.png" width="450px"></img>

 <a href="https://arxiv.org/abs/2307.06304">This paper</a> proposes to leverage the flexibility of attention and masking for variable lengthed sequences to train images of multiple resolution, packed into a single batch. They demonstrate much faster training and improved accuracies, with the only cost being extra complexity in the architecture and dataloading. They use factorized 2d positional encodings, token dropping, as well as query-key normalization.

@@ -161,7 +161,8 @@ v = NaViT(
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
-    emb_dropout = 0.1
+    emb_dropout = 0.1,
+    token_dropout_prob = 0.1  # token dropout of 10% (keep 90% of tokens)
 )

 # 5 images of different resolutions - List[List[Tensor]]
@@ -178,6 +179,24 @@ preds = v(images) # (5, 1000) - 5, because 5 images of different resolution abov

 ```

+Or, if you would rather have the framework automatically group the images into variable-length sequences that do not exceed a certain max length:
+
+```python
+images = [
+    torch.randn(3, 256, 256),
+    torch.randn(3, 128, 128),
+    torch.randn(3, 128, 256),
+    torch.randn(3, 256, 128),
+    torch.randn(3, 64, 256)
+]
+
+preds = v(
+    images,
+    group_images = True,
+    group_max_seq_len = 64
+) # (5, 1000)
+```
+
 ## Distillation

 <img src="./images/distill.png" width="300px"></img>
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '1.2.5',
+  version = '1.4.2',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  long_description_content_type = 'text/markdown',
--- a/vit_pytorch/na_vit.py
+++ b/vit_pytorch/na_vit.py
@@ -1,5 +1,5 @@
 from functools import partial
-from typing import List
+from typing import List, Union

 import torch
 import torch.nn.functional as F
@@ -17,12 +17,58 @@ def exists(val):
 def default(val, d):
    return val if exists(val) else d

+def always(val):  # build a callable that ignores its positional arguments
+    return lambda *args: val  # every call returns the captured `val`
+
 def pair(t):
    return t if isinstance(t, tuple) else (t, t)

 def divisible_by(numer, denom):
    return (numer % denom) == 0

+# auto grouping images
+
+def group_images_by_max_seq_len(
+    images: List[Tensor],
+    patch_size: int,
+    calc_token_dropout = None,
+    max_seq_len = 2048
+    # calc_token_dropout: None (no dropout), a constant prob, or a callable (height, width) -> prob
+) -> List[List[Tensor]]:
+    # greedily packs images, in order, into groups whose estimated post-dropout token count stays within max_seq_len
+    calc_token_dropout = default(calc_token_dropout, always(0.))
+
+    groups = []
+    group = []
+    seq_len = 0
+
+    if isinstance(calc_token_dropout, (float, int)):
+        calc_token_dropout = always(calc_token_dropout)
+
+    for image in images:
+        assert isinstance(image, Tensor)
+
+        image_dims = image.shape[-2:]
+        ph, pw = map(lambda t: t // patch_size, image_dims)
+
+        # token dropout in NaViT.forward keeps at least one token per image, so never count an image as zero tokens
+        image_seq_len = max(1, int((ph * pw) * (1 - calc_token_dropout(*image_dims))))
+
+        assert image_seq_len <= max_seq_len, f'image with dimensions {image_dims} exceeds maximum sequence length'
+
+        if (seq_len + image_seq_len) > max_seq_len:
+            groups.append(group)
+            group = []
+            seq_len = 0
+
+        group.append(image)
+        seq_len += image_seq_len
+
+    if len(group) > 0:
+        groups.append(group)
+
+    return groups
+
 # normalization
 # they use layernorm without bias, something that pytorch does not offer

@@ -138,10 +184,26 @@ class Transformer(nn.Module):
        return self.norm(x)

 class NaViT(nn.Module):
-    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
+    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0., token_dropout_prob = None):
        super().__init__()
        image_height, image_width = pair(image_size)

+        # what percent of tokens to dropout
+        # if int or float given, then assume constant dropout prob
+        # otherwise accept a callback that in turn calculates dropout prob from height and width
+
+        self.calc_token_dropout = None
+
+        if callable(token_dropout_prob):
+            self.calc_token_dropout = token_dropout_prob
+
+        elif isinstance(token_dropout_prob, (float, int)):
+            assert 0. < token_dropout_prob < 1.
+            token_dropout_prob = float(token_dropout_prob)
+            self.calc_token_dropout = lambda height, width: token_dropout_prob
+
+        # calculate patching related stuff
+
        assert divisible_by(image_height, patch_size) and divisible_by(image_width, patch_size), 'Image dimensions must be divisible by the patch size.'

        patch_height_dim, patch_width_dim = (image_height // patch_size), (image_width // patch_size)
@@ -183,13 +245,25 @@ class NaViT(nn.Module):

    def forward(
        self,
-        batched_images: List[List[Tensor]] # assume different resolution images already grouped correctly
+        batched_images: Union[List[Tensor], List[List[Tensor]]], # assume different resolution images already grouped correctly
+        group_images = False,
+        group_max_seq_len = 2048
    ):
-        p, c, device = self.patch_size, self.channels, self.device
+        p, c, device, has_token_dropout = self.patch_size, self.channels, self.device, exists(self.calc_token_dropout)

        arange = partial(torch.arange, device = device)
        pad_sequence = partial(orig_pad_sequence, batch_first = True)

+        # auto pack if specified
+
+        if group_images:
+            batched_images = group_images_by_max_seq_len(
+                batched_images,
+                patch_size = self.patch_size,
+                calc_token_dropout = self.calc_token_dropout,
+                max_seq_len = group_max_seq_len
+            )
+
        # process images into variable lengthed sequences with attention mask

        num_images = []
@@ -219,6 +293,16 @@ class NaViT(nn.Module):
                pos = rearrange(pos, 'h w c -> (h w) c')
                seq = rearrange(image, 'c (h p1) (w p2) -> (h w) (c p1 p2)', p1 = p, p2 = p)

+                seq_len = seq.shape[-2]
+
+                if has_token_dropout:
+                    token_dropout = self.calc_token_dropout(*image_dims)
+                    num_keep = max(1, int(seq_len * (1 - token_dropout)))
+                    keep_indices = torch.randn((seq_len,), device = device).topk(num_keep, dim = -1).indices
+
+                    seq = seq[keep_indices]
+                    pos = pos[keep_indices]
+
                image_ids = F.pad(image_ids, (0, seq.shape[-2]), value = image_id)
                sequences.append(seq)
                positions.append(pos)
--- a/vit_pytorch/simple_vit.py
+++ b/vit_pytorch/simple_vit.py
@@ -64,6 +64,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -74,7 +75,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

 class SimpleViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
@@ -101,12 +102,10 @@ class SimpleViT(nn.Module):

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

-        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
        self.pool = "mean"
+        self.to_latent = nn.Identity()
+
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        device = img.device
--- a/vit_pytorch/simple_vit_1d.py
+++ b/vit_pytorch/simple_vit_1d.py
@@ -62,6 +62,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -72,7 +73,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

 class SimpleViT(nn.Module):
    def __init__(self, *, seq_len, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
@@ -93,10 +94,7 @@ class SimpleViT(nn.Module):
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, series):
        *_, n, dtype = *series.shape, series.dtype
--- a/vit_pytorch/simple_vit_3d.py
+++ b/vit_pytorch/simple_vit_3d.py
@@ -77,6 +77,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -87,7 +88,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

 class SimpleViT(nn.Module):
    def __init__(self, *, image_size, image_patch_size, frames, frame_patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
@@ -111,10 +112,7 @@ class SimpleViT(nn.Module):
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, video):
        *_, h, w, dtype = *video.shape, video.dtype
--- a/vit_pytorch/simple_vit_with_patch_dropout.py
+++ b/vit_pytorch/simple_vit_with_patch_dropout.py
@@ -87,6 +87,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -97,7 +98,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

 class SimpleViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, patch_dropout = 0.5):
@@ -122,10 +123,7 @@ class SimpleViT(nn.Module):
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        *_, h, w, dtype = *img.shape, img.dtype
--- a/vit_pytorch/simple_vit_with_qk_norm.py
+++ b/vit_pytorch/simple_vit_with_qk_norm.py
@@ -0,0 +1,141 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from einops import rearrange
+from einops.layers.torch import Rearrange
+
+# helpers
+
+def pair(t):  # tuples are returned unchanged; any other value t becomes (t, t)
+    return t if isinstance(t, tuple) else (t, t)
+
+def posemb_sincos_2d(h, w, dim, temperature: int = 10000, dtype = torch.float32):  # fixed 2d sin/cos position table of shape (h * w, dim)
+    y, x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+    assert (dim % 4) == 0, "feature dimension must be multiple of 4 for sincos emb"
+    omega = torch.arange(dim // 4) / (dim // 4 - 1)  # NOTE(review): dim == 4 makes this denominator zero
+    omega = 1.0 / (temperature ** omega)  # dim // 4 frequencies, from 1 down to 1 / temperature
+    # outer product of the flattened grid coordinates with the frequencies
+    y = y.flatten()[:, None] * omega[None, :]
+    x = x.flatten()[:, None] * omega[None, :]
+    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim=1)  # four blocks of dim // 4 columns each
+    return pe.type(dtype)
+
+# they use a query-key normalization that is equivalent to rms norm (no mean-centering, learned gamma), from vit 22B paper
+
+# in latest tweet, seem to claim more stable training at higher learning rates
+# unsure if this has taken off within Brain, or it has some hidden drawback
+
+class RMSNorm(nn.Module):  # l2-normalizes the last axis, then rescales by sqrt(dim) and a learned per-head gamma
+    def __init__(self, heads, dim):
+        super().__init__()
+        self.scale = dim ** 0.5
+        self.gamma = nn.Parameter(torch.ones(heads, 1, dim) / self.scale)  # starts at 1 / scale, so scale * gamma == 1 at init
+        # gamma is (heads, 1, dim); presumably broadcast against a (batch, heads, seq, dim) input - see Attention.forward
+    def forward(self, x):
+        normed = F.normalize(x, dim = -1)  # unit l2 norm along the last axis
+        return normed * self.scale * self.gamma
+
+# classes
+
+class FeedForward(nn.Module):  # pre-norm two-layer MLP: LayerNorm -> Linear -> GELU -> Linear
+    def __init__(self, dim, hidden_dim):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.LayerNorm(dim),  # the normalization lives inside the block, so callers pass the raw residual stream
+            nn.Linear(dim, hidden_dim),
+            nn.GELU(),
+            nn.Linear(hidden_dim, dim),  # project back to dim so the caller can add the residual
+        )
+    def forward(self, x):
+        return self.net(x)
+
+class Attention(nn.Module):  # pre-norm multi-head self-attention with normalized queries and keys
+    def __init__(self, dim, heads = 8, dim_head = 64):
+        super().__init__()
+        inner_dim = dim_head *  heads
+        self.heads = heads
+        self.norm = nn.LayerNorm(dim)
+
+        self.attend = nn.Softmax(dim = -1)
+        # separate learned norms for queries and keys, each with one gamma per head
+        self.q_norm = RMSNorm(heads, dim_head)
+        self.k_norm = RMSNorm(heads, dim_head)
+
+        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)  # one fused projection for q, k and v
+        self.to_out = nn.Linear(inner_dim, dim, bias = False)
+
+    def forward(self, x):
+        x = self.norm(x)
+
+        qkv = self.to_qkv(x).chunk(3, dim = -1)
+        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)  # split heads out of the feature axis
+
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        # no explicit 1 / sqrt(dim_head) factor is applied here; the logit scale comes from q_norm / k_norm
+        dots = torch.matmul(q, k.transpose(-1, -2))
+
+        attn = self.attend(dots)
+
+        out = torch.matmul(attn, v)
+        out = rearrange(out, 'b h n d -> b n (h d)')  # merge heads back into the feature axis
+        return self.to_out(out)
+
+class Transformer(nn.Module):  # stack of `depth` (Attention, FeedForward) pairs with residuals and a final LayerNorm
+    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim)  # applied once, after the last layer
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(nn.ModuleList([
+                Attention(dim, heads = heads, dim_head = dim_head),
+                FeedForward(dim, mlp_dim)
+            ]))
+    def forward(self, x):
+        for attn, ff in self.layers:
+            x = attn(x) + x  # residual around attention
+            x = ff(x) + x  # residual around the feedforward
+        return self.norm(x)
+
+class SimpleViT(nn.Module):  # simple ViT classifier (mean pooling, fixed sincos positions) using the qk-normalized Attention
+    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
+        super().__init__()
+        image_height, image_width = pair(image_size)
+        patch_height, patch_width = pair(patch_size)
+
+        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'
+
+        patch_dim = channels * patch_height * patch_width
+        # flatten each patch into a vector, then normalize and project it to the model dimension
+        self.to_patch_embedding = nn.Sequential(
+            Rearrange("b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1 = patch_height, p2 = patch_width),
+            nn.LayerNorm(patch_dim),
+            nn.Linear(patch_dim, dim),
+            nn.LayerNorm(dim),
+        )
+        # fixed (non-learned) positional table with one row per patch
+        self.pos_embedding = posemb_sincos_2d(
+            h = image_height // patch_height,
+            w = image_width // patch_width,
+            dim = dim,
+        )
+
+        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)
+
+        self.pool = "mean"
+        self.to_latent = nn.Identity()
+        # the final LayerNorm is already applied inside Transformer, so the head only maps dim -> num_classes
+        self.linear_head = nn.Linear(dim, num_classes)
+
+    def forward(self, img):
+        device = img.device
+
+        x = self.to_patch_embedding(img)
+        x += self.pos_embedding.to(device, dtype=x.dtype)
+
+        x = self.transformer(x)
+        x = x.mean(dim = 1)  # mean-pool over the patch axis
+
+        x = self.to_latent(x)
+        return self.linear_head(x)  # class logits, last dimension num_classes
--- a/vit_pytorch/vit.py
+++ b/vit_pytorch/vit.py
@@ -11,24 +11,18 @@ def pair(t):

 # classes

-class PreNorm(nn.Module):
-    def __init__(self, dim, fn):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-        self.fn = fn
-    def forward(self, x, **kwargs):
-        return self.fn(self.norm(x), **kwargs)
-
 class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
+            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
+
    def forward(self, x):
        return self.net(x)

@@ -41,6 +35,8 @@ class Attention(nn.Module):
        self.heads = heads
        self.scale = dim_head ** -0.5

+        self.norm = nn.LayerNorm(dim)
+
        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

@@ -52,6 +48,8 @@ class Attention(nn.Module):
        ) if project_out else nn.Identity()

    def forward(self, x):
+        x = self.norm(x)
+
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

@@ -67,17 +65,20 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
-                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
-                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
+                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
+                FeedForward(dim, mlp_dim, dropout = dropout)
            ]))
+
    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+
+        return self.norm(x)

 class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
@@ -107,10 +108,7 @@ class ViT(nn.Module):
        self.pool = pool
        self.to_latent = nn.Identity()

-        self.mlp_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.mlp_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        x = self.to_patch_embedding(img)
--- a/vit_pytorch/vit_with_patch_merger.py
+++ b/vit_pytorch/vit_with_patch_merger.py
@@ -32,18 +32,11 @@ class PatchMerger(nn.Module):

 # classes

-class PreNorm(nn.Module):
-    def __init__(self, dim, fn):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-        self.fn = fn
-    def forward(self, x, **kwargs):
-        return self.fn(self.norm(x), **kwargs)
-
 class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
+            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
@@ -62,6 +55,7 @@ class Attention(nn.Module):
        self.heads = heads
        self.scale = dim_head ** -0.5

+        self.norm = nn.LayerNorm(dim)
        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

@@ -73,6 +67,7 @@ class Attention(nn.Module):
        ) if project_out else nn.Identity()

    def forward(self, x):
+        x = self.norm(x)
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

@@ -88,6 +83,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0., patch_merge_layer = None, patch_merge_num_tokens = 8):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])

        self.patch_merge_layer_index = default(patch_merge_layer, depth // 2) - 1 # default to mid-way through transformer, as shown in paper
@@ -95,8 +91,8 @@ class Transformer(nn.Module):

        for _ in range(depth):
            self.layers.append(nn.ModuleList([
-                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
-                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
+                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
+                FeedForward(dim, mlp_dim, dropout = dropout)
            ]))
    def forward(self, x):
        for index, (attn, ff) in enumerate(self.layers):
@@ -106,7 +102,7 @@ class Transformer(nn.Module):
            if index == self.patch_merge_layer_index:
                x = self.patch_merger(x)

-        return x
+        return self.norm(x)

 class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, patch_merge_layer = None, patch_merge_num_tokens = 8, channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
@@ -133,7 +129,6 @@ class ViT(nn.Module):

        self.mlp_head = nn.Sequential(
            Reduce('b n d -> b d', 'mean'),
-            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

--- a/vit_pytorch/vivit.py
+++ b/vit_pytorch/vivit.py
@@ -70,6 +70,7 @@ class Attention(nn.Module):
 class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -80,7 +81,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

 class ViT(nn.Module):
    def __init__(
@@ -137,10 +138,7 @@ class ViT(nn.Module):
        self.pool = pool
        self.to_latent = nn.Identity()

-        self.mlp_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.mlp_head = nn.Linear(dim, num_classes)

    def forward(self, video):
        x = self.to_patch_embedding(video)
Author	SHA1	Message	Date
Phil Wang	4264efd906	1.4.2	2023-08-14 07:59:35 -07:00
Phil Wang	b194359301	add a simple vit with qknorm, since authors seem to be promoting the technique on twitter	2023-08-14 07:58:45 -07:00
lucidrains	950c901b80	fix linear head in simple vit, thanks to @atkos	2023-08-10 14:36:21 -07:00
Phil Wang	3e5d1be6f0	address https://github.com/lucidrains/vit-pytorch/pull/274	2023-08-09 07:53:38 -07:00
Phil Wang	6e2393de95	wrap up NaViT	2023-07-25 10:38:55 -07:00
Phil Wang	32974c33df	one can pass a callback to token_dropout_prob for NaViT that takes in height and width and calculate appropriate dropout rate	2023-07-24 14:52:40 -07:00
Phil Wang	17675e0de4	add constant token dropout for NaViT	2023-07-24 14:14:36 -07:00
Phil Wang	598cffab53	release NaViT	2023-07-24 13:55:54 -07:00