mirror of https://github.com/lucidrains/vit-pytorch.git
synced 2025-12-30 16:12:29 +00:00
Compare commits
3 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 3e5d1be6f0 | |
| | 6e2393de95 | |
| | 32974c33df | |
README.md | 18
@@ -179,6 +179,24 @@ preds = v(images) # (5, 1000) - 5, because 5 images of different resolution above
```
+
+Or if you would rather that the framework auto group the images into variable lengthed sequences that do not exceed a certain max length
+
+```python
+images = [
+    torch.randn(3, 256, 256),
+    torch.randn(3, 128, 128),
+    torch.randn(3, 128, 256),
+    torch.randn(3, 256, 128),
+    torch.randn(3, 64, 256)
+]
+
+preds = v(
+    images,
+    group_images = True,
+    group_max_seq_len = 64
+) # (5, 1000)
+```

## Distillation

<img src="./images/distill.png" width="300px"></img>
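For context, the `v` called in the snippet above is the NaViT model constructed earlier in the README. A minimal sketch of the full call, assuming the import path `vit_pytorch.na_vit` and illustrative hyperparameters:

```python
import torch
from vit_pytorch.na_vit import NaViT

# illustrative hyperparameters; patch_size must divide every image dimension used below
v = NaViT(
    image_size = 256,
    patch_size = 32,
    num_classes = 1000,
    dim = 1024,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    token_dropout_prob = 0.1  # constant prob; per this diff, a callable of (height, width) is also accepted
)

images = [torch.randn(3, 256, 256), torch.randn(3, 128, 128), torch.randn(3, 64, 256)]

preds = v(images, group_images = True, group_max_seq_len = 64)  # (3, 1000)
```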
setup.py | 2
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '1.2.7',
+  version = '1.4.0',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  long_description_content_type = 'text/markdown',
@@ -1,5 +1,5 @@
from functools import partial
-from typing import List
+from typing import List, Union

import torch
import torch.nn.functional as F
@@ -17,12 +17,58 @@ def exists(val):
def default(val, d):
    return val if exists(val) else d

def always(val):
    return lambda *args: val

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

def divisible_by(numer, denom):
    return (numer % denom) == 0

# auto grouping images

def group_images_by_max_seq_len(
    images: List[Tensor],
    patch_size: int,
    calc_token_dropout = None,
    max_seq_len = 2048

) -> List[List[Tensor]]:

    calc_token_dropout = default(calc_token_dropout, always(0.))

    groups = []
    group = []
    seq_len = 0

    if isinstance(calc_token_dropout, (float, int)):
        calc_token_dropout = always(calc_token_dropout)

    for image in images:
        assert isinstance(image, Tensor)

        image_dims = image.shape[-2:]
        ph, pw = map(lambda t: t // patch_size, image_dims)

        image_seq_len = (ph * pw)
        image_seq_len = int(image_seq_len * (1 - calc_token_dropout(*image_dims)))

        assert image_seq_len <= max_seq_len, f'image with dimensions {image_dims} exceeds maximum sequence length'

        if (seq_len + image_seq_len) > max_seq_len:
            groups.append(group)
            group = []
            seq_len = 0

        group.append(image)
        seq_len += image_seq_len

    if len(group) > 0:
        groups.append(group)

    return groups

# normalization
# they use layernorm without bias, something that pytorch does not offer
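To illustrate the greedy packing performed by the function above, here is a small sketch that calls it directly; the import path `vit_pytorch.na_vit` is assumed:

```python
import torch
from vit_pytorch.na_vit import group_images_by_max_seq_len

images = [
    torch.randn(3, 256, 256),  # 8 x 8 = 64 patches at patch_size = 32
    torch.randn(3, 128, 128),  # 16 patches
    torch.randn(3, 64, 256)    # 16 patches
]

groups = group_images_by_max_seq_len(
    images,
    patch_size = 32,
    calc_token_dropout = 0.25,  # constant dropout, so each image keeps 75% of its patch tokens
    max_seq_len = 64
)

# 48 + 12 = 60 tokens fit in the first group; the third image would push it to 72 > 64,
# so it starts a second group
print([len(group) for group in groups])  # [2, 1]
```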
@@ -138,14 +184,25 @@ class Transformer(nn.Module):
        return self.norm(x)

class NaViT(nn.Module):
-    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0., token_dropout_prob = 0.):
+    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0., token_dropout_prob = None):
        super().__init__()
        image_height, image_width = pair(image_size)

        # what percent of tokens to dropout
        # in paper, they found this should vary depending on resolution (todo - figure out how to do this, maybe with callback?)
        # if int or float given, then assume constant dropout prob
        # otherwise accept a callback that in turn calculates dropout prob from height and width

        self.token_dropout_prob = token_dropout_prob
        self.calc_token_dropout = None

        if callable(token_dropout_prob):
            self.calc_token_dropout = token_dropout_prob

        elif isinstance(token_dropout_prob, (float, int)):
            assert 0. < token_dropout_prob < 1.
            token_dropout_prob = float(token_dropout_prob)
            self.calc_token_dropout = lambda height, width: token_dropout_prob

        # calculate patching related stuff

        assert divisible_by(image_height, patch_size) and divisible_by(image_width, patch_size), 'Image dimensions must be divisible by the patch size.'
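The comment above leaves the resolution-dependent schedule as a todo; one possible shape for such a callback, purely as an illustrative assumption, increases the dropout with the number of patches:

```python
# hypothetical callback: larger images drop a larger fraction of their patch tokens,
# kept strictly inside (0, 1) so it is usable wherever a float prob would be
def token_dropout_by_resolution(height, width):
    num_patches = (height // 32) * (width // 32)  # assumes patch_size = 32
    return min(0.5, 0.1 + num_patches / 1024)

print(token_dropout_by_resolution(256, 256))  # 0.1625
print(token_dropout_by_resolution(512, 512))  # 0.35

# passed as NaViT(..., token_dropout_prob = token_dropout_by_resolution),
# the constructor above stores it directly as self.calc_token_dropout
```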
@@ -188,13 +245,25 @@ class NaViT(nn.Module):

    def forward(
        self,
-        batched_images: List[List[Tensor]] # assume different resolution images already grouped correctly
+        batched_images: Union[List[Tensor], List[List[Tensor]]], # assume different resolution images already grouped correctly
+        group_images = False,
+        group_max_seq_len = 2048
    ):
-        p, c, device, has_token_dropout = self.patch_size, self.channels, self.device, self.token_dropout_prob > 0.
+        p, c, device, has_token_dropout = self.patch_size, self.channels, self.device, exists(self.calc_token_dropout)

        arange = partial(torch.arange, device = device)
        pad_sequence = partial(orig_pad_sequence, batch_first = True)

+        # auto pack if specified
+
+        if group_images:
+            batched_images = group_images_by_max_seq_len(
+                batched_images,
+                patch_size = self.patch_size,
+                calc_token_dropout = self.calc_token_dropout,
+                max_seq_len = group_max_seq_len
+            )
+
        # process images into variable lengthed sequences with attention mask

        num_images = []
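Since the new signature also accepts an already-nested `List[List[Tensor]]`, the grouping can be done by hand instead of passing `group_images = True`. A brief sketch, assuming `v` is the NaViT instance from the README example and `torch` is imported:

```python
batched_images = [
    [torch.randn(3, 256, 256), torch.randn(3, 128, 128)],  # first packed sequence
    [torch.randn(3, 64, 256)]                               # second packed sequence
]

preds = v(batched_images)  # (3, 1000) - one prediction per image, regardless of how they were packed
```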
@@ -227,8 +296,10 @@ class NaViT(nn.Module):
                seq_len = seq.shape[-2]

                if has_token_dropout:
-                    num_keep = max(1, int(seq_len * (1 - self.token_dropout_prob)))
+                    token_dropout = self.calc_token_dropout(*image_dims)
+                    num_keep = max(1, int(seq_len * (1 - token_dropout)))
                    keep_indices = torch.randn((seq_len,), device = device).topk(num_keep, dim = -1).indices

                    seq = seq[keep_indices]
                    pos = pos[keep_indices]
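The dropout above keeps a random subset of patch tokens by taking `topk` over Gaussian noise rather than shuffling indices; a tiny standalone sketch of that trick:

```python
import torch

seq_len, num_keep = 8, 5
seq = torch.randn(seq_len, 16)  # 8 patch tokens of dimension 16

# topk over random noise selects num_keep positions uniformly at random (order not preserved)
keep_indices = torch.randn((seq_len,)).topk(num_keep, dim = -1).indices
seq = seq[keep_indices]

print(seq.shape)  # torch.Size([5, 16])
```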
@@ -64,6 +64,7 @@ class Attention(nn.Module):
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -74,7 +75,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

class SimpleViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
@@ -101,12 +102,10 @@ class SimpleViT(nn.Module):
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

-        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.pool = "mean"
+        self.to_latent = nn.Identity()
+
+        self.linear_head = nn.LayerNorm(dim)

    def forward(self, img):
        device = img.device
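The two hunks above (and the matching ones in the files that follow) fold the pre-normalization into each sub-block and move the final LayerNorm into the Transformer itself, so the classification head no longer needs its own norm. A minimal standalone sketch of the resulting pattern, with illustrative names rather than the library's:

```python
import torch
from torch import nn

class TinyFeedForward(nn.Module):
    def __init__(self, dim, hidden_dim):
        super().__init__()
        # the norm now lives inside the sub-block, replacing the old PreNorm wrapper
        self.net = nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, hidden_dim), nn.GELU(), nn.Linear(hidden_dim, dim))

    def forward(self, x):
        return self.net(x)

class TinyTransformer(nn.Module):
    def __init__(self, dim, depth, hidden_dim):
        super().__init__()
        self.layers = nn.ModuleList([TinyFeedForward(dim, hidden_dim) for _ in range(depth)])
        self.norm = nn.LayerNorm(dim)  # final norm applied once, at the end

    def forward(self, x):
        for block in self.layers:
            x = block(x) + x
        return self.norm(x)

x = torch.randn(2, 65, 128)
print(TinyTransformer(128, depth = 4, hidden_dim = 256)(x).shape)  # torch.Size([2, 65, 128])
```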
@@ -62,6 +62,7 @@ class Attention(nn.Module):
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -72,7 +73,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

class SimpleViT(nn.Module):
    def __init__(self, *, seq_len, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
@@ -93,10 +94,7 @@ class SimpleViT(nn.Module):
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, series):
        *_, n, dtype = *series.shape, series.dtype
@@ -77,6 +77,7 @@ class Attention(nn.Module):
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -87,7 +88,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

class SimpleViT(nn.Module):
    def __init__(self, *, image_size, image_patch_size, frames, frame_patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
@@ -111,10 +112,7 @@ class SimpleViT(nn.Module):
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, video):
        *_, h, w, dtype = *video.shape, video.dtype
@@ -87,6 +87,7 @@ class Attention(nn.Module):
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -97,7 +98,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

class SimpleViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, patch_dropout = 0.5):
@@ -122,10 +123,7 @@ class SimpleViT(nn.Module):
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        *_, h, w, dtype = *img.shape, img.dtype
@@ -11,24 +11,18 @@ def pair(t):

# classes

-class PreNorm(nn.Module):
-    def __init__(self, dim, fn):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-        self.fn = fn
-    def forward(self, x, **kwargs):
-        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
+            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

@@ -41,6 +35,8 @@ class Attention(nn.Module):
        self.heads = heads
        self.scale = dim_head ** -0.5

+        self.norm = nn.LayerNorm(dim)
+
        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

@@ -52,6 +48,8 @@ class Attention(nn.Module):
        ) if project_out else nn.Identity()

    def forward(self, x):
+        x = self.norm(x)
+
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

@@ -67,17 +65,20 @@ class Attention(nn.Module):
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
-                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
-                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
+                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
+                FeedForward(dim, mlp_dim, dropout = dropout)
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+
+        return self.norm(x)

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
@@ -107,10 +108,7 @@ class ViT(nn.Module):
        self.pool = pool
        self.to_latent = nn.Identity()

-        self.mlp_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.mlp_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        x = self.to_patch_embedding(img)
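The Attention context lines above split heads with an einops rearrange; a tiny sketch of that pattern with illustrative shapes, in case the `'b n (h d) -> b h n d'` string is unfamiliar:

```python
import torch
from einops import rearrange

b, n, heads, dim_head = 2, 65, 16, 64

qkv_out = torch.randn(b, n, heads * dim_head * 3)  # stand-in for the output of to_qkv
q, k, v = map(
    lambda t: rearrange(t, 'b n (h d) -> b h n d', h = heads),
    qkv_out.chunk(3, dim = -1)
)

print(q.shape)  # torch.Size([2, 16, 65, 64])
```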
@@ -32,18 +32,11 @@ class PatchMerger(nn.Module):

# classes

-class PreNorm(nn.Module):
-    def __init__(self, dim, fn):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-        self.fn = fn
-    def forward(self, x, **kwargs):
-        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
+            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),

@@ -62,6 +55,7 @@ class Attention(nn.Module):
        self.heads = heads
        self.scale = dim_head ** -0.5

+        self.norm = nn.LayerNorm(dim)
        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

@@ -73,6 +67,7 @@ class Attention(nn.Module):
        ) if project_out else nn.Identity()

    def forward(self, x):
+        x = self.norm(x)
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

@@ -88,6 +83,7 @@ class Attention(nn.Module):
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0., patch_merge_layer = None, patch_merge_num_tokens = 8):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])

        self.patch_merge_layer_index = default(patch_merge_layer, depth // 2) - 1 # default to mid-way through transformer, as shown in paper
@@ -95,8 +91,8 @@ class Transformer(nn.Module):

        for _ in range(depth):
            self.layers.append(nn.ModuleList([
-                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
-                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
+                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
+                FeedForward(dim, mlp_dim, dropout = dropout)
            ]))

    def forward(self, x):
        for index, (attn, ff) in enumerate(self.layers):
@@ -106,7 +102,7 @@ class Transformer(nn.Module):
            if index == self.patch_merge_layer_index:
                x = self.patch_merger(x)

-        return x
+        return self.norm(x)

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, patch_merge_layer = None, patch_merge_num_tokens = 8, channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
@@ -133,7 +129,6 @@ class ViT(nn.Module):

        self.mlp_head = nn.Sequential(
            Reduce('b n d -> b d', 'mean'),
-            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )
@@ -70,6 +70,7 @@ class Attention(nn.Module):
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
+        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
@@ -80,7 +81,7 @@ class Transformer(nn.Module):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
-        return x
+        return self.norm(x)

class ViT(nn.Module):
    def __init__(
@@ -137,10 +138,7 @@ class ViT(nn.Module):
        self.pool = pool
        self.to_latent = nn.Identity()

-        self.mlp_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
+        self.mlp_head = nn.Linear(dim, num_classes)

    def forward(self, video):
        x = self.to_patch_embedding(video)