fix when video time seq len less than max time seq len for video acceptor

some models only return embeddings with some kwarg on forward
need to be able to invoke with eval no grad
2025-12-30 16:12:29 +00:00 · 2025-07-27 09:00:56 -07:00 · 2025-07-27 08:46:43 -07:00 · 2025-07-27 08:25:58 -07:00 · 2025-07-27 08:14:48 -07:00 · 2025-07-27 08:05:48 -07:00
5 changed files with 343 additions and 12 deletions
--- a/README.md
+++ b/README.md
@@ -2172,4 +2172,13 @@ Coming from computer vision and new to transformers? Here are some resources tha
 }
 ```

+```bibtex
+@inproceedings{Fuller2025SimplerFV,
+    title   = {Simpler Fast Vision Transformers with a Jumbo CLS Token},
+    author  = {Anthony Fuller and Yousef Yassin and Daniel G. Kyrollos and Evan Shelhamer and James R. Green},
+    year    = {2025},
+    url     = {https://api.semanticscholar.org/CorpusID:276557720}
+}
+```
+
 *I visualise a time when we will be to robots what dogs are to humans, and I’m rooting for the machines.* — Claude Shannon
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@ with open('README.md') as f:
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '1.9.1',
+  version = '1.11.4',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  long_description = long_description,
--- a/vit_pytorch/accept_video_wrapper.py
+++ b/vit_pytorch/accept_video_wrapper.py
@@ -0,0 +1,129 @@
+from contextlib import nullcontext
+
+import torch
+from torch import is_tensor, randn
+from torch.nn import Module, Parameter
+from torch.utils._pytree import tree_flatten, tree_unflatten
+
+from einops import rearrange, repeat
+
+# helper functions
+
+def exists(v):
+    """Return True when `v` is anything other than None."""
+    is_missing = v is None
+    return not is_missing
+
+def default(v, d):
+    """Return `v` unless it is None, in which case fall back to `d`."""
+    if exists(v):
+        return v
+
+    return d
+
+# classes
+
+class AcceptVideoWrapper(Module):
+    """Run an image network over every frame of a video.
+
+    The time axis of the video is folded into the batch axis, the wrapped
+    network is called once on all frames, and the time axis is restored on
+    the tensor outputs. Optionally a learned per-frame positional embedding
+    is added to one of those outputs.
+    """
+
+    def __init__(
+        self,
+        image_net: Module,
+        forward_function = 'forward',
+        add_time_pos_emb = False,
+        dim_emb = None,
+        time_seq_len = None,
+        output_pos_add_pos_emb = 0 # defaults to first output position to add embedding
+    ):
+        """
+        Args:
+            image_net: network that is invoked on the frames.
+            forward_function: name of the method looked up on `image_net` and called with the frames.
+            add_time_pos_emb: whether to add a learned time positional embedding to one output.
+            dim_emb: last dimension of the positional embedding, required if `add_time_pos_emb`.
+            time_seq_len: maximum number of frames, required if `add_time_pos_emb`.
+            output_pos_add_pos_emb: index, into the flattened outputs, of the output receiving the embedding.
+        """
+        super().__init__()
+        self.image_net = image_net
+        self.forward_function = forward_function # for openclip, used in TRI-LBM
+
+        self.add_time_pos_emb = add_time_pos_emb
+        self.output_pos_add_pos_emb = output_pos_add_pos_emb
+
+        if add_time_pos_emb:
+            assert exists(dim_emb) and exists(time_seq_len), '`dim_emb` and `time_seq_len` must be set if adding positional embeddings to the output'
+            self.time_seq_len = time_seq_len
+
+            # one small random embedding per time step
+
+            self.pos_emb = Parameter(randn(time_seq_len, dim_emb) * 1e-2)
+
+    def forward(
+        self,
+        video, # (b c t h w)
+        eval_with_no_grad = False,
+        forward_kwargs = None
+    ):
+        """
+        Args:
+            video: tensor laid out as (batch, channels, time, height, width).
+            eval_with_no_grad: if True, `image_net` is put into eval mode and called under `torch.no_grad`.
+                Note that eval mode is not reverted afterwards.
+            forward_kwargs: extra keyword arguments passed to the forward function, defaults to none.
+
+        Returns:
+            The output structure of the forward function, with tensor leaves reshaped to (batch, time, ...).
+        """
+
+        # `None` instead of a shared mutable `dict()` default - a fresh dict is made on each call
+
+        forward_kwargs = default(forward_kwargs, dict())
+
+        add_time_pos_emb = self.add_time_pos_emb
+        time = video.shape[2]
+
+        # maybe validate time positional embedding
+
+        if add_time_pos_emb:
+            assert time <= self.time_seq_len, f'received video with {time} frames but `time_seq_len` ({self.time_seq_len}) is too low'
+
+        # move time next to batch, then fold it into batch so each frame is an independent image
+
+        video = rearrange(video, 'b c t h w -> b t c h w')
+
+        video = rearrange(video, 'b t ... -> (b t) ...')
+
+        # forward through image net for outputs
+
+        func = getattr(self.image_net, self.forward_function)
+
+        if eval_with_no_grad:
+            self.image_net.eval()
+
+        context = torch.no_grad if eval_with_no_grad else nullcontext
+
+        with context():
+            outputs = func(video, **forward_kwargs)
+
+        # handle multiple outputs, say logits and embeddings returned from extractor - also handle some reduce aux loss being returned
+
+        outputs, tree_spec = tree_flatten(outputs)
+
+        # only tensors with more than one element get their time dimension restored - anything else is passed through untouched
+
+        outputs = [rearrange(t, '(b t) ... -> b t ...', t = time) if is_tensor(t) and t.numel() > 1 else t for t in outputs]
+
+        # maybe add time positional embedding
+
+        if add_time_pos_emb:
+
+            embed = outputs[self.output_pos_add_pos_emb]
+
+            pos_emb = rearrange(self.pos_emb, 't d -> 1 t d')
+
+            # handle the network outputting embeddings with spatial dimensions intact - assume embedded dimension is last
+
+            dims_to_unsqueeze = embed.ndim - pos_emb.ndim
+
+            pos_emb = pos_emb.reshape(*pos_emb.shape[:2], *((1,) * dims_to_unsqueeze) , pos_emb.shape[-1])
+
+            # the video may be shorter than `time_seq_len`, so only take as many positions as there are frames
+
+            embed = embed + pos_emb[:, :embed.shape[1]]
+
+            outputs[self.output_pos_add_pos_emb] = embed
+
+        return tree_unflatten(outputs, tree_spec)
+
+# main
+
+if __name__ == '__main__':
+    from vit_pytorch import ViT
+    from vit_pytorch.extractor import Extractor
+
+    # quick smoke test - a vision transformer applied frame by frame to a short clip
+
+    vit_kwargs = dict(
+        image_size = 256,
+        patch_size = 32,
+        num_classes = 1000,
+        dim = 1024,
+        depth = 6,
+        heads = 16,
+        mlp_dim = 2048,
+        dropout = 0.1,
+        emb_dropout = 0.1
+    )
+
+    v = ViT(**vit_kwargs)
+
+    # always (batch, channels, time, height, width) - time is always dimension 2
+
+    videos = torch.randn(1, 3, 7, 256, 256)
+
+    # step up the difficulty and return embeddings for robotics
+
+    v = Extractor(v)
+
+    # the embeddings are the second output, so that is where the time positional embedding is added
+
+    video_acceptor = AcceptVideoWrapper(
+        v,
+        add_time_pos_emb = True,
+        output_pos_add_pos_emb = 1,
+        time_seq_len = 12,
+        dim_emb = 1024
+    )
+
+    logits, embeddings = video_acceptor(videos, eval_with_no_grad = True)
+
+    # 7 frames in, 7 time steps out on both outputs
+
+    assert logits.shape == (1, 7, 1000)
+    assert embeddings.shape == (1, 7, 65, 1024)
--- a/vit_pytorch/jumbo_vit.py
+++ b/vit_pytorch/jumbo_vit.py
@@ -0,0 +1,204 @@
+# Simpler Fast Vision Transformers with a Jumbo CLS Token
+# https://arxiv.org/abs/2502.15021
+
+import torch
+from torch import nn
+from torch.nn import Module, ModuleList
+
+from einops import rearrange, repeat, reduce, pack, unpack
+from einops.layers.torch import Rearrange
+
+# helpers
+
+def pair(t):
+    """Return `t` unchanged if it is a tuple, otherwise duplicate it into a 2-tuple."""
+    if isinstance(t, tuple):
+        return t
+
+    return (t, t)
+
+def divisible_by(num, den):
+    """Whether `num` is an exact multiple of `den`."""
+    remainder = num % den
+    return remainder == 0
+
+def posemb_sincos_2d(h, w, dim, temperature: int = 10000, dtype = torch.float32):
+    """Build a fixed 2d sine / cosine positional embedding for an `h` x `w` grid.
+
+    Returns a tensor of shape (h * w, dim) cast to `dtype`, with rows in
+    row-major grid order. Each row is the concatenation of sin(x), cos(x),
+    sin(y), cos(y), every part being `dim // 4` wide.
+
+    `dim` must be a multiple of 4.
+    """
+
+    # validate before doing any tensor work
+
+    assert divisible_by(dim, 4), "feature dimension must be multiple of 4 for sincos emb"
+
+    y, x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+
+    num_freqs = dim // 4
+
+    # frequencies are spaced from 1 down to 1 / temperature
+    # with a single frequency (dim == 4) the denominator `num_freqs - 1` would be zero and 0 / 0 turns the whole embedding into nan, so clamp it to 1
+
+    omega = torch.arange(num_freqs) / max(num_freqs - 1, 1)
+    omega = temperature ** -omega
+
+    y = y.flatten()[:, None] * omega[None, :]
+    x = x.flatten()[:, None] * omega[None, :]
+    pos_emb = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim=1)
+
+    return pos_emb.type(dtype)
+
+# classes
+
+def FeedForward(dim, mult = 4.):
+    """Pre-norm two layer MLP with a GELU in between, whose hidden width is `int(dim * mult)`."""
+    dim_inner = int(dim * mult)
+
+    layers = (
+        nn.LayerNorm(dim),
+        nn.Linear(dim, dim_inner),
+        nn.GELU(),
+        nn.Linear(dim_inner, dim),
+    )
+
+    return nn.Sequential(*layers)
+
+class Attention(Module):
+    """Multi-head self attention with a layernorm on the input and bias-free projections."""
+
+    def __init__(self, dim, heads = 8, dim_head = 64):
+        super().__init__()
+        self.heads = heads
+        self.scale = dim_head ** -0.5
+
+        self.norm = nn.LayerNorm(dim)
+        self.attend = nn.Softmax(dim = -1)
+
+        # one fused projection for queries, keys and values, then a projection back to the model dimension
+
+        dim_inner = heads * dim_head
+
+        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias = False)
+        self.to_out = nn.Linear(dim_inner, dim, bias = False)
+
+    def forward(self, x):
+        normed = self.norm(x)
+
+        # project, split into three and separate out the heads
+
+        queries, keys, values = (
+            rearrange(t, 'b n (h d) -> b h n d', h = self.heads)
+            for t in self.to_qkv(normed).chunk(3, dim = -1)
+        )
+
+        # scaled dot product similarity, softmaxed over the keys
+
+        sim = (queries @ keys.transpose(-1, -2)) * self.scale
+        attn = self.attend(sim)
+
+        # aggregate values, then merge the heads back
+
+        out = rearrange(attn @ values, 'b h n d -> b n (h d)')
+        return self.to_out(out)
+
+class JumboViT(Module):
+    """Vision transformer with a wide "jumbo" CLS token.
+
+    Each jumbo CLS token is `jumbo_cls_k` times the model dimension. It is
+    split into `jumbo_cls_k` regular width tokens for attention alongside
+    the patch tokens, and joined back to full width for its own feedforward.
+    The logits are computed from the mean of the CLS tokens.
+    """
+
+    def __init__(
+        self,
+        *,
+        image_size,
+        patch_size,
+        num_classes,
+        dim,
+        depth,
+        heads,
+        mlp_dim,
+        num_jumbo_cls = 1,  # differing from paper, allow for multiple jumbo cls, so one could break it up into 2 jumbo cls tokens with 3x the dim, as an example
+        jumbo_cls_k = 6,    # they use a CLS token with this factor times the dimension - 6 was the value they settled on
+        jumbo_ff_mult = 2,  # expansion factor of the jumbo cls token feedforward
+        channels = 3,
+        dim_head = 64
+    ):
+        """
+        Args:
+            image_size: int or (height, width) tuple, must be divisible by the patch size.
+            patch_size: int or (height, width) tuple.
+            num_classes: number of output logits.
+            dim: model dimension of the patch tokens.
+            depth: number of attention / feedforward layers.
+            heads: number of attention heads.
+            mlp_dim: hidden width of the patch token feedforwards.
+            channels: number of image channels.
+            dim_head: dimension of each attention head.
+        """
+        super().__init__()
+
+        # function scope import, only needed for the exact multiplier below
+
+        from fractions import Fraction
+
+        image_height, image_width = pair(image_size)
+        patch_height, patch_width = pair(patch_size)
+
+        assert divisible_by(image_height, patch_height) and divisible_by(image_width, patch_width), 'Image dimensions must be divisible by the patch size.'
+
+        patch_dim = channels * patch_height * patch_width
+
+        self.to_patch_embedding = nn.Sequential(
+            Rearrange("b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1 = patch_height, p2 = patch_width),
+            nn.LayerNorm(patch_dim),
+            nn.Linear(patch_dim, dim),
+            nn.LayerNorm(dim),
+        )
+
+        # fixed sincos embedding kept as a plain tensor - moved to the input device and dtype on forward
+
+        self.pos_embedding = posemb_sincos_2d(
+            h = image_height // patch_height,
+            w = image_width // patch_width,
+            dim = dim,
+        )
+
+        jumbo_cls_dim = dim * jumbo_cls_k
+
+        self.jumbo_cls_token = nn.Parameter(torch.zeros(num_jumbo_cls, jumbo_cls_dim))
+
+        # splits every jumbo token into `jumbo_cls_k` tokens of the model dimension
+
+        jumbo_cls_to_tokens = Rearrange('b n (k d) -> b (n k) d', k = jumbo_cls_k)
+        self.jumbo_cls_to_tokens = jumbo_cls_to_tokens
+
+        self.norm = nn.LayerNorm(dim)
+        self.layers = ModuleList([])
+
+        # attention and feedforwards
+
+        # `FeedForward` takes an expansion multiplier as its second argument and computes the hidden width as `int(dim * mult)`
+        # so the multiplier is passed here, not an already multiplied width, which would inflate the hidden layer by another factor of `jumbo_cls_dim`
+
+        self.jumbo_ff = nn.Sequential(
+            Rearrange('b (n k) d -> b n (k d)', k = jumbo_cls_k),
+            FeedForward(jumbo_cls_dim, jumbo_ff_mult), # they use separate parameters for the jumbo feedforward, weight tied across layers for parameter efficiency
+            jumbo_cls_to_tokens
+        )
+
+        # same reasoning for the patch feedforwards - `mlp_dim` is the hidden width, so convert it to a multiplier
+        # an exact fraction makes `int(dim * mult)` land on `mlp_dim` without floating point truncation
+
+        ff_mult = Fraction(mlp_dim, dim)
+
+        for _ in range(depth):
+            self.layers.append(ModuleList([
+                Attention(dim, heads = heads, dim_head = dim_head),
+                FeedForward(dim, ff_mult),
+            ]))
+
+        self.to_latent = nn.Identity()
+
+        self.linear_head = nn.Linear(dim, num_classes)
+
+    def forward(self, img):
+        """Return the classification logits for a batch of images."""
+
+        batch, device = img.shape[0], img.device
+
+        x = self.to_patch_embedding(img)
+
+        # pos embedding
+
+        pos_emb = self.pos_embedding.to(device, dtype = x.dtype)
+
+        x = x + pos_emb
+
+        # add cls tokens
+
+        cls_tokens = repeat(self.jumbo_cls_token, 'nj d -> b nj d', b = batch)
+
+        # bound before the loop so that pooling below also works when there are no layers
+
+        jumbo_cls_tokens = self.jumbo_cls_to_tokens(cls_tokens)
+
+        x, cls_packed_shape = pack([jumbo_cls_tokens, x], 'b * d')
+
+        # attention and feedforwards
+
+        num_layers = len(self.layers)
+
+        for layer, (attn, ff) in enumerate(self.layers, start = 1):
+            is_last = layer == num_layers
+
+            x = attn(x) + x
+
+            # jumbo feedforward
+
+            jumbo_cls_tokens, x = unpack(x, cls_packed_shape, 'b * d')
+
+            x = ff(x) + x
+            jumbo_cls_tokens = self.jumbo_ff(jumbo_cls_tokens) + jumbo_cls_tokens
+
+            # only the cls tokens are read after the last layer, so no need to pack again
+
+            if not is_last:
+                x, _ = pack([jumbo_cls_tokens, x], 'b * d')
+
+        pooled = reduce(jumbo_cls_tokens, 'b n d -> b d', 'mean')
+
+        # normalization and project to logits
+
+        embed = self.norm(pooled)
+
+        embed = self.to_latent(embed)
+        logits = self.linear_head(embed)
+        return logits
+
+# copy pasteable file
+
+if __name__ == '__main__':
+
+    # quick smoke test with a tiny configuration
+
+    config = dict(
+        num_classes = 1000,
+        image_size = 64,
+        patch_size = 8,
+        dim = 16,
+        depth = 2,
+        heads = 2,
+        mlp_dim = 32,
+        jumbo_cls_k = 3,
+        jumbo_ff_mult = 2,
+    )
+
+    v = JumboViT(**config)
+
+    images = torch.randn(1, 3, 64, 64)
+
+    # one row of class logits per image
+
+    logits = v(images)
+    assert logits.shape == (1, 1000)
--- a/vit_pytorch/na_vit_nested_tensor_3d.py
+++ b/vit_pytorch/na_vit_nested_tensor_3d.py
@@ -83,17 +83,6 @@ class Attention(Module):

        # split heads

-        def split_heads(t):
-            return t.unflatten(-1, (self.heads, self.dim_head)).transpose(1, 2).contiguous()
-
-        # queries, keys, values
-
-        query = self.to_queries(x)
-        key = self.to_keys(context)
-        value = self.to_values(context)
-
-        # split heads
-
        def split_heads(t):
            return t.unflatten(-1, (self.heads, self.dim_head))
Author	SHA1	Message	Date
lucidrains	29ac8e143c	fix when video time seq len less than max time seq len for video acceptor	2025-07-27 09:00:56 -07:00
lucidrains	e05cd6d8b8	some models only return embeddings with some kwarg on forward	2025-07-27 08:46:43 -07:00
lucidrains	b46233c3d6	need to be able to invoke with eval no grad	2025-07-27 08:25:58 -07:00
lucidrains	68e13a3c7d	bit more flexible	2025-07-27 08:14:48 -07:00
lucidrains	b22dc0ecd2	add a wrapper for accepting video and processing the images individually, optionally able to add time positional embeddings - for use in two robotics work	2025-07-27 08:05:48 -07:00
lucidrains	db05a141a6	add the proposed jumbo vit from Fuller et al. of Carleton University	2025-03-05 10:50:34 -08:00
lucidrains	9f49a31977	1.9.2	2025-01-19 05:53:11 -08:00
JacobLinCool	ab63fc9cc8	remove duplicated qkv computation in na_vit_nested_tensor_3d.py (#341 )	2025-01-19 05:52:46 -08:00