1.7.5

add the u-vit implementation with simple vit + register tokens
attention re-use in lookup vit should use pre-softmax attention matrix
2026-05-09 09:02:31 +00:00 · 2024-08-07 08:46:18 -07:00 · 2024-08-07 08:45:57 -07:00 · 2024-07-19 19:23:38 -07:00 · 2024-07-19 10:23:12 -07:00
4 changed files with 213 additions and 15 deletions
--- a/README.md
+++ b/README.md
@@ -2081,6 +2081,17 @@ Coming from computer vision and new to transformers? Here are some resources tha
 }
 ```

+```bibtex
+@article{Bao2022AllAW,
+    title   = {All are Worth Words: A ViT Backbone for Diffusion Models},
+    author  = {Fan Bao and Shen Nie and Kaiwen Xue and Yue Cao and Chongxuan Li and Hang Su and Jun Zhu},
+    journal = {2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+    year    = {2022},
+    pages   = {22669-22679},
+    url     = {https://api.semanticscholar.org/CorpusID:253581703}
+}
+```
+
 ```bibtex
@misc{Rubin2024,
    author  = {Ohad Rubin},
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@ with open('README.md') as f:
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '1.7.2',
+  version = '1.7.5',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  long_description=long_description,
--- a/vit_pytorch/look_vit.py
+++ b/vit_pytorch/look_vit.py
@@ -66,6 +66,7 @@ class Attention(Module):
        heads = 8,
        dim_head = 64,
        dropout = 0.,
+        cross_attend = False,
        reuse_attention = False
    ):
        super().__init__()
@@ -74,10 +75,13 @@ class Attention(Module):
        self.scale = dim_head ** -0.5
        self.heads = heads
        self.reuse_attention = reuse_attention
+        self.cross_attend = cross_attend

        self.split_heads = Rearrange('b n (h d) -> b h n d', h = heads)

        self.norm = LayerNorm(dim) if not reuse_attention else nn.Identity()
+        self.norm_context = LayerNorm(dim) if cross_attend else nn.Identity()
+
        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

@@ -95,11 +99,17 @@ class Attention(Module):
        self,
        x,
        context = None,
-        return_attn = False,
-        attn = None
+        return_qk_sim = False,
+        qk_sim = None
    ):
        x = self.norm(x)
-        context = default(context, x)
+
+        assert not (exists(context) ^ self.cross_attend)
+
+        if self.cross_attend:
+            context = self.norm_context(context)
+        else:
+            context = x

        v = self.to_v(context)
        v = self.split_heads(v)
@@ -109,20 +119,21 @@ class Attention(Module):
            q, k = tuple(self.split_heads(t) for t in qk)

            q = q * self.scale
-            sim = einsum(q, k, 'b h i d, b h j d -> b h i j')
+            qk_sim = einsum(q, k, 'b h i d, b h j d -> b h i j')

-            attn = self.attend(sim)
-            attn = self.dropout(attn)
        else:
-            assert exists(attn), 'attention matrix must be passed in for reusing previous attention'
+            assert exists(qk_sim), 'qk sim matrix must be passed in for reusing previous attention'
+
+        attn = self.attend(qk_sim)
+        attn = self.dropout(attn)

        out = einsum(attn, v, 'b h i j, b h j d -> b h i d')
        out = self.to_out(out)

-        if not return_attn:
+        if not return_qk_sim:
            return out

-        return out, attn
+        return out, qk_sim

 # LookViT

@@ -179,8 +190,8 @@ class LookViT(Module):
            layers.append(ModuleList([
                Attention(dim = dim, dim_head = dim_head, heads = heads, dropout = dropout),
                MLP(dim = dim, factor = mlp_factor, dropout = dropout),
-                Attention(dim = dim, dim_head = cross_attn_dim_head, heads = cross_attn_heads, dropout = dropout),
-                Attention(dim = dim, dim_head = cross_attn_dim_head, heads = cross_attn_heads, dropout = dropout, reuse_attention = True),
+                Attention(dim = dim, dim_head = cross_attn_dim_head, heads = cross_attn_heads, dropout = dropout, cross_attend = True),
+                Attention(dim = dim, dim_head = cross_attn_dim_head, heads = cross_attn_heads, dropout = dropout, cross_attend = True, reuse_attention = True),
                LayerNorm(dim),
                MLP(dim = dim, factor = highres_mlp_factor, dropout = dropout)
            ]))
@@ -218,7 +229,7 @@ class LookViT(Module):

            # main tokens cross attends (lookup) on the high res tokens

-            lookup_out, lookup_attn = lookup_cross_attn(tokens, highres_tokens, return_attn = True)  # return attention as they reuse the attention matrix
+            lookup_out, qk_sim = lookup_cross_attn(tokens, highres_tokens, return_qk_sim = True)  # return the pre-softmax qk similarity, which the reverse cross attention below reuses
            tokens = lookup_out + tokens

            tokens = attn(tokens) + tokens
@@ -226,9 +237,9 @@ class LookViT(Module):

            # attention-reuse

-            lookup_attn = rearrange(lookup_attn, 'b h i j -> b h j i') # transpose for reverse cross attention
+            qk_sim = rearrange(qk_sim, 'b h i j -> b h j i') # transpose for reverse cross attention

-            highres_tokens = highres_attn(highres_tokens, tokens, attn = lookup_attn) + highres_tokens
+            highres_tokens = highres_attn(highres_tokens, tokens, qk_sim = qk_sim) + highres_tokens
            highres_tokens = highres_norm(highres_tokens)

            highres_tokens = highres_mlp(highres_tokens) + highres_tokens
--- a/vit_pytorch/simple_uvit.py
+++ b/vit_pytorch/simple_uvit.py
@@ -0,0 +1,176 @@
+import torch
+from torch import nn
+from torch.nn import Module, ModuleList
+
+from einops import rearrange, repeat, pack, unpack
+from einops.layers.torch import Rearrange
+
+# helpers
+
+def pair(t):  # normalize a single size into a (t, t) tuple
+    return t if isinstance(t, tuple) else (t, t)  # tuples pass through unchanged
+
+def exists(v):  # True for anything other than None
+    return v is not None
+
+def divisible_by(num, den):  # True when num is an exact multiple of den
+    return (num % den) == 0
+
+def posemb_sincos_2d(h, w, dim, temperature: int = 10000, dtype = torch.float32):  # fixed 2d sin/cos position table with h * w rows and dim columns
+    y, x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")  # row / column index of every grid cell
+    assert divisible_by(dim, 4), "feature dimension must be multiple of 4 for sincos emb"
+    omega = torch.arange(dim // 4) / (dim // 4 - 1)  # evenly spaced in [0, 1]; NOTE(review): dim == 4 makes the divisor 0 (0 / 0 -> nan)
+    omega = temperature ** -omega  # frequencies from 1 down to 1 / temperature
+
+    y = y.flatten()[:, None] * omega[None, :]  # (h * w, dim // 4): every position times every frequency
+    x = x.flatten()[:, None] * omega[None, :]
+    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim=1)  # four groups of dim // 4 columns -> dim columns
+    return pe.type(dtype)
+
+# classes
+
+def FeedForward(dim, hidden_dim):  # builds a pre-norm two-layer MLP; the residual add is done by the caller
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, hidden_dim),
+        nn.GELU(),
+        nn.Linear(hidden_dim, dim),  # project back to dim so the output matches the input width
+    )    
+
+class Attention(Module):  # multi-head self-attention with a pre-norm and bias-free projections
+    def __init__(self, dim, heads = 8, dim_head = 64):
+        super().__init__()
+        inner_dim = dim_head *  heads  # total width across all heads
+        self.heads = heads
+        self.scale = dim_head ** -0.5  # 1 / sqrt(dim_head), applied to the dot products in forward
+        self.norm = nn.LayerNorm(dim)
+
+        self.attend = nn.Softmax(dim = -1)  # normalizes over the last (key) axis
+
+        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)  # one fused projection, chunked into q, k, v in forward
+        self.to_out = nn.Linear(inner_dim, dim, bias = False)
+
+    def forward(self, x):  # x is (b, n, dim) per the rearrange pattern below; output has the same shape
+        x = self.norm(x)
+
+        qkv = self.to_qkv(x).chunk(3, dim = -1)
+        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)  # split out the head axis
+
+        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale  # (b, h, n, n) scaled similarities
+
+        attn = self.attend(dots)
+
+        out = torch.matmul(attn, v)
+        out = rearrange(out, 'b h n d -> b n (h d)')  # merge the heads back into one feature axis
+        return self.to_out(out)
+
+class Transformer(Module):  # U-shaped stack: first-half layers save their input, latter-half layers merge it back in
+    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
+        super().__init__()
+        self.depth = depth
+        self.norm = nn.LayerNorm(dim)  # final norm applied after the last layer
+        self.layers = ModuleList([])
+
+        for layer in range(1, depth + 1):  # 1-indexed layer number
+            latter_half = layer >= (depth / 2 + 1)  # with an odd depth the middle layer falls in neither half
+
+            self.layers.append(nn.ModuleList([
+                nn.Linear(dim * 2, dim) if latter_half else None,  # projects concat(skip, x) back to dim; None when no skip is merged
+                Attention(dim, heads = heads, dim_head = dim_head),
+                FeedForward(dim, mlp_dim)
+            ]))
+
+    def forward(self, x):
+
+        skips = []  # stack of saved inputs; the last one saved is the first one merged
+
+        for ind, (combine_skip, attn, ff) in enumerate(self.layers):
+            layer = ind + 1
+            first_half = layer <= (self.depth / 2)  # selects as many layers as latter_half does in __init__
+
+            if first_half:
+                skips.append(x)  # save the layer input, before attention / feedforward
+
+            if exists(combine_skip):
+                skip = skips.pop()
+                skip_and_x = torch.cat((skip, x), dim = -1)  # concat on the feature axis -> dim * 2
+                x = combine_skip(skip_and_x)
+
+            x = attn(x) + x  # residual connections
+            x = ff(x) + x
+
+        assert len(skips) == 0  # every saved skip must have been consumed
+
+        return self.norm(x)
+
+class SimpleUViT(Module):  # image classifier: patch embedding + fixed sincos positions + register tokens + the skip-connected Transformer
+    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, num_register_tokens = 4, channels = 3, dim_head = 64):
+        super().__init__()
+        image_height, image_width = pair(image_size)  # an int or a (height, width) tuple is accepted
+        patch_height, patch_width = pair(patch_size)
+
+        assert divisible_by(image_height, patch_height) and divisible_by(image_width, patch_width), 'Image dimensions must be divisible by the patch size.'
+
+        patch_dim = channels * patch_height * patch_width  # flattened size of one patch
+
+        self.to_patch_embedding = nn.Sequential(
+            Rearrange("b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1 = patch_height, p2 = patch_width),  # cut the image into flattened patches
+            nn.LayerNorm(patch_dim),
+            nn.Linear(patch_dim, dim),
+            nn.LayerNorm(dim),
+        )
+
+        pos_embedding = posemb_sincos_2d(  # one row per patch of the grid
+            h = image_height // patch_height,
+            w = image_width // patch_width,
+            dim = dim
+        )
+
+        self.register_buffer('pos_embedding', pos_embedding, persistent = False)  # non-persistent: kept out of the state dict
+
+        self.register_tokens = nn.Parameter(torch.randn(num_register_tokens, dim))  # learned extra tokens, shared across the batch
+
+        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)
+
+        self.pool = "mean"  # stored only; forward below always mean-pools and does not read it
+        self.to_latent = nn.Identity()
+
+        self.linear_head = nn.Linear(dim, num_classes)
+
+    def forward(self, img):  # img is (b, c, H, W) per the Rearrange pattern; returns (b, num_classes)
+        batch, device = img.shape[0], img.device  # NOTE(review): device is not used below
+
+        x = self.to_patch_embedding(img)
+        x = x + self.pos_embedding.type(x.dtype)  # positions are added to patch tokens only
+
+        r = repeat(self.register_tokens, 'n d -> b n d', b = batch)
+
+        x, ps = pack([x, r], 'b * d')  # append the register tokens after the patch tokens
+
+        x = self.transformer(x)
+
+        x, _ = unpack(x, ps, 'b * d')  # drop the register tokens, keep the patch tokens
+
+        x = x.mean(dim = 1)  # mean over the patch tokens
+
+        x = self.to_latent(x)
+        return self.linear_head(x)
+
+# quick test on odd number of layers
+
+if __name__ == '__main__':  # smoke test; the .cuda() calls below need a CUDA device
+
+    v = SimpleUViT(
+        image_size = 256,
+        patch_size = 32,
+        num_classes = 1000,
+        dim = 1024,
+        depth = 7,  # odd depth: layers 1-3 save skips, layer 4 does neither, layers 5-7 merge them
+        heads = 16,
+        mlp_dim = 2048
+    ).cuda()
+
+    img = torch.randn(2, 3, 256, 256).cuda()  # batch of 2 three-channel 256 x 256 images
+
+    preds = v(img)
+    assert preds.shape == (2, 1000)  # (batch, num_classes)
Author	SHA1	Message	Date
Phil Wang	4f22eae631	1.7.5	2024-08-07 08:46:18 -07:00
Phil Wang	dfc8df6713	add the u-vit implementation with simple vit + register tokens	2024-08-07 08:45:57 -07:00
lucidrains	9992a615d1	attention re-use in lookup vit should use pre-softmax attention matrix	2024-07-19 19:23:38 -07:00
Phil Wang	4b2c00cb63	when cross attending in look vit, make sure context tokens are normalized	2024-07-19 10:23:12 -07:00