add CaiT, new vision transformer out of facebook AI, complete with layerscale, talking heads, and cls -> patch cross attention

2025-12-30 08:02:29 +00:00 · 2021-03-31 22:38:53 -07:00
4 changed files with 189 additions and 8 deletions
--- a/README.md
+++ b/README.md
@@ -143,6 +143,36 @@ img = torch.randn(1, 3, 256, 256)
 preds = v(img) # (1, 1000)
 ```

+## CaiT
+
+<a href="https://arxiv.org/abs/2103.17239">This paper</a> also notes difficulty in training vision transformers at greater depths and proposes two solutions. First, it proposes a learned per-channel multiplication of the output of each residual block (LayerScale). Second, it proposes to have the patches attend only to one another, and to let the CLS token attend to the patches only in the last few layers.
+
+They also add <a href="https://github.com/lucidrains/x-transformers#talking-heads-attention">Talking Heads</a> attention, noting that it improves results.
+
+You can use this scheme as follows
+
+```python
+import torch
+from vit_pytorch.cait import CaiT
+
+v = CaiT(
+    image_size = 256,
+    patch_size = 32,
+    num_classes = 1000,
+    dim = 1024,
+    depth = 12,         # depth of transformer for patch to patch attention only
+    cls_depth = 2,      # depth of cross attention of CLS tokens to patch
+    heads = 16,
+    mlp_dim = 2048,
+    dropout = 0.1,
+    emb_dropout = 0.1
+)
+
+img = torch.randn(1, 3, 256, 256)
+
+preds = v(img) # (1, 1000)
+```
+
 ## Token-to-Token ViT

 <img src="./images/t2t.png" width="400px"></img>
@@ -164,7 +194,8 @@ v = T2TViT(
 )

 img = torch.randn(1, 3, 224, 224)
-v(img) # (1, 1000)
+
+preds = v(img) # (1, 1000)
 ```

 ## Cross ViT
@@ -177,7 +208,7 @@ v(img) # (1, 1000)
 import torch
 from vit_pytorch.cross_vit import CrossViT

-model = CrossViT(
+v = CrossViT(
    image_size = 256,
    num_classes = 1000,
    depth = 4,               # number of multi-scale encoding blocks
@@ -199,7 +230,7 @@ model = CrossViT(

 img = torch.randn(1, 3, 256, 256)

-pred = model(img) # (1, 1000)
+pred = v(img) # (1, 1000)
 ```

 ## PiT
@@ -212,7 +243,7 @@ pred = model(img) # (1, 1000)
 import torch
 from vit_pytorch.pit import PiT

-p = PiT(
+v = PiT(
    image_size = 224,
    patch_size = 14,
    dim = 256,
@@ -228,7 +259,7 @@ p = PiT(

 img = torch.randn(1, 3, 224, 224)

-preds = p(img) # (1, 1000)
+preds = v(img) # (1, 1000)
 ```

 ## CvT
@@ -241,7 +272,7 @@ preds = p(img) # (1, 1000)
 import torch
 from vit_pytorch.cvt import CvT

-model = CvT(
+v = CvT(
    num_classes = 1000,
    s1_emb_dim = 64,        # stage 1 - dimension
    s1_emb_kernel = 7,      # stage 1 - conv kernel
@@ -272,7 +303,7 @@ model = CvT(

 img = torch.randn(1, 3, 224, 224)

-pred = model(img) # (1, 1000)
+pred = v(img) # (1, 1000)
 ```

 ## Masked Patch Prediction
--- a/images/cait.png
+++ b/images/cait.png
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '0.12.0',
+  version = '0.14.0',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  author = 'Phil Wang',
--- a/vit_pytorch/cait.py
+++ b/vit_pytorch/cait.py
@@ -0,0 +1,150 @@
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+
+from einops import rearrange, repeat
+from einops.layers.torch import Rearrange
+
+# helpers
+
+def exists(val):
+    return val is not None  # True for every value except None, including falsy ones such as 0 or ''
+
+# classes
+
+class LayerScale(nn.Module):
+    """Wraps `fn` and multiplies its output by a learned scale of shape (1, 1, dim), every entry starting at `init_eps`."""
+    def __init__(self, dim, fn, init_eps = 0.1):
+        super().__init__()
+        self.scale = nn.Parameter(torch.zeros(1, 1, dim).fill_(init_eps))
+        self.fn = fn
+    def forward(self, x, **kwargs):
+        return self.scale * self.fn(x, **kwargs)
+
+class PreNorm(nn.Module):
+    """Applies nn.LayerNorm(dim) to `x`, then calls `fn`; keyword arguments reach `fn` unchanged (they are not normalised)."""
+    def __init__(self, dim, fn):
+        super().__init__()
+        self.norm, self.fn = nn.LayerNorm(dim), fn
+    def forward(self, x, **kwargs):
+        return self.fn(self.norm(x), **kwargs)
+
+class FeedForward(nn.Module):
+    """MLP applied as Linear(dim -> hidden_dim), GELU, dropout, Linear(hidden_dim -> dim), dropout."""
+    def __init__(self, dim, hidden_dim, dropout = 0.):
+        super().__init__()
+        expand = nn.Linear(dim, hidden_dim)
+        project = nn.Linear(hidden_dim, dim)
+        self.net = nn.Sequential(
+            expand, nn.GELU(), nn.Dropout(dropout),
+            project, nn.Dropout(dropout),
+        )
+    def forward(self, x):
+        return self.net(x)
+
+class Attention(nn.Module):
+    """Multi-head attention with talking-heads mixing; queries come from `x`, keys/values from `x` plus optional `context`."""
+    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
+        super().__init__()
+        inner_dim = heads * dim_head
+        self.heads = heads
+        self.scale = dim_head ** -0.5
+        # queries and keys/values use separate projections because they may cover different token sets
+        self.to_q = nn.Linear(dim, inner_dim, bias = False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)
+
+        self.attend = nn.Softmax(dim = -1)
+        # learned (heads, heads) matrices that mix information across heads before and after the softmax
+        self.mix_heads_pre_attn = nn.Parameter(torch.randn(heads, heads))
+        self.mix_heads_post_attn = nn.Parameter(torch.randn(heads, heads))
+
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, dim),
+            nn.Dropout(dropout)
+        )
+
+    def forward(self, x, context = None):
+        batch, seq_len, _, heads = *x.shape, self.heads   # the unpacking requires x to have exactly three dimensions
+        # keys/values are computed from x alone, or from x followed by `context` along dim 1
+        kv_source = torch.cat((x, context), dim = 1) if exists(context) else x
+
+        queries = self.to_q(x)
+        keys, values = self.to_kv(kv_source).chunk(2, dim = -1)
+        q, k, v = (rearrange(t, 'b n (h d) -> b h n d', h = heads) for t in (queries, keys, values))
+
+        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
+        sim = einsum('b h i j, h g -> b g i j', sim, self.mix_heads_pre_attn)            # talking heads, pre-softmax
+        weights = self.attend(sim)
+        weights = einsum('b h i j, h g -> b g i j', weights, self.mix_heads_post_attn)   # talking heads, post-softmax
+
+        mixed = einsum('b h i j, b h j d -> b h i d', weights, v)
+        return self.to_out(rearrange(mixed, 'b h n d -> b n (h d)'))
+
+class Transformer(nn.Module):
+    """Stack of `depth` residual layers: LayerScale(PreNorm(Attention)) followed by LayerScale(PreNorm(FeedForward))."""
+    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
+        super().__init__()
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            attention = Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)
+            feed_forward = FeedForward(dim, mlp_dim, dropout = dropout)
+            self.layers.append(nn.ModuleList([LayerScale(dim, PreNorm(dim, attention)), LayerScale(dim, PreNorm(dim, feed_forward))]))
+    def forward(self, x, context = None):
+        for attn, ff in self.layers:   # the same `context` is handed to every attention layer
+            x = x + attn(x, context = context)
+            x = x + ff(x)
+        return x
+
+class CaiT(nn.Module):
+    """Image classifier: a patch-only transformer, then a class-token transformer that attends to the patch outputs."""
+    def __init__(
+        self, *,
+        image_size,
+        patch_size,
+        num_classes,
+        dim,
+        depth,
+        cls_depth,
+        heads,
+        mlp_dim,
+        dim_head = 64,
+        dropout = 0.,
+        emb_dropout = 0.
+    ):
+        super().__init__()
+        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
+        num_patches = (image_size // patch_size) ** 2
+        patch_dim = 3 * patch_size ** 2
+        # flatten every patch_size x patch_size patch (3 input channels are assumed) and project it to `dim`
+        self.to_patch_embedding = nn.Sequential(
+            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
+            nn.Linear(patch_dim, dim),
+        )
+        # one learned position per patch; the class token receives no positional embedding
+        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, dim))
+        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
+        self.dropout = nn.Dropout(emb_dropout)
+        # `depth` layers over the patches alone, then `cls_depth` layers whose query is the class token
+        self.patch_transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)
+        self.cls_transformer = Transformer(dim, cls_depth, heads, dim_head, mlp_dim, dropout)
+        # identity module kept as an attribute; forward does not call it
+        self.to_latent = nn.Identity()
+
+        self.mlp_head = nn.Sequential(
+            nn.LayerNorm(dim),
+            nn.Linear(dim, num_classes)
+        )
+
+    def forward(self, img):
+        x = self.to_patch_embedding(img)
+        b, n, _ = x.shape
+        # pos_embedding has exactly one row per patch and x holds no class token yet, so take the first n rows
+        x += self.pos_embedding[:, :n]
+        x = self.dropout(x)
+
+        x = self.patch_transformer(x)
+        # the class token enters only here: it is the query, the patch outputs are passed as context
+        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
+        x = self.cls_transformer(cls_tokens, context = x)
+
+        return self.mlp_head(x[:, 0])