vit_pytorch/efficient.py

import torch
from torch import nn
from einops import rearrange, repeat
from einops.layers.torch import Rearrange

def pair(t):
    """Return ``t`` unchanged when it is a tuple, otherwise the 2-tuple ``(t, t)``."""
    if isinstance(t, tuple):
        return t
    return (t, t)

class ViT(nn.Module):
    """Vision Transformer wrapper around a caller-supplied sequence model.

    Images are cut into non-overlapping patches; each patch is flattened,
    layer-normed, linearly projected to ``dim`` and layer-normed again. A
    learned class token is prepended, learned positional embeddings are added,
    and the sequence is handed to ``transformer``. The pooled result goes
    through a LayerNorm + Linear classification head.

    Args:
        image_size: Maximum image size, either an int (square) or a
            ``(height, width)`` tuple. Determines how many positional
            embeddings are allocated.
        patch_size: Patch size, either an int (square) or a
            ``(height, width)`` tuple. Must divide the image size.
        num_classes: Output width of the classification head.
        dim: Token embedding dimension.
        transformer: Module called on the token sequence (class token plus
            patch tokens, last dimension ``dim``). Its output is pooled along
            dim 1 and fed to ``LayerNorm(dim)``, so it is expected to keep the
            last dimension equal to ``dim``.
        pool: ``'cls'`` to use the first output token, ``'mean'`` to average
            the transformer's output over dim 1.
        channels: Number of input image channels.
    """

    def __init__(self, *, image_size, patch_size, num_classes, dim, transformer, pool = 'cls', channels = 3):
        super().__init__()
        image_size_h, image_size_w = pair(image_size)
        # Accept rectangular patches the same way rectangular images are accepted;
        # an int patch_size still yields square patches exactly as before.
        patch_size_h, patch_size_w = pair(patch_size)
        assert image_size_h % patch_size_h == 0 and image_size_w % patch_size_w == 0, 'image dimensions must be divisible by the patch size'
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'
        num_patches = (image_size_h // patch_size_h) * (image_size_w // patch_size_w)
        patch_dim = channels * patch_size_h * patch_size_w

        self.to_patch_embedding = nn.Sequential(
            # (b, c, H, W) -> (b, num_patches, patch_dim): one flattened vector per patch
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size_h, p2 = patch_size_w),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim)
        )

        # One positional embedding per patch plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.transformer = transformer

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Run the model on a batch of images.

        Args:
            img: Tensor laid out as ``(batch, channels, height, width)`` whose
                height and width are divisible by the patch size. It may
                produce fewer patches than the constructor's ``image_size``
                (only the first ``n + 1`` positional embeddings are used),
                but not more.

        Returns:
            The output of ``mlp_head`` on the pooled transformer output;
            ``(batch, num_classes)`` provided ``transformer`` preserves the
            batch and embedding dimensions.

        Raises:
            RuntimeError: If the image yields more patches than there are
                positional embeddings.
        """
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        # Without this check an oversized image fails below with an opaque
        # broadcasting error; fail early with an actionable message instead.
        max_patches = self.pos_embedding.shape[1] - 1
        if n > max_patches:
            raise RuntimeError(
                f'input image produces {n} patches but the model only has positional '
                f'embeddings for {max_patches}; pass an image no larger than `image_size`'
            )

        # Broadcast the single learned class token across the batch and prepend it.
        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        # Slice so that inputs with fewer patches than the maximum still line up.
        x += self.pos_embedding[:, :(n + 1)]
        x = self.transformer(x)

        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00			`import torch`
			`from torch import nn`
make sure distillation still works 2021-02-21 19:08:18 -08:00			`from einops import rearrange, repeat`
			`from einops.layers.torch import Rearrange`
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00
allow for rectangular images for efficient adapter 2022-01-31 08:55:31 -08:00			`def pair(t):`
			`return t if isinstance(t, tuple) else (t, t)`

write up example for using efficient transformers 2020-10-07 19:15:21 -07:00			`class ViT(nn.Module):`
allow for mean pool with efficient version too 2020-12-23 18:15:40 -08:00			`def __init__(self, *, image_size, patch_size, num_classes, dim, transformer, pool = 'cls', channels = 3):`
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00			`super().__init__()`
allow for rectangular images for efficient adapter 2022-01-31 08:55:31 -08:00			`image_size_h, image_size_w = pair(image_size)`
			`assert image_size_h % patch_size == 0 and image_size_w % patch_size == 0, 'image dimensions must be divisible by the patch size'`
allow for mean pool with efficient version too 2020-12-23 18:15:40 -08:00			`assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'`
allow for rectangular images for efficient adapter 2022-01-31 08:55:31 -08:00			`num_patches = (image_size_h // patch_size) * (image_size_w // patch_size)`
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00			`patch_dim = channels * patch_size ** 2`

make sure distillation still works 2021-02-21 19:08:18 -08:00			`self.to_patch_embedding = nn.Sequential(`
			`Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),`
double down on dual patch norm, fix MAE and Simmim to be compatible with dual patchnorm 2023-02-10 10:39:50 -08:00			`nn.LayerNorm(patch_dim),`
make sure distillation still works 2021-02-21 19:08:18 -08:00			`nn.Linear(patch_dim, dim),`
double down on dual patch norm, fix MAE and Simmim to be compatible with dual patchnorm 2023-02-10 10:39:50 -08:00			`nn.LayerNorm(dim)`
make sure distillation still works 2021-02-21 19:08:18 -08:00			`)`
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00
			`self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))`
			`self.cls_token = nn.Parameter(torch.randn(1, 1, dim))`
			`self.transformer = transformer`

allow for mean pool with efficient version too 2020-12-23 18:15:40 -08:00			`self.pool = pool`
			`self.to_latent = nn.Identity()`
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00
			`self.mlp_head = nn.Sequential(`
norm cls token before sending to mlp head 2020-10-10 12:08:42 -07:00			`nn.LayerNorm(dim),`
simplify mlp head 2020-12-07 14:31:50 -08:00			`nn.Linear(dim, num_classes)`
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00			`)`

			`def forward(self, img):`
make sure distillation still works 2021-02-21 19:08:18 -08:00			`x = self.to_patch_embedding(img)`
allow for training on different image sizes, provided images are smaller than what was passed as `image_size` keyword on init 2020-10-25 13:17:42 -07:00			`b, n, _ = x.shape`
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00
use einops repeat 2020-10-28 18:13:57 -07:00			`cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)`
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00			`x = torch.cat((cls_tokens, x), dim=1)`
allow for training on different image sizes, provided images are smaller than what was passed as `image_size` keyword on init 2020-10-25 13:17:42 -07:00			`x += self.pos_embedding[:, :(n + 1)]`
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00			`x = self.transformer(x)`

allow for mean pool with efficient version too 2020-12-23 18:15:40 -08:00			`x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]`

			`x = self.to_latent(x)`
write up example for using efficient transformers 2020-10-07 19:15:21 -07:00			`return self.mlp_head(x)`