add vit with patch dropout, fully embrace structured dropout as multiple papers are now corroborating each other

2026-01-05 20:42:29 +00:00 · 2022-12-02 09:22:08 -08:00
30 changed files with 32 additions and 1009 deletions
--- a/.github/workflows/python-test.yml
+++ b/.github/workflows/python-test.yml
@@ -27,7 +27,6 @@ jobs:
      run: |
        python -m pip install --upgrade pip
        python -m pip install pytest
-        python -m pip install wheel
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Test with pytest
      run: |
--- a/README.md
+++ b/README.md
@@ -7,7 +7,6 @@
 - [Usage](#usage)
 - [Parameters](#parameters)
 - [Simple ViT](#simple-vit)
- [NaViT](#navit)
 - [Distillation](#distillation)
 - [Deep ViT](#deep-vit)
 - [CaiT](#cait)
@@ -28,7 +27,6 @@
 - [Masked Autoencoder](#masked-autoencoder)
 - [Simple Masked Image Modeling](#simple-masked-image-modeling)
 - [Masked Patch Prediction](#masked-patch-prediction)
- [Masked Position Prediction](#masked-position-prediction)
 - [Adaptive Token Sampling](#adaptive-token-sampling)
 - [Patch Merger](#patch-merger)
 - [Vision Transformer for Small Datasets](#vision-transformer-for-small-datasets)
@@ -140,44 +138,6 @@ img = torch.randn(1, 3, 256, 256)
 preds = v(img) # (1, 1000)
 ```

-## NaViT
-
-<img src="./images/na_vit.png" width="450px"></img>
-
-<a href="https://arxiv.org/abs/2307.06304">This paper</a> proposes to leverage the flexibility of attention and masking for variable-length sequences to train on images of multiple resolutions, packed into a single batch. They demonstrate much faster training and improved accuracy, with the only cost being extra complexity in the architecture and data loading. They use factorized 2d positional encodings, token dropping, as well as query-key normalization.
-
-You can use it as follows
-
-```python
-import torch
-from vit_pytorch.na_vit import NaViT
-
-v = NaViT(
-    image_size = 256,
-    patch_size = 32,
-    num_classes = 1000,
-    dim = 1024,
-    depth = 6,
-    heads = 16,
-    mlp_dim = 2048,
-    dropout = 0.1,
-    emb_dropout = 0.1
-)
-
-# 5 images of different resolutions - List[List[Tensor]]
-
-# for now, you'll have to correctly place images in the same batch element so as not to exceed the maximum allowed sequence length for self-attention w/ masking
-
-images = [
-    [torch.randn(3, 256, 256), torch.randn(3, 128, 128)],
-    [torch.randn(3, 128, 256), torch.randn(3, 256, 128)],
-    [torch.randn(3, 64, 256)]
-]
-
-preds = v(images) # (5, 1000) - 5, because 5 images of different resolution above
-
-```
-
 ## Distillation

 <img src="./images/distill.png" width="300px"></img>
@@ -343,7 +303,7 @@ cct = CCT(
    pooling_padding = 1,
    num_layers = 14,
    num_heads = 6,
-    mlp_ratio = 3.,
+    mlp_radio = 3.,
    num_classes = 1000,
    positional_embedding = 'learnable', # ['sine', 'learnable', 'none']
 )
@@ -884,44 +844,6 @@ for _ in range(100):
 torch.save(model.state_dict(), './pretrained-net.pt')
 ```

-## Masked Position Prediction
-
-<img src="./images/mp3.png" width="400px"></img>
-
-A new <a href="https://arxiv.org/abs/2207.07611">paper</a> introduces a masked position prediction pre-training criterion. This strategy is more efficient than the Masked Autoencoder strategy and has comparable performance.
-
-```python
-import torch
-from vit_pytorch.mp3 import ViT, MP3
-
-v = ViT(
-    num_classes = 1000,
-    image_size = 256,
-    patch_size = 8,
-    dim = 1024,
-    depth = 6,
-    heads = 8,
-    mlp_dim = 2048,
-    dropout = 0.1,
-)
-
-mp3 = MP3(
-    vit = v,
-    masking_ratio = 0.75
-)
-
-images = torch.randn(8, 3, 256, 256)
-
-loss = mp3(images)
-loss.backward()
-
-# that's all!
-# do the above in a for loop many times with a lot of images and your vision transformer will learn
-
-# save your improved vision transformer
-torch.save(v.state_dict(), './trained-vit.pt')
-```
-
 ## Adaptive Token Sampling

 <img src="./images/ats.png" width="400px"></img>
@@ -1121,7 +1043,7 @@ cct = CCT(
    pooling_padding = 1,
    num_layers = 14,
    num_heads = 6,
-    mlp_ratio = 3.,
+    mlp_radio = 3.,
    num_classes = 1000,
    positional_embedding = 'learnable'
 )
@@ -1951,36 +1873,6 @@ Coming from computer vision and new to transformers? Here are some resources tha
 }
 ```

-```bibtex
-@article{Liu2022PatchDropoutEV,
-    title   = {PatchDropout: Economizing Vision Transformers Using Patch Dropout},
-    author  = {Yue Liu and Christos Matsoukas and Fredrik Strand and Hossein Azizpour and Kevin Smith},
-    journal = {ArXiv},
-    year    = {2022},
-    volume  = {abs/2208.07220}
-}
-```
-
-```bibtex
-@misc{https://doi.org/10.48550/arxiv.2302.01327,
-    doi     = {10.48550/ARXIV.2302.01327},
-    url     = {https://arxiv.org/abs/2302.01327},
-    author  = {Kumar, Manoj and Dehghani, Mostafa and Houlsby, Neil},
-    title   = {Dual PatchNorm},
-    publisher = {arXiv},
-    year    = {2023},
-    copyright = {Creative Commons Attribution 4.0 International}
-}
-```
-
-```bibtex
-@inproceedings{Dehghani2023PatchNP,
-    title   = {Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution},
-    author  = {Mostafa Dehghani and Basil Mustafa and Josip Djolonga and Jonathan Heek and Matthias Minderer and Mathilde Caron and Andreas Steiner and Joan Puigcerver and Robert Geirhos and Ibrahim M. Alabdulmohsin and Avital Oliver and Piotr Padlewski and Alexey A. Gritsenko and Mario Luvci'c and Neil Houlsby},
-    year    = {2023}
-}
-```
-
 ```bibtex
@misc{vaswani2017attention,
    title   = {Attention Is All You Need},
@@ -1993,11 +1885,12 @@ Coming from computer vision and new to transformers? Here are some resources tha
 ```

 ```bibtex
-@inproceedings{dao2022flashattention,
-    title   = {Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
-    author  = {Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
-    booktitle = {Advances in Neural Information Processing Systems},
-    year    = {2022}
+@article{Liu2022PatchDropoutEV,
+    title   = {PatchDropout: Economizing Vision Transformers Using Patch Dropout},
+    author  = {Yue Liu and Christos Matsoukas and Fredrik Strand and Hossein Azizpour and Kevin Smith},
+    journal = {ArXiv},
+    year    = {2022},
+    volume  = {abs/2208.07220}
 }
 ```

--- a/images/mp3.png
+++ b/images/mp3.png
--- a/images/navit.png
+++ b/images/navit.png
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '1.2.6',
+  version = '0.40.1',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  long_description_content_type = 'text/markdown',
@@ -16,7 +16,7 @@ setup(
    'image recognition'
  ],
  install_requires=[
-    'einops>=0.6.1',
+    'einops>=0.6.0',
    'torch>=1.10',
    'torchvision'
  ],
--- a/vit_pytorch/init.py
+++ b/vit_pytorch/init.py
@@ -1,10 +1,3 @@
-import torch
-from packaging import version
-
-if version.parse(torch.__version__) >= version.parse('2.0.0'):
-    from einops._torch_specific import allow_ops_in_compiled_graph
-    allow_ops_in_compiled_graph()
-
 from vit_pytorch.vit import ViT
 from vit_pytorch.simple_vit import SimpleViT

--- a/vit_pytorch/ats_vit.py
+++ b/vit_pytorch/ats_vit.py
@@ -230,9 +230,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/cait.py
+++ b/vit_pytorch/cait.py
@@ -150,9 +150,7 @@ class CaiT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, dim))
--- a/vit_pytorch/cross_vit.py
+++ b/vit_pytorch/cross_vit.py
@@ -186,9 +186,7 @@ class ImageEmbedder(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/deepvit.py
+++ b/vit_pytorch/deepvit.py
@@ -105,9 +105,7 @@ class DeepViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/efficient.py
+++ b/vit_pytorch/efficient.py
@@ -17,9 +17,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/learnable_memory_vit.py
+++ b/vit_pytorch/learnable_memory_vit.py
@@ -118,9 +118,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/local_vit.py
+++ b/vit_pytorch/local_vit.py
@@ -126,9 +126,7 @@ class LocalViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/mae.py
+++ b/vit_pytorch/mae.py
@@ -24,11 +24,8 @@ class MAE(nn.Module):

        self.encoder = encoder
        num_patches, encoder_dim = encoder.pos_embedding.shape[-2:]
-
-        self.to_patch = encoder.to_patch_embedding[0]
-        self.patch_to_emb = nn.Sequential(*encoder.to_patch_embedding[1:])
-
-        pixel_values_per_patch = encoder.to_patch_embedding[2].weight.shape[-1]
+        self.to_patch, self.patch_to_emb = encoder.to_patch_embedding[:2]
+        pixel_values_per_patch = self.patch_to_emb.weight.shape[-1]

        # decoder parameters
        self.decoder_dim = decoder_dim
@@ -49,10 +46,7 @@ class MAE(nn.Module):
        # patch to encoder tokens and add positions

        tokens = self.patch_to_emb(patches)
-        if self.encoder.pool == "cls":
-            tokens += self.encoder.pos_embedding[:, 1:(num_patches + 1)]
-        elif self.encoder.pool == "mean":
-            tokens += self.encoder.pos_embedding.to(device, dtype=tokens.dtype) 
+        tokens = tokens + self.encoder.pos_embedding[:, 1:(num_patches + 1)]

        # calculate of patches needed to be masked, and get random indices, dividing it up for mask vs unmasked

--- a/vit_pytorch/mp3.py
+++ b/vit_pytorch/mp3.py
@@ -1,186 +0,0 @@
-import torch
-from torch import nn, einsum
-import torch.nn.functional as F
-
-from einops import rearrange, repeat
-from einops.layers.torch import Rearrange
-
-# helpers
-
-def exists(val):
-    return val is not None
-
-def default(val, d):
-    return val if exists(val) else d
-
-def pair(t):
-    return t if isinstance(t, tuple) else (t, t)
-
-# positional embedding
-
-def posemb_sincos_2d(patches, temperature = 10000, dtype = torch.float32):
-    _, h, w, dim, device, dtype = *patches.shape, patches.device, patches.dtype
-
-    y, x = torch.meshgrid(torch.arange(h, device = device), torch.arange(w, device = device), indexing = 'ij')
-    assert (dim % 4) == 0, 'feature dimension must be multiple of 4 for sincos emb'
-    omega = torch.arange(dim // 4, device = device) / (dim // 4 - 1)
-    omega = 1. / (temperature ** omega)
-
-    y = y.flatten()[:, None] * omega[None, :]
-    x = x.flatten()[:, None] * omega[None, :]
-    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim = 1)
-    return pe.type(dtype)
-
-# feedforward
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, hidden_dim, dropout = 0.):
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, hidden_dim),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(hidden_dim, dim),
-            nn.Dropout(dropout)
-        )
-    def forward(self, x):
-        return self.net(x)
-
-# (cross)attention
-
-class Attention(nn.Module):
-    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
-        super().__init__()
-        inner_dim = dim_head *  heads
-        self.heads = heads
-        self.scale = dim_head ** -0.5
-
-        self.attend = nn.Softmax(dim = -1)
-        self.dropout = nn.Dropout(dropout)
-
-        self.norm = nn.LayerNorm(dim)
-
-        self.to_q = nn.Linear(dim, inner_dim, bias = False)
-        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)
-
-        self.to_out = nn.Sequential(
-            nn.Linear(inner_dim, dim),
-            nn.Dropout(dropout)
-        )
-
-    def forward(self, x, context = None):
-        b, n, _, h = *x.shape, self.heads
-
-        x = self.norm(x)
-
-        context = self.norm(context) if exists(context) else x
-
-        qkv = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))
-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
-
-        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
-
-        attn = self.attend(dots)
-        attn = self.dropout(attn)
-
-        out = einsum('b h i j, b h j d -> b h i d', attn, v)
-        out = rearrange(out, 'b h n d -> b n (h d)')
-        return self.to_out(out)
-
-class Transformer(nn.Module):
-    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
-        super().__init__()
-        self.layers = nn.ModuleList([])
-        for _ in range(depth):
-            self.layers.append(nn.ModuleList([
-                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
-                FeedForward(dim, mlp_dim, dropout = dropout)
-            ]))
-    def forward(self, x, context = None):
-        for attn, ff in self.layers:
-            x = attn(x, context = context) + x
-            x = ff(x) + x
-        return x
-
-class ViT(nn.Module):
-    def __init__(self, *, num_classes, image_size, patch_size, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, dropout = 0.):
-        super().__init__()
-        image_height, image_width = pair(image_size)
-        patch_height, patch_width = pair(patch_size)
-
-        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'
-        
-        num_patches = (image_height // patch_height) * (image_width // patch_width)
-        patch_dim = channels * patch_height * patch_width
-
-        self.dim = dim
-        self.num_patches = num_patches
-
-        self.to_patch_embedding = nn.Sequential(
-            Rearrange('b c (h p1) (w p2) -> b h w (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
-            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
-        )
-
-        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)
-
-        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
-
-    def forward(self, img):
-        *_, h, w, dtype = *img.shape, img.dtype
-
-        x = self.to_patch_embedding(img)
-        pe = posemb_sincos_2d(x)
-        x = rearrange(x, 'b ... d -> b (...) d') + pe
-
-        x = self.transformer(x)
-        x = x.mean(dim = 1)
-
-        x = self.to_latent(x)
-        return self.linear_head(x)
-
-# Masked Position Prediction Pre-Training
-
-class MP3(nn.Module):
-    def __init__(self, vit: ViT, masking_ratio):
-        super().__init__()
-        self.vit = vit
-
-        assert masking_ratio > 0 and masking_ratio < 1, 'masking ratio must be kept between 0 and 1'
-        self.masking_ratio = masking_ratio
-
-        dim = vit.dim
-        self.mlp_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, vit.num_patches)
-        )
-
-    def forward(self, img):
-        device = img.device
-        tokens = self.vit.to_patch_embedding(img)
-        tokens = rearrange(tokens, 'b ... d -> b (...) d')
-
-        batch, num_patches, *_ = tokens.shape
-
-        # Masking
-        num_masked = int(self.masking_ratio * num_patches)
-        rand_indices = torch.rand(batch, num_patches, device = device).argsort(dim = -1)
-        masked_indices, unmasked_indices = rand_indices[:, :num_masked], rand_indices[:, num_masked:]
-
-        batch_range = torch.arange(batch, device = device)[:, None]
-        tokens_unmasked = tokens[batch_range, unmasked_indices]
-
-        attended_tokens = self.vit.transformer(tokens, tokens_unmasked)
-        logits = rearrange(self.mlp_head(attended_tokens), 'b n d -> (b n) d')
-        
-        # Define labels
-        labels = repeat(torch.arange(num_patches, device = device), 'n -> (b n)', b = batch)
-        loss = F.cross_entropy(logits, labels)
-
-        return loss
--- a/vit_pytorch/mpp.py
+++ b/vit_pytorch/mpp.py
@@ -96,9 +96,6 @@ class MPP(nn.Module):
        self.loss = MPPLoss(patch_size, channels, output_channel_bits,
                            max_pixel_val, mean, std)

-        # extract patching function
-        self.patch_to_emb = nn.Sequential(transformer.to_patch_embedding[1:])
-
        # output transformation
        self.to_bits = nn.Linear(dim, 2**(output_channel_bits * channels))

@@ -154,7 +151,7 @@ class MPP(nn.Module):
        masked_input[bool_mask_replace] = self.mask_token

        # linear embedding of patches
-        masked_input = self.patch_to_emb(masked_input)
+        masked_input = transformer.to_patch_embedding[-1](masked_input)

        # add cls token to input sequence
        b, n, _ = masked_input.shape
--- a/vit_pytorch/na_vit.py
+++ b/vit_pytorch/na_vit.py
@@ -1,305 +0,0 @@
-from functools import partial
-from typing import List
-
-import torch
-import torch.nn.functional as F
-from torch import nn, Tensor
-from torch.nn.utils.rnn import pad_sequence as orig_pad_sequence
-
-from einops import rearrange, repeat
-from einops.layers.torch import Rearrange
-
-# helpers
-
-def exists(val):
-    return val is not None
-
-def default(val, d):
-    return val if exists(val) else d
-
-def pair(t):
-    return t if isinstance(t, tuple) else (t, t)
-
-def divisible_by(numer, denom):
-    return (numer % denom) == 0
-
-# normalization
-# they use layernorm without bias, something that pytorch does not offer
-
-class LayerNorm(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.gamma = nn.Parameter(torch.ones(dim))
-        self.register_buffer('beta', torch.zeros(dim))
-
-    def forward(self, x):
-        return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
-
-# they use a query-key normalization that is equivalent to rms norm (no mean-centering, learned gamma), from vit 22B paper
-
-class RMSNorm(nn.Module):
-    def __init__(self, heads, dim):
-        super().__init__()
-        self.scale = dim ** 0.5
-        self.gamma = nn.Parameter(torch.ones(heads, 1, dim))
-
-    def forward(self, x):
-        normed = F.normalize(x, dim = -1)
-        return normed * self.scale * self.gamma
-
-# feedforward
-
-def FeedForward(dim, hidden_dim, dropout = 0.):
-    return nn.Sequential(
-        LayerNorm(dim),
-        nn.Linear(dim, hidden_dim),
-        nn.GELU(),
-        nn.Dropout(dropout),
-        nn.Linear(hidden_dim, dim),
-        nn.Dropout(dropout)
-    )
-
-class Attention(nn.Module):
-    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
-        super().__init__()
-        inner_dim = dim_head *  heads
-        self.heads = heads
-        self.norm = LayerNorm(dim)
-
-        self.q_norm = RMSNorm(heads, dim_head)
-        self.k_norm = RMSNorm(heads, dim_head)
-
-        self.attend = nn.Softmax(dim = -1)
-        self.dropout = nn.Dropout(dropout)
-
-        self.to_q = nn.Linear(dim, inner_dim, bias = False)
-        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)
-
-        self.to_out = nn.Sequential(
-            nn.Linear(inner_dim, dim, bias = False),
-            nn.Dropout(dropout)
-        )
-
-    def forward(
-        self,
-        x,
-        context = None,
-        mask = None,
-        attn_mask = None
-    ):
-        x = self.norm(x)
-        kv_input = default(context, x)
-
-        qkv = (self.to_q(x), *self.to_kv(kv_input).chunk(2, dim = -1))
-
-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-
-        dots = torch.matmul(q, k.transpose(-1, -2))
-
-        if exists(mask):
-            mask = rearrange(mask, 'b j -> b 1 1 j')
-            dots = dots.masked_fill(~mask, -torch.finfo(dots.dtype).max)
-
-        if exists(attn_mask):
-            dots = dots.masked_fill(~attn_mask, -torch.finfo(dots.dtype).max)
-
-        attn = self.attend(dots)
-        attn = self.dropout(attn)
-
-        out = torch.matmul(attn, v)
-        out = rearrange(out, 'b h n d -> b n (h d)')
-        return self.to_out(out)
-
-class Transformer(nn.Module):
-    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
-        super().__init__()
-        self.layers = nn.ModuleList([])
-        for _ in range(depth):
-            self.layers.append(nn.ModuleList([
-                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
-                FeedForward(dim, mlp_dim, dropout = dropout)
-            ]))
-
-        self.norm = LayerNorm(dim)
-
-    def forward(
-        self,
-        x,
-        mask = None,
-        attn_mask = None
-    ):
-        for attn, ff in self.layers:
-            x = attn(x, mask = mask, attn_mask = attn_mask) + x
-            x = ff(x) + x
-
-        return self.norm(x)
-
-class NaViT(nn.Module):
-    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
-        super().__init__()
-        image_height, image_width = pair(image_size)
-
-        assert divisible_by(image_height, patch_size) and divisible_by(image_width, patch_size), 'Image dimensions must be divisible by the patch size.'
-
-        patch_height_dim, patch_width_dim = (image_height // patch_size), (image_width // patch_size)
-        patch_dim = channels * (patch_size ** 2)
-
-        self.channels = channels
-        self.patch_size = patch_size
-
-        self.to_patch_embedding = nn.Sequential(
-            LayerNorm(patch_dim),
-            nn.Linear(patch_dim, dim),
-            LayerNorm(dim),
-        )
-
-        self.pos_embed_height = nn.Parameter(torch.randn(patch_height_dim, dim))
-        self.pos_embed_width = nn.Parameter(torch.randn(patch_width_dim, dim))
-
-        self.dropout = nn.Dropout(emb_dropout)
-
-        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)
-
-        # final attention pooling queries
-
-        self.attn_pool_queries = nn.Parameter(torch.randn(dim))
-        self.attn_pool = Attention(dim = dim, dim_head = dim_head, heads = heads)
-
-        # output to logits
-
-        self.to_latent = nn.Identity()
-
-        self.mlp_head = nn.Sequential(
-            LayerNorm(dim),
-            nn.Linear(dim, num_classes, bias = False)
-        )
-
-    @property
-    def device(self):
-        return next(self.parameters()).device
-
-    def forward(
-        self,
-        batched_images: List[List[Tensor]] # assume different resolution images already grouped correctly
-    ):
-        p, c, device = self.patch_size, self.channels, self.device
-
-        arange = partial(torch.arange, device = device)
-        pad_sequence = partial(orig_pad_sequence, batch_first = True)
-
-        # process images into variable lengthed sequences with attention mask
-
-        num_images = []
-        batched_sequences = []
-        batched_positions = []
-        batched_image_ids = []
-
-        for images in batched_images:
-            num_images.append(len(images))
-
-            sequences = []
-            positions = []
-            image_ids = torch.empty((0,), device = device, dtype = torch.long)
-
-            for image_id, image in enumerate(images):
-                assert image.ndim ==3 and image.shape[0] == c
-                image_dims = image.shape[-2:]
-                assert all([divisible_by(dim, p) for dim in image_dims]), f'height and width {image_dims} of images must be divisible by patch size {p}'
-
-                ph, pw = map(lambda dim: dim // p, image_dims)
-
-                pos = torch.stack(torch.meshgrid((
-                    arange(ph),
-                    arange(pw)
-                ), indexing = 'ij'), dim = -1)
-
-                pos = rearrange(pos, 'h w c -> (h w) c')
-                seq = rearrange(image, 'c (h p1) (w p2) -> (h w) (c p1 p2)', p1 = p, p2 = p)
-
-                image_ids = F.pad(image_ids, (0, seq.shape[-2]), value = image_id)
-                sequences.append(seq)
-                positions.append(pos)
-
-            batched_image_ids.append(image_ids)
-            batched_sequences.append(torch.cat(sequences, dim = 0))
-            batched_positions.append(torch.cat(positions, dim = 0))
-
-        # derive key padding mask
-
-        lengths = torch.tensor([seq.shape[-2] for seq in batched_sequences], device = device, dtype = torch.long)
-        max_length = arange(lengths.amax().item())
-        key_pad_mask = rearrange(lengths, 'b -> b 1') <= rearrange(max_length, 'n -> 1 n')
-
-        # derive attention mask, and combine with key padding mask from above
-
-        batched_image_ids = pad_sequence(batched_image_ids)
-        attn_mask = rearrange(batched_image_ids, 'b i -> b 1 i 1') == rearrange(batched_image_ids, 'b j -> b 1 1 j')
-        attn_mask = attn_mask & rearrange(key_pad_mask, 'b j -> b 1 1 j')
-
-        # combine patched images as well as the patched width / height positions for 2d positional embedding
-
-        patches = pad_sequence(batched_sequences)
-        patch_positions = pad_sequence(batched_positions)
-
-        # need to know how many images for final attention pooling
-
-        num_images = torch.tensor(num_images, device = device, dtype = torch.long)        
-
-        # to patches
-
-        x = self.to_patch_embedding(patches)        
-
-        # factorized 2d absolute positional embedding
-
-        h_indices, w_indices = patch_positions.unbind(dim = -1)
-
-        h_pos = self.pos_embed_height[h_indices]
-        w_pos = self.pos_embed_width[w_indices]
-
-        x = x + h_pos + w_pos
-
-        # embed dropout
-
-        x = self.dropout(x)
-
-        # attention
-
-        x = self.transformer(x, attn_mask = attn_mask)
-
-        # do attention pooling at the end
-
-        max_queries = num_images.amax().item()
-
-        queries = repeat(self.attn_pool_queries, 'd -> b n d', n = max_queries, b = x.shape[0])
-
-        # attention pool mask
-
-        image_id_arange = arange(max_queries)
-
-        attn_pool_mask = rearrange(image_id_arange, 'i -> i 1') == rearrange(batched_image_ids, 'b j -> b 1 j')
-
-        attn_pool_mask = attn_pool_mask & rearrange(key_pad_mask, 'b j -> b 1 j')
-
-        attn_pool_mask = rearrange(attn_pool_mask, 'b i j -> b 1 i j')
-
-        # attention pool
-
-        x = self.attn_pool(queries, context = x, attn_mask = attn_pool_mask) + queries
-
-        x = rearrange(x, 'b n d -> (b n) d')
-
-        # each batch element may not have same amount of images
-
-        is_images = image_id_arange < rearrange(num_images, 'b -> b 1')
-        is_images = rearrange(is_images, 'b n -> (b n)')
-
-        x = x[is_images]
-
-        # project out to logits
-
-        x = self.to_latent(x)
-
-        return self.mlp_head(x)
--- a/vit_pytorch/nest.py
+++ b/vit_pytorch/nest.py
@@ -144,9 +144,7 @@ class NesT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (p1 p2 c) h w', p1 = patch_size, p2 = patch_size),
-            LayerNorm(patch_dim),
            nn.Conv2d(patch_dim, layer_dims[0], 1),
-            LayerNorm(layer_dims[0])
        )

        block_repeats = cast_tuple(block_repeats, num_hierarchies)
--- a/vit_pytorch/simmim.py
+++ b/vit_pytorch/simmim.py
@@ -18,11 +18,8 @@ class SimMIM(nn.Module):

        self.encoder = encoder
        num_patches, encoder_dim = encoder.pos_embedding.shape[-2:]
-
-        self.to_patch = encoder.to_patch_embedding[0]
-        self.patch_to_emb = nn.Sequential(*encoder.to_patch_embedding[1:])
-
-        pixel_values_per_patch = encoder.to_patch_embedding[2].weight.shape[-1]
+        self.to_patch, self.patch_to_emb = encoder.to_patch_embedding[:2]
+        pixel_values_per_patch = self.patch_to_emb.weight.shape[-1]

        # simple linear head

--- a/vit_pytorch/simple_flash_attn_vit.py
+++ b/vit_pytorch/simple_flash_attn_vit.py
@@ -1,176 +0,0 @@
-from collections import namedtuple
-from packaging import version
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-from einops import rearrange
-from einops.layers.torch import Rearrange
-
-# constants
-
-Config = namedtuple('FlashAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient'])
-
-# helpers
-
-def pair(t):
-    return t if isinstance(t, tuple) else (t, t)
-
-def posemb_sincos_2d(patches, temperature = 10000, dtype = torch.float32):
-    _, h, w, dim, device, dtype = *patches.shape, patches.device, patches.dtype
-
-    y, x = torch.meshgrid(torch.arange(h, device = device), torch.arange(w, device = device), indexing = 'ij')
-    assert (dim % 4) == 0, 'feature dimension must be multiple of 4 for sincos emb'
-    omega = torch.arange(dim // 4, device = device) / (dim // 4 - 1)
-    omega = 1. / (temperature ** omega)
-
-    y = y.flatten()[:, None] * omega[None, :]
-    x = x.flatten()[:, None] * omega[None, :] 
-    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim = 1)
-    return pe.type(dtype)
-
-# main class
-
-class Attend(nn.Module):
-    def __init__(self, use_flash = False):
-        super().__init__()
-        self.use_flash = use_flash
-        assert not (use_flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'
-
-        # determine efficient attention configs for cuda and cpu
-
-        self.cpu_config = Config(True, True, True)
-        self.cuda_config = None
-
-        if not torch.cuda.is_available() or not use_flash:
-            return
-
-        device_properties = torch.cuda.get_device_properties(torch.device('cuda'))
-
-        if device_properties.major == 8 and device_properties.minor == 0:
-            self.cuda_config = Config(True, False, False)
-        else:
-            self.cuda_config = Config(False, True, True)
-
-    def flash_attn(self, q, k, v):
-        config = self.cuda_config if q.is_cuda else self.cpu_config
-
-        # flash attention - https://arxiv.org/abs/2205.14135
-        
-        with torch.backends.cuda.sdp_kernel(**config._asdict()):
-            out = F.scaled_dot_product_attention(q, k, v)
-
-        return out
-
-    def forward(self, q, k, v):
-        n, device, scale = q.shape[-2], q.device, q.shape[-1] ** -0.5
-
-        if self.use_flash:
-            return self.flash_attn(q, k, v)
-
-        # similarity
-
-        sim = einsum("b h i d, b j d -> b h i j", q, k) * scale
-
-        # attention
-
-        attn = sim.softmax(dim=-1)
-
-        # aggregate values
-
-        out = einsum("b h i j, b j d -> b h i d", attn, v)
-
-        return out
-
-# classes
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, hidden_dim):
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, hidden_dim),
-            nn.GELU(),
-            nn.Linear(hidden_dim, dim),
-        )
-    def forward(self, x):
-        return self.net(x)
-
-class Attention(nn.Module):
-    def __init__(self, dim, heads = 8, dim_head = 64, use_flash = True):
-        super().__init__()
-        inner_dim = dim_head *  heads
-        self.heads = heads
-        self.scale = dim_head ** -0.5
-        self.norm = nn.LayerNorm(dim)
-
-        self.attend = Attend(use_flash = use_flash)
-
-        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
-        self.to_out = nn.Linear(inner_dim, dim, bias = False)
-
-    def forward(self, x):
-        x = self.norm(x)
-
-        qkv = self.to_qkv(x).chunk(3, dim = -1)
-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)
-
-        out = self.attend(q, k, v)
-
-        out = rearrange(out, 'b h n d -> b n (h d)')
-        return self.to_out(out)
-
-class Transformer(nn.Module):
-    def __init__(self, dim, depth, heads, dim_head, mlp_dim, use_flash):
-        super().__init__()
-        self.layers = nn.ModuleList([])
-        for _ in range(depth):
-            self.layers.append(nn.ModuleList([
-                Attention(dim, heads = heads, dim_head = dim_head, use_flash = use_flash),
-                FeedForward(dim, mlp_dim)
-            ]))
-    def forward(self, x):
-        for attn, ff in self.layers:
-            x = attn(x) + x
-            x = ff(x) + x
-        return x
-
-class SimpleViT(nn.Module):
-    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, use_flash = True):
-        super().__init__()
-        image_height, image_width = pair(image_size)
-        patch_height, patch_width = pair(patch_size)
-
-        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'
-
-        num_patches = (image_height // patch_height) * (image_width // patch_width)
-        patch_dim = channels * patch_height * patch_width
-
-        self.to_patch_embedding = nn.Sequential(
-            Rearrange('b c (h p1) (w p2) -> b h w (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
-            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
-        )
-
-        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, use_flash)
-
-        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
-
-    def forward(self, img):
-        *_, h, w, dtype = *img.shape, img.dtype
-
-        x = self.to_patch_embedding(img)
-        pe = posemb_sincos_2d(x)
-        x = rearrange(x, 'b ... d -> b (...) d') + pe
-
-        x = self.transformer(x)
-        x = x.mean(dim = 1)
-
-        x = self.to_latent(x)
-        return self.linear_head(x)
--- a/vit_pytorch/simple_vit.py
+++ b/vit_pytorch/simple_vit.py
@@ -9,15 +9,17 @@ from einops.layers.torch import Rearrange
 def pair(t):
    return t if isinstance(t, tuple) else (t, t)

-def posemb_sincos_2d(h, w, dim, temperature: int = 10000, dtype = torch.float32):
-    y, x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
-    assert (dim % 4) == 0, "feature dimension must be multiple of 4 for sincos emb"
-    omega = torch.arange(dim // 4) / (dim // 4 - 1)
-    omega = 1.0 / (temperature ** omega)
+def posemb_sincos_2d(patches, temperature = 10000, dtype = torch.float32):
+    _, h, w, dim, device, dtype = *patches.shape, patches.device, patches.dtype
+
+    y, x = torch.meshgrid(torch.arange(h, device = device), torch.arange(w, device = device), indexing = 'ij')
+    assert (dim % 4) == 0, 'feature dimension must be multiple of 4 for sincos emb'
+    omega = torch.arange(dim // 4, device = device) / (dim // 4 - 1)
+    omega = 1. / (temperature ** omega)

    y = y.flatten()[:, None] * omega[None, :]
-    x = x.flatten()[:, None] * omega[None, :]
-    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim=1)
+    x = x.flatten()[:, None] * omega[None, :] 
+    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim = 1)
    return pe.type(dtype)

 # classes
@@ -84,21 +86,14 @@ class SimpleViT(nn.Module):

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

+        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width

        self.to_patch_embedding = nn.Sequential(
-            Rearrange("b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
+            Rearrange('b c (h p1) (w p2) -> b h w (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

-        self.pos_embedding = posemb_sincos_2d(
-            h = image_height // patch_height,
-            w = image_width // patch_width,
-            dim = dim,
-        ) 
-
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.to_latent = nn.Identity()
@@ -106,13 +101,13 @@ class SimpleViT(nn.Module):
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )
-        self.pool = "mean"

    def forward(self, img):
-        device = img.device
+        *_, h, w, dtype = *img.shape, img.dtype

        x = self.to_patch_embedding(img)
-        x += self.pos_embedding.to(device, dtype=x.dtype)
+        pe = posemb_sincos_2d(x)
+        x = rearrange(x, 'b ... d -> b (...) d') + pe

        x = self.transformer(x)
        x = x.mean(dim = 1)
--- a/vit_pytorch/simple_vit_1d.py
+++ b/vit_pytorch/simple_vit_1d.py
@@ -85,9 +85,7 @@ class SimpleViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (n p) -> b n (p c)', p = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)
--- a/vit_pytorch/simple_vit_3d.py
+++ b/vit_pytorch/simple_vit_3d.py
@@ -103,9 +103,7 @@ class SimpleViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (f pf) (h p1) (w p2) -> b f h w (p1 p2 pf c)', p1 = patch_height, p2 = patch_width, pf = frame_patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)
--- a/vit_pytorch/simple_vit_with_patch_dropout.py
+++ b/vit_pytorch/simple_vit_with_patch_dropout.py
@@ -1,143 +0,0 @@
-import torch
-from torch import nn
-
-from einops import rearrange
-from einops.layers.torch import Rearrange
-
-# helpers
-
-def pair(t):
-    return t if isinstance(t, tuple) else (t, t)
-
-def posemb_sincos_2d(patches, temperature = 10000, dtype = torch.float32):
-    _, h, w, dim, device, dtype = *patches.shape, patches.device, patches.dtype
-
-    y, x = torch.meshgrid(torch.arange(h, device = device), torch.arange(w, device = device), indexing = 'ij')
-    assert (dim % 4) == 0, 'feature dimension must be multiple of 4 for sincos emb'
-    omega = torch.arange(dim // 4, device = device) / (dim // 4 - 1)
-    omega = 1. / (temperature ** omega)
-
-    y = y.flatten()[:, None] * omega[None, :]
-    x = x.flatten()[:, None] * omega[None, :] 
-    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim = 1)
-    return pe.type(dtype)
-
-# patch dropout
-
-class PatchDropout(nn.Module):
-    def __init__(self, prob):
-        super().__init__()
-        assert 0 <= prob < 1.
-        self.prob = prob
-
-    def forward(self, x):
-        if not self.training or self.prob == 0.:
-            return x
-
-        b, n, _, device = *x.shape, x.device
-
-        batch_indices = torch.arange(b, device = device)
-        batch_indices = rearrange(batch_indices, '... -> ... 1')
-        num_patches_keep = max(1, int(n * (1 - self.prob)))
-        patch_indices_keep = torch.randn(b, n, device = device).topk(num_patches_keep, dim = -1).indices
-
-        return x[batch_indices, patch_indices_keep]
-
-# classes
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, hidden_dim):
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, hidden_dim),
-            nn.GELU(),
-            nn.Linear(hidden_dim, dim),
-        )
-    def forward(self, x):
-        return self.net(x)
-
-class Attention(nn.Module):
-    def __init__(self, dim, heads = 8, dim_head = 64):
-        super().__init__()
-        inner_dim = dim_head *  heads
-        self.heads = heads
-        self.scale = dim_head ** -0.5
-        self.norm = nn.LayerNorm(dim)
-
-        self.attend = nn.Softmax(dim = -1)
-
-        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
-        self.to_out = nn.Linear(inner_dim, dim, bias = False)
-
-    def forward(self, x):
-        x = self.norm(x)
-
-        qkv = self.to_qkv(x).chunk(3, dim = -1)
-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)
-
-        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
-
-        attn = self.attend(dots)
-
-        out = torch.matmul(attn, v)
-        out = rearrange(out, 'b h n d -> b n (h d)')
-        return self.to_out(out)
-
-class Transformer(nn.Module):
-    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
-        super().__init__()
-        self.layers = nn.ModuleList([])
-        for _ in range(depth):
-            self.layers.append(nn.ModuleList([
-                Attention(dim, heads = heads, dim_head = dim_head),
-                FeedForward(dim, mlp_dim)
-            ]))
-    def forward(self, x):
-        for attn, ff in self.layers:
-            x = attn(x) + x
-            x = ff(x) + x
-        return x
-
-class SimpleViT(nn.Module):
-    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, patch_dropout = 0.5):
-        super().__init__()
-        image_height, image_width = pair(image_size)
-        patch_height, patch_width = pair(patch_size)
-
-        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'
-
-        num_patches = (image_height // patch_height) * (image_width // patch_width)
-        patch_dim = channels * patch_height * patch_width
-
-        self.to_patch_embedding = nn.Sequential(
-            Rearrange('b c (h p1) (w p2) -> b h w (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
-            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
-        )
-
-        self.patch_dropout = PatchDropout(patch_dropout)
-
-        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)
-
-        self.to_latent = nn.Identity()
-        self.linear_head = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Linear(dim, num_classes)
-        )
-
-    def forward(self, img):
-        *_, h, w, dtype = *img.shape, img.dtype
-
-        x = self.to_patch_embedding(img)
-        pe = posemb_sincos_2d(x)
-        x = rearrange(x, 'b ... d -> b (...) d') + pe
-
-        x = self.patch_dropout(x)
-
-        x = self.transformer(x)
-        x = x.mean(dim = 1)
-
-        x = self.to_latent(x)
-        return self.linear_head(x)
--- a/vit_pytorch/twins_svt.py
+++ b/vit_pytorch/twins_svt.py
@@ -71,12 +71,7 @@ class PatchEmbedding(nn.Module):
        self.dim = dim
        self.dim_out = dim_out
        self.patch_size = patch_size
-
-        self.proj = nn.Sequential(
-            LayerNorm(patch_size ** 2 * dim),
-            nn.Conv2d(patch_size ** 2 * dim, dim_out, 1),
-            LayerNorm(dim_out)
-        )
+        self.proj = nn.Conv2d(patch_size ** 2 * dim, dim_out, 1)

    def forward(self, fmap):
        p = self.patch_size
--- a/vit_pytorch/vit.py
+++ b/vit_pytorch/vit.py
@@ -93,9 +93,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/vit_1d.py
+++ b/vit_pytorch/vit_1d.py
@@ -84,9 +84,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (n p) -> b n (p c)', p = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/vit_3d.py
+++ b/vit_pytorch/vit_3d.py
@@ -95,9 +95,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (f pf) (h p1) (w p2) -> b (f h w) (p1 p2 pf c)', p1 = patch_height, p2 = patch_width, pf = frame_patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/vit_with_patch_merger.py
+++ b/vit_pytorch/vit_with_patch_merger.py
@@ -121,9 +121,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/vivit.py
+++ b/vit_pytorch/vivit.py
@@ -120,9 +120,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (f pf) (h p1) (w p2) -> b f (h w) (p1 p2 pf c)', p1 = patch_height, p2 = patch_width, pf = frame_patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_frame_patches, num_image_patches, dim))
@@ -146,7 +144,7 @@ class ViT(nn.Module):
        x = self.to_patch_embedding(video)
        b, f, n, _ = x.shape

-        x = x + self.pos_embedding[:, :f, :n]
+        x = x + self.pos_embedding

        if exists(self.spatial_cls_token):
            spatial_cls_tokens = repeat(self.spatial_cls_token, '1 1 d -> b f 1 d', b = b, f = f)