need simple vit with patch dropout for another project

2025-12-30 16:12:29 +00:00 · 2022-12-05 10:47:02 -08:00
22 changed files with 27 additions and 63 deletions
--- a/README.md
+++ b/README.md
@@ -1883,18 +1883,6 @@ Coming from computer vision and new to transformers? Here are some resources tha
 }
 ```

-```bibtex
-@misc{https://doi.org/10.48550/arxiv.2302.01327,
-    doi     = {10.48550/ARXIV.2302.01327},
-    url     = {https://arxiv.org/abs/2302.01327},
-    author  = {Kumar, Manoj and Dehghani, Mostafa and Houlsby, Neil},
-    title   = {Dual PatchNorm},
-    publisher = {arXiv},
-    year    = {2023},
-    copyright = {Creative Commons Attribution 4.0 International}
-}
-```
-
 ```bibtex
@misc{vaswani2017attention,
    title   = {Attention Is All You Need},
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '1.0.2',
+  version = '0.40.2',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  long_description_content_type = 'text/markdown',
--- a/vit_pytorch/ats_vit.py
+++ b/vit_pytorch/ats_vit.py
@@ -230,9 +230,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/cait.py
+++ b/vit_pytorch/cait.py
@@ -150,9 +150,7 @@ class CaiT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, dim))
--- a/vit_pytorch/cross_vit.py
+++ b/vit_pytorch/cross_vit.py
@@ -186,9 +186,7 @@ class ImageEmbedder(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/deepvit.py
+++ b/vit_pytorch/deepvit.py
@@ -105,9 +105,7 @@ class DeepViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/efficient.py
+++ b/vit_pytorch/efficient.py
@@ -17,9 +17,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/learnable_memory_vit.py
+++ b/vit_pytorch/learnable_memory_vit.py
@@ -118,9 +118,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/local_vit.py
+++ b/vit_pytorch/local_vit.py
@@ -126,9 +126,7 @@ class LocalViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/mae.py
+++ b/vit_pytorch/mae.py
@@ -24,11 +24,8 @@ class MAE(nn.Module):

        self.encoder = encoder
        num_patches, encoder_dim = encoder.pos_embedding.shape[-2:]
-
-        self.to_patch = encoder.to_patch_embedding[0]
-        self.patch_to_emb = nn.Sequential(*encoder.to_patch_embedding[1:])
-
-        pixel_values_per_patch = encoder.to_patch_embedding[2].weight.shape[-1]
+        self.to_patch, self.patch_to_emb = encoder.to_patch_embedding[:2]
+        pixel_values_per_patch = self.patch_to_emb.weight.shape[-1]

        # decoder parameters
        self.decoder_dim = decoder_dim
--- a/vit_pytorch/nest.py
+++ b/vit_pytorch/nest.py
@@ -144,9 +144,7 @@ class NesT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (p1 p2 c) h w', p1 = patch_size, p2 = patch_size),
-            LayerNorm(patch_dim),
            nn.Conv2d(patch_dim, layer_dims[0], 1),
-            LayerNorm(layer_dims[0])
        )

        block_repeats = cast_tuple(block_repeats, num_hierarchies)
--- a/vit_pytorch/simmim.py
+++ b/vit_pytorch/simmim.py
@@ -18,11 +18,8 @@ class SimMIM(nn.Module):

        self.encoder = encoder
        num_patches, encoder_dim = encoder.pos_embedding.shape[-2:]
-
-        self.to_patch = encoder.to_patch_embedding[0]
-        self.patch_to_emb = nn.Sequential(*encoder.to_patch_embedding[1:])
-
-        pixel_values_per_patch = encoder.to_patch_embedding[2].weight.shape[-1]
+        self.to_patch, self.patch_to_emb = encoder.to_patch_embedding[:2]
+        pixel_values_per_patch = self.patch_to_emb.weight.shape[-1]

        # simple linear head

--- a/vit_pytorch/simple_vit.py
+++ b/vit_pytorch/simple_vit.py
@@ -22,6 +22,27 @@ def posemb_sincos_2d(patches, temperature = 10000, dtype = torch.float32):
    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim = 1)
    return pe.type(dtype)

+# patch dropout
+
+class PatchDropout(nn.Module):  # while training, keeps a random subset of positions along dim 1 of a 3-d input
+    def __init__(self, prob):  # prob: fraction of patches to drop; must satisfy 0 <= prob < 1
+        super().__init__()
+        assert 0 <= prob < 1.  # prob == 1 is rejected here; forward additionally always keeps at least one patch
+        self.prob = prob
+
+    def forward(self, x):  # returns x unchanged, or x with only the kept patches along dim 1
+        if not self.training or self.prob == 0.:  # no-op in eval mode or when nothing would be dropped
+            return x
+
+        b, n, _, device = *x.shape, x.device  # the unpacking requires x to have exactly three dimensions
+
+        batch_indices = torch.arange(b, device = device)
+        batch_indices = rearrange(batch_indices, '... -> ... 1')  # shape (b, 1), broadcasts against the (b, k) patch indices
+        num_patches_keep = max(1, int(n * (1 - self.prob)))  # truncated n * (1 - prob), never fewer than one
+        patch_indices_keep = torch.randn(b, n, device = device).topk(num_patches_keep, dim = -1).indices  # top-k of per-row Gaussian noise = distinct random indices per sample, ordered by noise rank rather than position
+
+        return x[batch_indices, patch_indices_keep]  # shape (b, num_patches_keep, last dim of x)
+
 # classes

 class FeedForward(nn.Module):
@@ -91,9 +112,7 @@ class SimpleViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b h w (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)
--- a/vit_pytorch/simple_vit_1d.py
+++ b/vit_pytorch/simple_vit_1d.py
@@ -85,9 +85,7 @@ class SimpleViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (n p) -> b n (p c)', p = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)
--- a/vit_pytorch/simple_vit_3d.py
+++ b/vit_pytorch/simple_vit_3d.py
@@ -103,9 +103,7 @@ class SimpleViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (f pf) (h p1) (w p2) -> b f h w (p1 p2 pf c)', p1 = patch_height, p2 = patch_width, pf = frame_patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)
--- a/vit_pytorch/simple_vit_with_patch_dropout.py
+++ b/vit_pytorch/simple_vit_with_patch_dropout.py
@@ -112,9 +112,7 @@ class SimpleViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b h w (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.patch_dropout = PatchDropout(patch_dropout)
--- a/vit_pytorch/twins_svt.py
+++ b/vit_pytorch/twins_svt.py
@@ -71,12 +71,7 @@ class PatchEmbedding(nn.Module):
        self.dim = dim
        self.dim_out = dim_out
        self.patch_size = patch_size
-
-        self.proj = nn.Sequential(
-            LayerNorm(patch_size ** 2 * dim),
-            nn.Conv2d(patch_size ** 2 * dim, dim_out, 1),
-            LayerNorm(dim_out)
-        )
+        self.proj = nn.Conv2d(patch_size ** 2 * dim, dim_out, 1)

    def forward(self, fmap):
        p = self.patch_size
--- a/vit_pytorch/vit.py
+++ b/vit_pytorch/vit.py
@@ -93,9 +93,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/vit_1d.py
+++ b/vit_pytorch/vit_1d.py
@@ -84,9 +84,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (n p) -> b n (p c)', p = patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/vit_3d.py
+++ b/vit_pytorch/vit_3d.py
@@ -95,9 +95,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (f pf) (h p1) (w p2) -> b (f h w) (p1 p2 pf c)', p1 = patch_height, p2 = patch_width, pf = frame_patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/vit_with_patch_merger.py
+++ b/vit_pytorch/vit_with_patch_merger.py
@@ -121,9 +121,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
--- a/vit_pytorch/vivit.py
+++ b/vit_pytorch/vivit.py
@@ -120,9 +120,7 @@ class ViT(nn.Module):

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (f pf) (h p1) (w p2) -> b f (h w) (p1 p2 pf c)', p1 = patch_height, p2 = patch_width, pf = frame_patch_size),
-            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
-            nn.LayerNorm(dim)
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_frame_patches, num_image_patches, dim))