be able to accept non-square patches, thanks to @FilipAndersson245

update readme
0.17.2
2025-12-30 08:02:29 +00:00 · 2021-05-01 20:04:41 -07:00 · 2021-05-01 11:51:35 -07:00 · 2021-04-30 06:44:59 -07:00 · 2021-04-30 06:44:54 -07:00 · 2021-04-30 06:44:41 -07:00
5 changed files with 80 additions and 16 deletions
--- a/README.md
+++ b/README.md
@@ -38,6 +38,7 @@ preds = v(img) # (1, 1000)
 ```

 ## Parameters
+
 - `image_size`: int.  
 Image size. If you have rectangular images, make sure your image size is the maximum of the width and height
 - `patch_size`: int.  
@@ -338,7 +339,7 @@ pred = v(img) # (1, 1000)

 <img src="./images/twins_svt.png" width="400px"></img>

-This <a href="https://arxiv.org/abs/2104.13840">paper</a> mixes local and global attention, along with position encoding generator (proposed in <a href="https://arxiv.org/abs/2102.10882">CPVT</a>) and global average pooling, to achieve the same results as <a href="https://arxiv.org/abs/2103.14030">Swin</a>, without the extra complexity of shifted windows, etc.
+This <a href="https://arxiv.org/abs/2104.13840">paper</a> proposes mixing local and global attention, along with a position encoding generator (proposed in <a href="https://arxiv.org/abs/2102.10882">CPVT</a>) and global average pooling, to achieve the same results as <a href="https://arxiv.org/abs/2103.14030">Swin</a>, without the extra complexity of shifted windows, CLS tokens, or positional embeddings.

 ```python
 import torch
@@ -583,6 +584,58 @@ img = torch.randn(1, 3, 224, 224)
 v(img) # (1, 1000)
 ```

+## FAQ
+
+- How do I pass in non-square images?
+
+You can already pass in non-square images - you just have to make sure your height and width are both less than or equal to the `image_size`, and both divisible by the `patch_size`.
+
+ex.
+
+```python
+import torch
+from vit_pytorch import ViT
+
+v = ViT(
+    image_size = 256,
+    patch_size = 32,
+    num_classes = 1000,
+    dim = 1024,
+    depth = 6,
+    heads = 16,
+    mlp_dim = 2048,
+    dropout = 0.1,
+    emb_dropout = 0.1
+)
+
+img = torch.randn(1, 3, 256, 128) # <-- not a square
+
+preds = v(img) # (1, 1000)
+```
+
+- How do I pass in non-square patches?
+
+```python
+import torch
+from vit_pytorch import ViT
+
+v = ViT(
+    num_classes = 1000,
+    image_size = (256, 128),  # image size is a tuple of (height, width)
+    patch_size = (32, 16),    # patch size is a tuple of (height, width)
+    dim = 1024,
+    depth = 6,
+    heads = 16,
+    mlp_dim = 2048,
+    dropout = 0.1,
+    emb_dropout = 0.1
+)
+
+img = torch.randn(1, 3, 256, 128)
+
+preds = v(img)
+```
+
 ## Resources

 Coming from computer vision and new to transformers? Here are some resources that greatly accelerated my learning.
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '0.17.0',
+  version = '0.17.3',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  author = 'Phil Wang',
--- a/vit_pytorch/rvt.py
+++ b/vit_pytorch/rvt.py
@@ -83,11 +83,11 @@ class GEGLU(nn.Module):
        return F.gelu(gates) * x

 class FeedForward(nn.Module):
-    def __init__(self, dim, hidden_dim, dropout = 0.):
+    def __init__(self, dim, hidden_dim, dropout = 0., use_glu = True):
        super().__init__()
        self.net = nn.Sequential(
-            nn.Linear(dim, hidden_dim * 2),
-            GEGLU(),
+            nn.Linear(dim, hidden_dim * 2 if use_glu else hidden_dim),
+            GEGLU() if use_glu else nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
@@ -154,14 +154,14 @@ class Attention(nn.Module):
        return self.to_out(out)

 class Transformer(nn.Module):
-    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0., use_rotary = True, use_ds_conv = True):
+    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0., use_rotary = True, use_ds_conv = True, use_glu = True):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.pos_emb = AxialRotaryEmbedding(dim_head)
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout, use_rotary = use_rotary, use_ds_conv = use_ds_conv)),
-                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
+                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout, use_glu = use_glu))
            ]))
    def forward(self, x, fmap_dims):
        pos_emb = self.pos_emb(x[:, 1:])
@@ -174,7 +174,7 @@ class Transformer(nn.Module):
 # Rotary Vision Transformer

 class RvT(nn.Module):
-    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0., use_rotary = True, use_ds_conv = True):
+    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0., use_rotary = True, use_ds_conv = True, use_glu = True):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
@@ -187,7 +187,7 @@ class RvT(nn.Module):
        )

        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
-        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout, use_rotary, use_ds_conv)
+        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout, use_rotary, use_ds_conv, use_glu)

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
--- a/vit_pytorch/twins_svt.py
+++ b/vit_pytorch/twins_svt.py
@@ -162,11 +162,11 @@ class Transformer(nn.Module):
                Residual(PreNorm(dim, FeedForward(dim, mlp_mult, dropout = dropout)))
            ]))
    def forward(self, x):
-        for local_attn, ff, global_attn, ff in self.layers:
+        for local_attn, ff1, global_attn, ff2 in self.layers:
            x = local_attn(x)
-            x = ff(x)
+            x = ff1(x)
            x = global_attn(x)
-            x = ff(x)
+            x = ff2(x)
        return x

 class TwinsSVT(nn.Module):
--- a/vit_pytorch/vit.py
+++ b/vit_pytorch/vit.py
@@ -5,6 +5,13 @@ import torch.nn.functional as F
 from einops import rearrange, repeat
 from einops.layers.torch import Rearrange

+# helpers
+
+def pair(t):
+    return t if isinstance(t, tuple) else (t, t)
+
+# classes
+
 class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
@@ -74,13 +81,17 @@ class Transformer(nn.Module):
 class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
-        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
-        num_patches = (image_size // patch_size) ** 2
-        patch_dim = channels * patch_size ** 2
+        image_height, image_width = pair(image_size)
+        patch_height, patch_width = pair(patch_size)
+
+        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'
+
+        num_patches = (image_height // patch_height) * (image_width // patch_width)
+        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
-            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
+            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.Linear(patch_dim, dim),
        )
Author	SHA1	Message	Date
Phil Wang	6549522629	be able to accept non-square patches, thanks to @FilipAndersson245	2021-05-01 20:04:41 -07:00
Phil Wang	6a80a4ef89	update readme	2021-05-01 11:51:35 -07:00
Phil Wang	9f05587a7d	0.17.2	2021-04-30 06:44:59 -07:00
Phil Wang	65bb350e85	0.17.2	2021-04-30 06:44:54 -07:00
Phil Wang	fd4a7dfcf8	Merge pull request #102 from jon-tow/rvt-add-use-glu-flag Add `use_glu` flag to `RvT`	2021-04-30 06:44:41 -07:00
Jonathan Tow	6f3a5fcf0b	Add `use_glu` flag to `RvT`	2021-04-30 02:07:41 -04:00
Phil Wang	7807f24509	fix small bug	2021-04-29 15:39:41 -07:00