Allow training on images of different sizes, provided they are smaller than the image_size keyword argument passed on init

2025-12-30 08:02:29 +00:00 · 2020-10-25 13:17:42 -07:00
parent 6d1df1a970
commit 7a214d7109
3 changed files with 7 additions and 5 deletions
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '0.2.5',
+  version = '0.2.6',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  author = 'Phil Wang',
--- a/vit_pytorch/efficient.py
+++ b/vit_pytorch/efficient.py
@@ -30,10 +30,11 @@ class ViT(nn.Module):

        x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
        x = self.patch_to_embedding(x)
+        b, n, _ = x.shape

-        cls_tokens = self.cls_token.expand(img.shape[0], -1, -1)
+        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
-        x += self.pos_embedding
+        x += self.pos_embedding[:, :(n + 1)]
        x = self.transformer(x)

        x = self.to_cls_token(x[:, 0])
--- a/vit_pytorch/vit_pytorch.py
+++ b/vit_pytorch/vit_pytorch.py
@@ -113,10 +113,11 @@ class ViT(nn.Module):

        x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
        x = self.patch_to_embedding(x)
+        b, n, _ = x.shape

-        cls_tokens = self.cls_token.expand(img.shape[0], -1, -1)
+        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
-        x += self.pos_embedding
+        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x, mask)