Mirror of https://github.com/lucidrains/vit-pytorch.git, synced 2025-12-30 08:02:29 +00:00
fix a bug and add suggestion for BYOL pre-training
README.md | 50
@@ -30,6 +30,56 @@ img = torch.randn(1, 3, 256, 256)

preds = v(img) # (1, 1000)
```

## Suggestion

You can train this with a near-SOTA self-supervised learning technique, <a href="https://github.com/lucidrains/byol-pytorch">BYOL</a>, using the following code.

(1) Install the byol-pytorch package:

```bash
$ pip install byol-pytorch
```

(2) Wrap the ViT in a BYOL learner and run the self-supervised training loop:

```python
import torch
from vit_pytorch import ViT
from byol_pytorch import BYOL

model = ViT(
    image_size = 256,
    patch_size = 32,
    num_classes = 1000,
    dim = 1024,
    depth = 6,
    heads = 8,
    mlp_dim = 2048
)

learner = BYOL(
    model,
    image_size = 256,
    hidden_layer = 'to_cls_token'
)

opt = torch.optim.Adam(learner.parameters(), lr=3e-4)

def sample_unlabelled_images():
    return torch.randn(20, 3, 256, 256)

for _ in range(100):
    images = sample_unlabelled_images()
    loss = learner(images)
    opt.zero_grad()
    loss.backward()
    opt.step()
    learner.update_moving_average() # update moving average of target encoder

# save your improved network
torch.save(model.state_dict(), './pretrained-net.pt')
```

A pytorch-lightning script is ready for you to use at the repository linked above.
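
After pre-training, the saved weights can be loaded back into a ViT constructed with the same hyperparameters for supervised fine-tuning. A minimal sketch, assuming the `./pretrained-net.pt` checkpoint saved above; the fine-tuning step and its placeholder data are illustrative, not from the repository:

```python
import torch
import torch.nn.functional as F
from vit_pytorch import ViT

# rebuild the network with the same hyperparameters used during BYOL pre-training
model = ViT(
    image_size = 256,
    patch_size = 32,
    num_classes = 1000,
    dim = 1024,
    depth = 6,
    heads = 8,
    mlp_dim = 2048
)

# load the weights saved by the pre-training script above
model.load_state_dict(torch.load('./pretrained-net.pt'))

# one supervised fine-tuning step on placeholder images and labels
opt = torch.optim.Adam(model.parameters(), lr=3e-4)
images = torch.randn(8, 3, 256, 256)
labels = torch.randint(0, 1000, (8,))

loss = F.cross_entropy(model(images), labels)
opt.zero_grad()
loss.backward()
opt.step()
```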

## Citations

```bibtex
```
setup.py | 2
```diff
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
   name = 'vit-pytorch',
   packages = find_packages(),
-  version = '0.0.2',
+  version = '0.0.3',
   license='MIT',
   description = 'Vision Transformer (ViT) - Pytorch',
   author = 'Phil Wang',
```
vit_pytorch/vit_pytorch.py

```diff
@@ -77,6 +77,8 @@ class ViT(nn.Module):
         self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
         self.transformer = Transformer(dim, depth, heads, mlp_dim)
 
+        self.to_cls_token = nn.Identity()
+
         self.mlp_head = nn.Sequential(
             nn.Linear(dim, mlp_dim),
             nn.GELU(),
```
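
The `to_cls_token = nn.Identity()` layer exists so the pooled cls-token representation passes through a named submodule, which BYOL's `hidden_layer = 'to_cls_token'` argument can then locate and capture. A minimal sketch of the underlying mechanism using a standard PyTorch forward hook (the `capture` helper and shapes are illustrative, not from byol-pytorch):

```python
import torch
from vit_pytorch import ViT

model = ViT(image_size = 256, patch_size = 32, num_classes = 1000,
            dim = 1024, depth = 6, heads = 8, mlp_dim = 2048)

hidden = {}

def capture(module, inputs, output):
    # forward hook: stash the cls-token representation as it
    # flows through the nn.Identity layer
    hidden['cls'] = output

# look up the named submodule, as hidden_layer = 'to_cls_token' would
layer = dict(model.named_modules())['to_cls_token']
layer.register_forward_hook(capture)

model(torch.randn(2, 3, 256, 256))
print(hidden['cls'].shape)  # torch.Size([2, 1024]): the representation BYOL trains on
```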
```diff
@@ -88,8 +90,11 @@ class ViT(nn.Module):
 
         x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
         x = self.patch_to_embedding(x)
-        x = torch.cat((self.cls_token, x), dim=1)
+
+        cls_tokens = self.cls_token.expand(img.shape[0], -1, -1)
+        x = torch.cat((cls_tokens, x), dim=1)
         x += self.pos_embedding
         x = self.transformer(x)
 
-        return self.mlp_head(x[:, 0])
+        x = self.to_cls_token(x[:, 0])
+        return self.mlp_head(x)
```
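
The bug being fixed: `self.cls_token` has shape `(1, 1, dim)`, and `torch.cat` does not broadcast, so concatenating it directly onto a batch of patch embeddings fails for any batch size greater than 1. Expanding it across the batch dimension first (a view, no copy) makes the concatenation valid. A standalone sketch of the difference, with illustrative shapes:

```python
import torch

b, n, dim = 4, 64, 1024
cls_token = torch.randn(1, 1, dim)   # learned token, shape (1, 1, dim)
x = torch.randn(b, n, dim)           # batch of patch embeddings

# before the fix: torch.cat requires matching sizes in all non-cat
# dimensions, so this raises a RuntimeError whenever b > 1
try:
    torch.cat((cls_token, x), dim=1)
except RuntimeError as e:
    print('fails:', e)

# after the fix: expand replicates the token across the batch, then cat works
cls_tokens = cls_token.expand(b, -1, -1)   # (b, 1, dim)
out = torch.cat((cls_tokens, x), dim=1)    # (b, n + 1, dim)
print(out.shape)
```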