allow for DistillableViT to still run predictions

Phil Wang
2021-01-11 10:49:14 -08:00
parent 1106a2ba88
commit e8ca6038c9
3 changed files with 19 additions and 7 deletions

README.md

@@ -102,6 +102,10 @@ labels = torch.randint(0, 1000, (2,))
 loss = distiller(img, labels)
 loss.backward()
+
+# after lots of training above ...
+
+pred = v(img) # (2, 1000)
 ```
The `DistillableViT` class is identical to `ViT` except for how the forward pass is handled, so you should be able to load the parameters back to `ViT` after you have completed distillation training.
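As a concrete sketch of that hand-off (the hyperparameters below are placeholders; reuse whatever configuration you actually trained with), the distilled weights can be copied over through the state dict:

```python
import torch
from vit_pytorch import ViT
from vit_pytorch.distill import DistillableViT

# placeholder hyperparameters -- both models must be built identically
config = dict(
    image_size = 256, patch_size = 32, num_classes = 1000,
    dim = 1024, depth = 6, heads = 8, mlp_dim = 2048
)

v = DistillableViT(**config) # the model you distilled
plain = ViT(**config)        # a fresh, plain ViT

# the parameter names line up between the two classes,
# so the state dict should transfer directly
plain.load_state_dict(v.state_dict())

pred = plain(torch.randn(2, 3, 256, 256)) # (2, 1000)
```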

setup.py

@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
   name = 'vit-pytorch',
   packages = find_packages(exclude=['examples']),
-  version = '0.6.5',
+  version = '0.6.6',
   license='MIT',
   description = 'Vision Transformer (ViT) - Pytorch',
   author = 'Phil Wang',

vit_pytorch/distill.py

@@ -14,8 +14,8 @@ def exists(val):
 # classes

 class DistillMixin:
-    def forward(self, img, distill_token, mask = None):
-        p = self.patch_size
+    def forward(self, img, distill_token = None, mask = None):
+        p, distilling = self.patch_size, exists(distill_token)

         x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p)
         x = self.patch_to_embedding(x)
@@ -25,16 +25,24 @@ class DistillMixin:
         x = torch.cat((cls_tokens, x), dim = 1)
         x += self.pos_embedding[:, :(n + 1)]

-        distill_tokens = repeat(distill_token, '() n d -> b n d', b = b)
-        x = torch.cat((x, distill_tokens), dim = 1)
+        if distilling:
+            distill_tokens = repeat(distill_token, '() n d -> b n d', b = b)
+            x = torch.cat((x, distill_tokens), dim = 1)

         x = self._attend(x, mask)

-        x, distill_tokens = x[:, :-1], x[:, -1]
+        if distilling:
+            x, distill_tokens = x[:, :-1], x[:, -1]

         x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

         x = self.to_latent(x)
-        return self.mlp_head(x), distill_tokens
+
+        out = self.mlp_head(x)
+
+        if distilling:
+            return out, distill_tokens
+
+        return out

 class DistillableViT(DistillMixin, ViT):
     def __init__(self, *args, **kwargs):
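With the change above, the distillation token becomes optional: omit it and the forward pass skips the token concatenation and returns logits alone, just like `ViT`; pass it (as `DistillWrapper` does during training) and the distillation token embeddings are returned as well. A minimal sketch of both call paths, with illustrative hyperparameters and a hand-made token:

```python
import torch
from vit_pytorch.distill import DistillableViT

v = DistillableViT(
    image_size = 256, patch_size = 32, num_classes = 1000,
    dim = 1024, depth = 6, heads = 8, mlp_dim = 2048
)

img = torch.randn(2, 3, 256, 256)

# no distillation token: plain ViT behavior, logits only
pred = v(img) # (2, 1000)

# with a token: also returns the distilled token embeddings
token = torch.randn(1, 1, 1024) # (1, num distill tokens, dim)
pred, distill_tokens = v(img, distill_token = token)
```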