diff --git a/README.md b/README.md index 7b9146f..dee0421 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,10 @@ labels = torch.randint(0, 1000, (2,)) loss = distiller(img, labels) loss.backward() + +# after lots of training above ... + +pred = v(img) # (2, 1000) ``` The `DistillableViT` class is identical to `ViT` except for how the forward pass is handled, so you should be able to load the parameters back to `ViT` after you have completed distillation training. diff --git a/setup.py b/setup.py index 97efc64..bc01f11 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name = 'vit-pytorch', packages = find_packages(exclude=['examples']), - version = '0.6.5', + version = '0.6.6', license='MIT', description = 'Vision Transformer (ViT) - Pytorch', author = 'Phil Wang', diff --git a/vit_pytorch/distill.py b/vit_pytorch/distill.py index 2864dbc..b199713 100644 --- a/vit_pytorch/distill.py +++ b/vit_pytorch/distill.py @@ -14,8 +14,8 @@ def exists(val): # classes class DistillMixin: - def forward(self, img, distill_token, mask = None): - p = self.patch_size + def forward(self, img, distill_token = None, mask = None): + p, distilling = self.patch_size, exists(distill_token) x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p, p2 = p) x = self.patch_to_embedding(x) @@ -25,16 +25,24 @@ class DistillMixin: x = torch.cat((cls_tokens, x), dim = 1) x += self.pos_embedding[:, :(n + 1)] - distill_tokens = repeat(distill_token, '() n d -> b n d', b = b) - x = torch.cat((x, distill_tokens), dim = 1) + if distilling: + distill_tokens = repeat(distill_token, '() n d -> b n d', b = b) + x = torch.cat((x, distill_tokens), dim = 1) x = self._attend(x, mask) - x, distill_tokens = x[:, :-1], x[:, -1] + if distilling: + x, distill_tokens = x[:, :-1], x[:, -1] + x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0] x = self.to_latent(x) - return self.mlp_head(x), distill_tokens + out = self.mlp_head(x) + + if distilling: + return out, distill_tokens + + return out class DistillableViT(DistillMixin, ViT): def __init__(self, *args, **kwargs):