Compare commits

...

3 Commits
0.8.0 ... 0.9.2

Author SHA1 Message Date
Phil Wang
2d413a392c remove masking, as it complicates with little benefit 2021-03-23 12:16:32 -07:00
Phil Wang
b900850144 add deep vit 2021-03-23 11:57:13 -07:00
Phil Wang
78489045cd readme 2021-03-09 19:23:09 -08:00
6 changed files with 90 additions and 47 deletions

View File

@@ -117,6 +117,33 @@ v = v.to_vit()
type(v) # <class 'vit_pytorch.vit_pytorch.ViT'>
```
## Deep ViT
This <a href="https://arxiv.org/abs/2103.11886">paper</a> notes that ViT struggles to attend at greater depths (past 12 layers), and suggests mixing the attention of each head post-softmax as a solution, dubbed Re-attention. The results line up with the <a href="https://github.com/lucidrains/x-transformers#talking-heads-attention">Talking Heads</a> paper from NLP.
You can use it as follows
```python
import torch
from vit_pytorch.deepvit import DeepViT
v = DeepViT(
image_size = 256,
patch_size = 32,
num_classes = 1000,
dim = 1024,
depth = 6,
heads = 16,
mlp_dim = 2048,
dropout = 0.1,
emb_dropout = 0.1
)
img = torch.randn(1, 3, 256, 256)
preds = v(img) # (1, 1000)
```
## Token-to-Token ViT
<img src="./t2t.png" width="400px"></img>
@@ -150,15 +177,17 @@ import torch
from vit_pytorch import ViT
from vit_pytorch.mpp import MPP
model = ViT(image_size=256,
patch_size=32,
num_classes=1000,
dim=1024,
depth=6,
heads=8,
mlp_dim=2048,
dropout=0.1,
emb_dropout=0.1)
model = ViT(
image_size=256,
patch_size=32,
num_classes=1000,
dim=1024,
depth=6,
heads=8,
mlp_dim=2048,
dropout=0.1,
emb_dropout=0.1
)
mpp_trainer = MPP(
transformer=model,
@@ -171,11 +200,9 @@ mpp_trainer = MPP(
opt = torch.optim.Adam(mpp_trainer.parameters(), lr=3e-4)
def sample_unlabelled_images():
return torch.randn(20, 3, 256, 256)
for _ in range(100):
images = sample_unlabelled_images()
loss = mpp_trainer(images)
@@ -345,12 +372,23 @@ Coming from computer vision and new to transformers? Here are some resources tha
```bibtex
@misc{yuan2021tokenstotoken,
title = {Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet},
author = {Li Yuan and Yunpeng Chen and Tao Wang and Weihao Yu and Yujun Shi and Francis EH Tay and Jiashi Feng and Shuicheng Yan},
year = {2021},
eprint = {2101.11986},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
title = {Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet},
author = {Li Yuan and Yunpeng Chen and Tao Wang and Weihao Yu and Yujun Shi and Francis EH Tay and Jiashi Feng and Shuicheng Yan},
year = {2021},
eprint = {2101.11986},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
}
```
```bibtex
@misc{zhou2021deepvit,
title = {DeepViT: Towards Deeper Vision Transformer},
author = {Daquan Zhou and Bingyi Kang and Xiaojie Jin and Linjie Yang and Xiaochen Lian and Qibin Hou and Jiashi Feng},
year = {2021},
eprint = {2103.11886},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
}
```

View File

@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
setup(
name = 'vit-pytorch',
packages = find_packages(exclude=['examples']),
version = '0.8.0',
version = '0.9.2',
license='MIT',
description = 'Vision Transformer (ViT) - Pytorch',
author = 'Phil Wang',

View File

@@ -1 +1 @@
from vit_pytorch.vit_pytorch import ViT
from vit_pytorch.vit import ViT

View File

@@ -37,35 +37,41 @@ class Attention(nn.Module):
def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
super().__init__()
inner_dim = dim_head * heads
project_out = not (heads == 1 and dim_head == dim)
self.heads = heads
self.scale = dim_head ** -0.5
self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
self.reattn_weights = nn.Parameter(torch.randn(heads, heads))
self.reattn_norm = nn.Sequential(
Rearrange('b h i j -> b i j h'),
nn.LayerNorm(heads),
Rearrange('b i j h -> b h i j')
)
self.to_out = nn.Sequential(
nn.Linear(inner_dim, dim),
nn.Dropout(dropout)
) if project_out else nn.Identity()
)
def forward(self, x, mask = None):
def forward(self, x):
b, n, _, h = *x.shape, self.heads
qkv = self.to_qkv(x).chunk(3, dim = -1)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
# attention
dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
mask_value = -torch.finfo(dots.dtype).max
if mask is not None:
mask = F.pad(mask.flatten(1), (1, 0), value = True)
assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
mask = rearrange(mask, 'b i -> b () i ()') * rearrange(mask, 'b j -> b () () j')
dots.masked_fill_(~mask, mask_value)
del mask
attn = dots.softmax(dim=-1)
# re-attention
attn = einsum('b h i j, h g -> b g i j', attn, self.reattn_weights)
attn = self.reattn_norm(attn)
# aggregate and out
out = einsum('b h i j, b h j d -> b h i d', attn, v)
out = rearrange(out, 'b h n d -> b n (h d)')
out = self.to_out(out)
@@ -80,13 +86,13 @@ class Transformer(nn.Module):
Residual(PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout)))
]))
def forward(self, x, mask = None):
def forward(self, x):
for attn, ff in self.layers:
x = attn(x, mask = mask)
x = attn(x)
x = ff(x)
return x
class ViT(nn.Module):
class DeepViT(nn.Module):
def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
super().__init__()
assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
@@ -113,7 +119,7 @@ class ViT(nn.Module):
nn.Linear(dim, num_classes)
)
def forward(self, img, mask = None):
def forward(self, img):
x = self.to_patch_embedding(img)
b, n, _ = x.shape
@@ -122,7 +128,7 @@ class ViT(nn.Module):
x += self.pos_embedding[:, :(n + 1)]
x = self.dropout(x)
x = self.transformer(x, mask)
x = self.transformer(x)
x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

View File

@@ -1,7 +1,7 @@
import torch
import torch.nn.functional as F
from torch import nn
from vit_pytorch.vit_pytorch import ViT
from vit_pytorch.vit import ViT
from vit_pytorch.t2t import T2TViT
from vit_pytorch.efficient import ViT as EfficientViT
@@ -15,7 +15,7 @@ def exists(val):
# classes
class DistillMixin:
def forward(self, img, distill_token = None, mask = None):
def forward(self, img, distill_token = None):
distilling = exists(distill_token)
x = self.to_patch_embedding(img)
b, n, _ = x.shape
@@ -28,7 +28,7 @@ class DistillMixin:
distill_tokens = repeat(distill_token, '() n d -> b n d', b = b)
x = torch.cat((x, distill_tokens), dim = 1)
x = self._attend(x, mask)
x = self._attend(x)
if distilling:
x, distill_tokens = x[:, :-1], x[:, -1]
@@ -56,9 +56,9 @@ class DistillableViT(DistillMixin, ViT):
v.load_state_dict(self.state_dict())
return v
def _attend(self, x, mask):
def _attend(self, x):
x = self.dropout(x)
x = self.transformer(x, mask)
x = self.transformer(x)
return x
class DistillableT2TViT(DistillMixin, T2TViT):
@@ -74,7 +74,7 @@ class DistillableT2TViT(DistillMixin, T2TViT):
v.load_state_dict(self.state_dict())
return v
def _attend(self, x, mask):
def _attend(self, x):
x = self.dropout(x)
x = self.transformer(x)
return x
@@ -92,7 +92,7 @@ class DistillableEfficientViT(DistillMixin, EfficientViT):
v.load_state_dict(self.state_dict())
return v
def _attend(self, x, mask):
def _attend(self, x):
return self.transformer(x)
# knowledge distillation wrapper

View File

@@ -2,7 +2,7 @@ import math
import torch
from torch import nn
from vit_pytorch.vit_pytorch import Transformer
from vit_pytorch.vit import Transformer
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
@@ -24,8 +24,7 @@ class RearrangeImage(nn.Module):
# main class
class T2TViT(nn.Module):
def __init__(
self, *, image_size, num_classes, dim, depth = None, heads = None, mlp_dim = None, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0., transformer = None, t2t_layers = ((7, 4), (3, 2), (3, 2))):
def __init__(self, *, image_size, num_classes, dim, depth = None, heads = None, mlp_dim = None, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0., transformer = None, t2t_layers = ((7, 4), (3, 2), (3, 2))):
super().__init__()
assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'