correct need for post-attention dropout

Phil Wang
2022-03-30 10:50:57 -07:00
parent 6d7298d8ad
commit 4e6a42a0ca
20 changed files with 61 additions and 2 deletions

@@ -42,6 +42,8 @@ class Attention(nn.Module):
         self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
+        self.dropout = nn.Dropout(dropout)
         self.reattn_weights = nn.Parameter(torch.randn(heads, heads))
         self.reattn_norm = nn.Sequential(
@@ -64,6 +66,7 @@ class Attention(nn.Module):
         dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
         attn = dots.softmax(dim=-1)
+        attn = self.dropout(attn)
         # re-attention
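
For context, below is a minimal sketch of the pattern this diff applies: dropout on the attention map right after the softmax, before the re-attention step. It is not the repo's exact file; the class name `ReAttention` and the surrounding boilerplate are illustrative assumptions based only on the lines shown in the hunks above.

```python
# Sketch of post-softmax attention dropout combined with re-attention.
# Assumed/illustrative: class name, to_out projection, default hyperparameters.
import torch
from torch import nn, einsum
from einops import rearrange
from einops.layers.torch import Rearrange

class ReAttention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.dropout = nn.Dropout(dropout)  # post-attention dropout added by this commit

        # learned mixing of attention maps across heads (re-attention)
        self.reattn_weights = nn.Parameter(torch.randn(heads, heads))
        self.reattn_norm = nn.Sequential(
            Rearrange('b h i j -> b i j h'),
            nn.LayerNorm(heads),
            Rearrange('b i j h -> b h i j')
        )

        self.to_out = nn.Linear(inner_dim, dim)

    def forward(self, x):
        q, k, v = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), (q, k, v))

        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
        attn = dots.softmax(dim = -1)
        attn = self.dropout(attn)  # dropout applied directly to the post-softmax attention map

        # re-attention: mix attention maps across heads, then renormalize
        attn = einsum('b h i j, h g -> b g i j', attn, self.reattn_weights)
        attn = self.reattn_norm(attn)

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
```

The key point of the change is where the dropout sits: it regularizes the attention weights themselves (after softmax) rather than only the output projection, which is the behavior the commit message refers to.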