Always reapply the decoder positional embedding to the unmasked tokens, removing the `apply_decoder_pos_emb_all` option (see https://twitter.com/giffmana/status/1479195631587631104)

2025-12-30 08:02:29 +00:00 · 2022-01-06 13:03:50 -08:00
2 changed files with 3 additions and 6 deletions
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '0.26.2',
+  version = '0.26.3',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  author = 'Phil Wang',
--- a/vit_pytorch/mae.py
+++ b/vit_pytorch/mae.py
@@ -14,13 +14,11 @@ class MAE(nn.Module):
        masking_ratio = 0.75,
        decoder_depth = 1,
        decoder_heads = 8,
-        decoder_dim_head = 64,
-        apply_decoder_pos_emb_all = False # whether to (re)apply decoder positional embedding to encoder unmasked tokens
+        decoder_dim_head = 64
    ):
        super().__init__()
        assert masking_ratio > 0 and masking_ratio < 1, 'masking ratio must be kept between 0 and 1'
        self.masking_ratio = masking_ratio
-        self.apply_decoder_pos_emb_all = apply_decoder_pos_emb_all

        # extract some hyperparameters and functions from encoder (vision transformer to be trained)

@@ -75,8 +73,7 @@ class MAE(nn.Module):

        # reapply decoder position embedding to unmasked tokens, if desired

-        if self.apply_decoder_pos_emb_all:
-            decoder_tokens = decoder_tokens + self.decoder_pos_emb(unmasked_indices)
+        decoder_tokens = decoder_tokens + self.decoder_pos_emb(unmasked_indices)

        # repeat mask tokens for number of masked, and add the positions using the masked indices derived above