decoder positional embedding needs to be reapplied to the unmasked tokens (https://twitter.com/giffmana/status/1479195631587631104)

Author: Phil Wang
Date: 2022-01-06 13:14:41 -08:00
parent 28eaba6115
commit 1cc0f182a6
2 changed files with 4 additions and 7 deletions
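As context for the change to the MAE module below, here is a minimal, self-contained sketch of the decoder step this commit touches. Variable names mirror the diff (enc_to_dec, decoder_pos_emb, unmasked_indices); the tensor sizes and the module construction are illustrative assumptions, not the library's exact internals. The point of the fix, per the linked thread: after the encoded tokens are projected into the decoder's width, the decoder's own positional embedding is now always added to the unmasked tokens, so they carry positional information in the same space as the mask tokens.

```python
import torch
import torch.nn as nn

# illustrative sizes, not the library's defaults
batch, num_unmasked, num_patches = 2, 16, 64
encoder_dim, decoder_dim = 1024, 512

enc_to_dec = nn.Linear(encoder_dim, decoder_dim)          # projects encoder tokens into decoder space
decoder_pos_emb = nn.Embedding(num_patches, decoder_dim)  # learned per-position decoder embedding

encoded_tokens = torch.randn(batch, num_unmasked, encoder_dim)           # encoder output for unmasked patches
unmasked_indices = torch.randint(0, num_patches, (batch, num_unmasked))  # which patch positions they came from

decoder_tokens = enc_to_dec(encoded_tokens)
# after this commit the decoder positional embedding is added unconditionally,
# rather than only when an opt-in flag was set
decoder_tokens = decoder_tokens + decoder_pos_emb(unmasked_indices)
```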

setup.py

@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
   name = 'vit-pytorch',
   packages = find_packages(exclude=['examples']),
-  version = '0.26.2',
+  version = '0.26.3',
   license='MIT',
   description = 'Vision Transformer (ViT) - Pytorch',
   author = 'Phil Wang',

vit_pytorch/mae.py

@@ -14,13 +14,11 @@ class MAE(nn.Module):
         masking_ratio = 0.75,
         decoder_depth = 1,
         decoder_heads = 8,
-        decoder_dim_head = 64,
-        apply_decoder_pos_emb_all = False  # whether to (re)apply decoder positional embedding to encoder unmasked tokens
+        decoder_dim_head = 64
     ):
         super().__init__()
         assert masking_ratio > 0 and masking_ratio < 1, 'masking ratio must be kept between 0 and 1'
         self.masking_ratio = masking_ratio
-        self.apply_decoder_pos_emb_all = apply_decoder_pos_emb_all

         # extract some hyperparameters and functions from encoder (vision transformer to be trained)
@@ -73,10 +71,9 @@ class MAE(nn.Module):
         decoder_tokens = self.enc_to_dec(encoded_tokens)

-        # reapply decoder position embedding to unmasked tokens, if desired
-        if self.apply_decoder_pos_emb_all:
-            decoder_tokens = decoder_tokens + self.decoder_pos_emb(unmasked_indices)
+        # reapply decoder position embedding to unmasked tokens
+        decoder_tokens = decoder_tokens + self.decoder_pos_emb(unmasked_indices)

         # repeat mask tokens for number of masked, and add the positions using the masked indices derived above
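For reference, a usage sketch in the style of the repository's README, showing the MAE wrapper after this change. The hyperparameter values are illustrative; note the constructor no longer takes apply_decoder_pos_emb_all, since the positional embedding is now reapplied unconditionally.

```python
import torch
from vit_pytorch import ViT
from vit_pytorch.mae import MAE

v = ViT(
    image_size = 256,
    patch_size = 32,
    num_classes = 1000,
    dim = 1024,
    depth = 6,
    heads = 8,
    mlp_dim = 2048
)

mae = MAE(
    encoder = v,            # the vision transformer to be pretrained
    masking_ratio = 0.75,   # a high masking ratio, as in the MAE paper
    decoder_dim = 512,      # decoder can be much narrower than the encoder
    decoder_depth = 6       # no apply_decoder_pos_emb_all argument anymore
)

images = torch.randn(8, 3, 256, 256)

loss = mae(images)   # reconstruction loss on the masked patches
loss.backward()
```

Installing version 0.26.3 or later (the bump in this commit) picks up this behavior.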