From 598cffab534efc38c2e08922f831b5f6a2dd1a01 Mon Sep 17 00:00:00 2001
From: Phil Wang
Date: Mon, 24 Jul 2023 13:55:54 -0700
Subject: [PATCH] release NaViT

---
 README.md | 4 ++--
 setup.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 33d95f1..5264e94 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 - [Usage](#usage)
 - [Parameters](#parameters)
 - [Simple ViT](#simple-vit)
-- [NaViT](#na-vit)
+- [NaViT](#navit)
 - [Distillation](#distillation)
 - [Deep ViT](#deep-vit)
 - [CaiT](#cait)
@@ -142,7 +142,7 @@ preds = v(img) # (1, 1000)
 
 ## NaViT
 
-
+
 
 This paper proposes to leverage the flexibility of attention and masking for variable lengthed sequences to train images of multiple resolution, packed into a single batch. They demonstrate much faster training and improved accuracies, with the only cost being extra complexity in the architecture and dataloading. They use factorized 2d positional encodings, token dropping, as well as query-key normalization.
 
diff --git a/setup.py b/setup.py
index b2c2b9a..ed0f185 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
   name = 'vit-pytorch',
   packages = find_packages(exclude=['examples']),
-  version = '1.2.5',
+  version = '1.2.6',
   license='MIT',
   description = 'Vision Transformer (ViT) - Pytorch',
   long_description_content_type = 'text/markdown',
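
For context on what this release ships: the NaViT paragraph in the README hunk above describes packing images of multiple resolutions into a single batch via attention masking, with factorized 2d positional embeddings and token dropping. The sketch below shows how such a model might be instantiated and called from `vit_pytorch.na_vit`; the exact constructor arguments (e.g. `token_dropout_prob`) and the nested-list batch format are assumptions based on the vit-pytorch interface around this release and may differ in other versions.

```python
# A minimal sketch, assuming the NaViT interface in vit_pytorch.na_vit around
# this release (argument names such as token_dropout_prob are assumptions).
import torch
from vit_pytorch.na_vit import NaViT

v = NaViT(
    image_size = 256,          # maximum image side length
    patch_size = 32,
    num_classes = 1000,
    dim = 1024,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1,
    token_dropout_prob = 0.1   # drop a fraction of patch tokens during training
)

# Images of varying resolution; each inner list is packed into one sequence,
# and each image side should be divisible by patch_size.
images = [
    [torch.randn(3, 256, 256), torch.randn(3, 128, 128)],
    [torch.randn(3, 64, 256)]
]

preds = v(images)  # (3, 1000) - one prediction per image, regardless of grouping
```

Grouping smaller images into the same inner list keeps each packed sequence under the attention length limit, which is the extra dataloading complexity the README paragraph refers to.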