CCT allow for rectangular images

add proposed parallel vit from facebook ai for exploration purposes
2026-05-14 12:18:06 +00:00 · 2022-03-26 14:02:49 -07:00 · 2022-03-23 10:42:35 -07:00
3 changed files with 46 additions and 37 deletions
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@
 - [Adaptive Token Sampling](#adaptive-token-sampling)
 - [Patch Merger](#patch-merger)
 - [Vision Transformer for Small Datasets](#vision-transformer-for-small-datasets)
- [Parallel ViT](#parallelvit)
+- [Parallel ViT](#parallel-vit)
 - [Dino](#dino)
 - [Accessing Attention](#accessing-attention)
 - [Research Ideas](#research-ideas)
@@ -253,22 +253,25 @@ You can use this with two methods
 import torch
 from vit_pytorch.cct import CCT

-model = CCT(
-        img_size=224,
-        embedding_dim=384,
-        n_conv_layers=2,
-        kernel_size=7,
-        stride=2,
-        padding=3,
-        pooling_kernel_size=3,
-        pooling_stride=2,
-        pooling_padding=1,
-        num_layers=14,
-        num_heads=6,
-        mlp_radio=3.,
-        num_classes=1000,
-        positional_embedding='learnable', # ['sine', 'learnable', 'none']
-        )
+cct = CCT(
+    img_size = (224, 448),
+    embedding_dim = 384,
+    n_conv_layers = 2,
+    kernel_size = 7,
+    stride = 2,
+    padding = 3,
+    pooling_kernel_size = 3,
+    pooling_stride = 2,
+    pooling_padding = 1,
+    num_layers = 14,
+    num_heads = 6,
+    mlp_ratio = 3.,
+    num_classes = 1000,
+    positional_embedding = 'learnable', # ['sine', 'learnable', 'none']
+)
+
+img = torch.randn(1, 3, 224, 448)
+pred = cct(img) # (1, 1000)
 ```

 Alternatively you can use one of several pre-defined models `[2,4,6,7,8,14,16]`
@@ -279,23 +282,23 @@ and the embedding dimension.
 import torch
 from vit_pytorch.cct import cct_14

-model = cct_14(
-        img_size=224,
-        n_conv_layers=1,
-        kernel_size=7,
-        stride=2,
-        padding=3,
-        pooling_kernel_size=3,
-        pooling_stride=2,
-        pooling_padding=1,
-        num_classes=1000,
-        positional_embedding='learnable', # ['sine', 'learnable', 'none']  
-        )
+cct = cct_14(
+    img_size = 224,
+    n_conv_layers = 1,
+    kernel_size = 7,
+    stride = 2,
+    padding = 3,
+    pooling_kernel_size = 3,
+    pooling_stride = 2,
+    pooling_padding = 1,
+    num_classes = 1000,
+    positional_embedding = 'learnable', # ['sine', 'learnable', 'none']
+)
 ```
+
 <a href="https://github.com/SHI-Labs/Compact-Transformers">Official
 Repository</a> includes links to pretrained model checkpoints.

-
 ## Cross ViT

 <img src="./images/cross_vit.png" width="400px"></img>
@@ -885,7 +888,7 @@ v = ViT(
    patch_size = 16,
    num_classes = 1000,
    dim = 1024,
-    depth = 12,
+    depth = 6,
    heads = 8,
    mlp_dim = 2048,
    num_parallel_branches = 2,  # in paper, they claimed 2 was optimal
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
  name = 'vit-pytorch',
  packages = find_packages(exclude=['examples']),
-  version = '0.29.0',
+  version = '0.29.1',
  license='MIT',
  description = 'Vision Transformer (ViT) - Pytorch',
  author = 'Phil Wang',
--- a/vit_pytorch/cct.py
+++ b/vit_pytorch/cct.py
@@ -2,7 +2,13 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-# Pre-defined CCT Models
+# helpers
+
+def pair(t):
+    return t if isinstance(t, tuple) else (t, t)
+
+# CCT Models
+
 __all__ = ['cct_2', 'cct_4', 'cct_6', 'cct_7', 'cct_8', 'cct_14', 'cct_16']


@@ -55,8 +61,8 @@ def _cct(num_layers, num_heads, mlp_ratio, embedding_dim,
               padding=padding,
               *args, **kwargs)

+# modules

-# Modules
 class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, attention_dropout=0.1, projection_dropout=0.1):
        super().__init__()
@@ -308,6 +314,7 @@ class CCT(nn.Module):
                 pooling_padding=1,
                 *args, **kwargs):
        super(CCT, self).__init__()
+        img_height, img_width = pair(img_size)

        self.tokenizer = Tokenizer(n_input_channels=n_input_channels,
                                   n_output_channels=embedding_dim,
@@ -324,8 +331,8 @@ class CCT(nn.Module):

        self.classifier = TransformerClassifier(
            sequence_length=self.tokenizer.sequence_length(n_channels=n_input_channels,
-                                                           height=img_size,
-                                                           width=img_size),
+                                                           height=img_height,
+                                                           width=img_width),
            embedding_dim=embedding_dim,
            seq_pool=True,
            dropout_rate=0.,
@@ -336,4 +343,3 @@ class CCT(nn.Module):
    def forward(self, x):
        x = self.tokenizer(x)
        return self.classifier(x)
-
Author	SHA1	Message	Date
Phil Wang	9cd56ff29b	CCT allow for rectangular images	2022-03-26 14:02:49 -07:00
Phil Wang	2aae406ce8	add proposed parallel vit from facebook ai for exploration purposes	2022-03-23 10:42:35 -07:00