Minor changes

2025-12-30 08:02:29 +00:00 · 2021-07-01 17:51:35 -07:00
parent a73030c9aa
commit 2ece3333da
2 changed files with 60 additions and 65 deletions
--- a/README.md
+++ b/README.md
@@ -62,70 +62,6 @@ Dropout rate.
 Embedding dropout rate.
 - `pool`: string, either `cls` token pooling or `mean` pooling

-## CCT
-<img src="https://raw.githubusercontent.com/SHI-Labs/Compact-Transformers/main/images/model_sym.png" width="400px"></img>
-
-<a href="https://arxiv.org/abs/2104.05704">CCT</a> proposes compact transformers
-by using convolutions instead of patching and performing sequence pooling. This
-allows for CCT to have high accuracy and a low number of parameters.
-
-You can use this with two methods
-```python
-import torch
-from vit_pytorch.cct import CCT
-
-model = CCT(
-        img_size=224,
-        embedding_dim=768,
-        n_input_channels=3,
-        n_conv_layers=1,
-        kernel_size=7,
-        stride=2,
-        padding=3,
-        pooling_kernel_size=3,
-        pooling_stride=2,
-        pooling_padding=1,
-        num_layers=12,
-        num_heads=12,
-        mlp_radio=4.,
-        num_classes=1000,
-        dropout_rate=0.1,
-        attention_dropout=0.1,
-        stochastic_depth_rate=0.1,
-        positional_embedding='sine', # ['sine', 'learnable', 'none']
-        sequence_length=None,        
-        )
-```
-
-Alternatively you can use one of several pre-defined models `[2,4,6,7,8,14,16]`
-which pre-define the number of layers, number of attention heads, the mlp ratio,
-and the embedding dimension.
-
-```python
-import torch
-from vit_pytorch.cct import cct_2
-
-model = cct_2(
-        img_size=224,
-        n_input_channels=3,
-        n_conv_layers=1,
-        kernel_size=7,
-        stride=2,
-        padding=3,
-        pooling_kernel_size=3,
-        pooling_stride=2,
-        pooling_padding=1,
-        num_classes=1000,
-        dropout_rate=0.1,
-        attention_dropout=0.1,
-        stochastic_depth_rate=0.1,
-        positional_embedding='sine', # ['sine', 'learnable', 'none']
-        sequence_length=None,        
-        )
-```
-<a href="https://github.com/SHI-Labs/Compact-Transformers">Official
-Repository</a>
-

 ## Distillation

@@ -267,6 +203,61 @@ img = torch.randn(1, 3, 224, 224)
 preds = v(img) # (1, 1000)
 ```

+## CCT
+<img src="https://raw.githubusercontent.com/SHI-Labs/Compact-Transformers/main/images/model_sym.png" width="400px"></img>
+
+<a href="https://arxiv.org/abs/2104.05704">CCT</a> proposes compact transformers
+by using convolutions instead of patching and performing sequence pooling. This
+allows for CCT to have high accuracy and a low number of parameters.
+
+You can use CCT in one of two ways. The first is to configure the model directly:
+```python
+import torch
+from vit_pytorch.cct import CCT
+
+model = CCT(
+        img_size=224,
+        embedding_dim=384,
+        n_conv_layers=2,
+        kernel_size=7,
+        stride=2,
+        padding=3,
+        pooling_kernel_size=3,
+        pooling_stride=2,
+        pooling_padding=1,
+        num_layers=14,
+        num_heads=6,
+        mlp_ratio=3.,
+        num_classes=1000,
+        positional_embedding='learnable', # ['sine', 'learnable', 'none']
+        )
+```
+
+Alternatively you can use one of several pre-defined models `[2,4,6,7,8,14,16]`
+which pre-define the number of layers, number of attention heads, the mlp ratio,
+and the embedding dimension.
+
+```python
+import torch
+from vit_pytorch.cct import cct_14
+
+model = cct_14(
+        img_size=224,
+        n_conv_layers=1,
+        kernel_size=7,
+        stride=2,
+        padding=3,
+        pooling_kernel_size=3,
+        pooling_stride=2,
+        pooling_padding=1,
+        num_classes=1000,
+        positional_embedding='learnable', # ['sine', 'learnable', 'none']  
+        )
+```
+The <a href="https://github.com/SHI-Labs/Compact-Transformers">official
+repository</a> includes links to pretrained model checkpoints.
+
+
 ## Cross ViT

 <img src="./images/cross_vit.png" width="400px"></img>
--- a/vit_pytorch/cct.py
+++ b/vit_pytorch/cct.py
@@ -3,7 +3,8 @@ import torch.nn as nn
 import torch.nn.functional as F

 # Pre-defined CCT Models
-__all__ = ['cct_2', 'cct_4', 'cct_6', 'cct_7', 'cct_8', 'cct_16']
+__all__ = ['cct_2', 'cct_4', 'cct_6', 'cct_7', 'cct_8', 'cct_14', 'cct_16']
+

 def cct_2(*args, **kwargs):
    return _cct(num_layers=2, num_heads=2, mlp_ratio=1, embedding_dim=128,
@@ -39,6 +40,7 @@ def cct_16(*args, **kwargs):
    return _cct(num_layers=16, num_heads=6, mlp_ratio=3, embedding_dim=384,
                *args, **kwargs)

+
 def _cct(num_layers, num_heads, mlp_ratio, embedding_dim,
         kernel_size=3, stride=None, padding=None,
         *args, **kwargs):
@@ -81,6 +83,7 @@ class Attention(nn.Module):
        x = self.proj_drop(x)
        return x

+
 class TransformerEncoderLayer(nn.Module):
    """
    Inspired by torch.nn.TransformerEncoderLayer and
@@ -143,6 +146,7 @@ class DropPath(nn.Module):
    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)

+
 class Tokenizer(nn.Module):
    def __init__(self,
                 kernel_size, stride, padding,