Commit 8d41071

Update comments in EVA ViT impl to reference PE, and timm SBB variants

1 parent 7101adb commit 8d41071
1 file changed: +39 −10 lines changed

timm/models/eva.py

Lines changed: 39 additions & 10 deletions
@@ -1,6 +1,6 @@
 """ EVA

-EVA from https://github.com/baaivision/EVA , paper: https://arxiv.org/abs/2211.07636
+EVA ViT from https://github.com/baaivision/EVA , paper: https://arxiv.org/abs/2211.07636

 @article{EVA,
   title={EVA: Exploring the Limits of Masked Visual Representation Learning at Scale},
@@ -18,7 +18,18 @@
   year={2023}
 }

-This file contains EVA & EVA02 model implementations evolved from BEiT, additional models in vision_transformer.py.
+@article{bolya2025perception,
+  title={Perception encoder: The best visual embeddings are not at the output of the network},
+  author={Bolya, Daniel and Huang, Po-Yao and Sun, Peize and Cho, Jang Hyun and Madotto, Andrea and Wei, Chen and Ma,
+    Tengyu and Zhi, Jiale and Rajasegaran, Jathushan and Rasheed, Hanoona and others},
+  journal={arXiv preprint arXiv:2504.13181},
+  year={2025}
+}
+
+This file contains a number of ViT variants that utilise ROPE position embeddings, SwiGLU and other additions:
+* EVA & EVA02 model implementations that evolved from BEiT, additional models in vision_transformer.py.
+* `timm` original SBB ViT w/ ROPE position embeddings
+* Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)

 Modifications by / Copyright 2023 Ross Wightman, original copyrights below
 """
@@ -1295,30 +1306,31 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:

 @register_model
 def eva_giant_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
-    """ EVA-g model https://arxiv.org/abs/2211.07636 """
+    """EVA-g model https://arxiv.org/abs/2211.07636"""
     model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
     model = _create_eva('eva_giant_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
     return model


 @register_model
 def eva_giant_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
-    """ EVA-g model https://arxiv.org/abs/2211.07636 """
+    """EVA-g model https://arxiv.org/abs/2211.07636"""
     model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
     model = _create_eva('eva_giant_patch14_336', pretrained=pretrained, **dict(model_args, **kwargs))
     return model


 @register_model
 def eva_giant_patch14_560(pretrained: bool = False, **kwargs) -> Eva:
-    """ EVA-g model https://arxiv.org/abs/2211.07636 """
+    """EVA-g model https://arxiv.org/abs/2211.07636"""
     model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
     model = _create_eva('eva_giant_patch14_560', pretrained=pretrained, **dict(model_args, **kwargs))
     return model


 @register_model
 def eva02_tiny_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Tiny https://arxiv.org/abs/2303.11331"""
     model_args = dict(
         img_size=224,
         patch_size=14,
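Since every function touched below is wrapped in `@register_model`, these variants can be discovered by name rather than hunted down in the source. A small sketch (the wildcard patterns are assumptions for illustration, not part of this commit):

import timm

# EVA / EVA02 variants registered by timm/models/eva.py
print(timm.list_models('eva*'))

# ROPE SBB ViTs and PE ViTs live in the same file
print(timm.list_models('vit_*rope*'))
print(timm.list_models('vit_pe_*'))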
@@ -1336,6 +1348,7 @@ def eva02_tiny_patch14_224(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_small_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Small https://arxiv.org/abs/2303.11331"""
     model_args = dict(
         img_size=224,
         patch_size=14,
@@ -1353,6 +1366,7 @@ def eva02_small_patch14_224(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_base_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Base https://arxiv.org/abs/2303.11331"""
     model_args = dict(
         img_size=224,
         patch_size=14,
@@ -1372,6 +1386,7 @@ def eva02_base_patch14_224(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_large_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Large https://arxiv.org/abs/2303.11331"""
     model_args = dict(
         img_size=224,
         patch_size=14,
@@ -1391,6 +1406,7 @@ def eva02_large_patch14_224(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_tiny_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Tiny https://arxiv.org/abs/2303.11331"""
     model_args = dict(
         img_size=336,
         patch_size=14,
@@ -1408,6 +1424,7 @@ def eva02_tiny_patch14_336(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_small_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Small https://arxiv.org/abs/2303.11331"""
     model_args = dict(
         img_size=336,
         patch_size=14,
@@ -1425,6 +1442,7 @@ def eva02_small_patch14_336(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_base_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Base https://arxiv.org/abs/2303.11331"""
     model_args = dict(
         img_size=448,
         patch_size=14,
@@ -1444,6 +1462,7 @@ def eva02_base_patch14_448(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Large https://arxiv.org/abs/2303.11331"""
     model_args = dict(
         img_size=448,
         patch_size=14,
@@ -1463,7 +1482,7 @@ def eva02_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva_giant_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:
-    """ EVA-g CLIP model (only difference from non-CLIP is the pooling) """
+    """EVA-g CLIP model (only difference from non-CLIP is the pooling)"""
     model_args = dict(
         patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408,
         global_pool=kwargs.pop('global_pool', 'token'))
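The `kwargs.pop('global_pool', 'token')` pattern above defaults the CLIP variant to class-token pooling while still letting callers override it at creation time. A hedged sketch of that override path ('avg' as an alternate pool value is an assumption for illustration, not something this diff establishes):

import timm

# Default per the registration above: token pooling
m_token = timm.create_model('eva_giant_patch14_clip_224', pretrained=False)

# Caller-supplied override, passed through **kwargs into the registration fn
m_avg = timm.create_model('eva_giant_patch14_clip_224', pretrained=False, global_pool='avg')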
@@ -1473,7 +1492,7 @@ def eva_giant_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_base_patch16_clip_224(pretrained: bool = False, **kwargs) -> Eva:
-    """ A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_base """
+    """An EVA-CLIP specific variant that adds additional attn scale layer-norm to eva02_base"""
     model_args = dict(
         img_size=224,
         patch_size=16,
@@ -1495,7 +1514,7 @@ def eva02_base_patch16_clip_224(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_large_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:
-    """ A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_large """
+    """An EVA-CLIP specific variant that adds additional attn scale layer-norm to eva02_large"""
     model_args = dict(
         img_size=224,
         patch_size=14,
@@ -1517,7 +1536,7 @@ def eva02_large_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_large_patch14_clip_336(pretrained: bool = False, **kwargs) -> Eva:
-    """ A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_large """
+    """An EVA-CLIP specific variant that adds additional attn scale layer-norm to eva02_large"""
     model_args = dict(
         img_size=336,
         patch_size=14,
@@ -1539,7 +1558,7 @@ def eva02_large_patch14_clip_336(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def eva02_enormous_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:
-    """ A EVA-CLIP specific variant that uses residual post-norm in blocks """
+    """An EVA-CLIP specific variant that uses residual post-norm in blocks"""
     model_args = dict(
         img_size=224,
         patch_size=14,
@@ -1556,6 +1575,7 @@ def eva02_enormous_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def vit_medium_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:
+    """timm SBB ViT with ROPE"""
     model_args = dict(
         img_size=256,
         patch_size=16,
@@ -1577,6 +1597,7 @@ def vit_medium_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def vit_mediumd_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:
+    """timm SBB ViT with ROPE"""
     model_args = dict(
         img_size=256,
         patch_size=16,
@@ -1598,6 +1619,7 @@ def vit_mediumd_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def vit_betwixt_patch16_rope_reg4_gap_256(pretrained: bool = False, **kwargs) -> Eva:
+    """timm SBB ViT with ROPE"""
     model_args = dict(
         img_size=256,
         patch_size=16,
@@ -1619,6 +1641,7 @@ def vit_betwixt_patch16_rope_reg4_gap_256(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def vit_base_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:
+    """timm SBB ViT with ROPE"""
     model_args = dict(
         img_size=256,
         patch_size=16,
@@ -1640,6 +1663,7 @@ def vit_base_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def vit_pe_core_base_patch16_224(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
     model_args = dict(
         patch_size=16,
         embed_dim=768,
@@ -1663,6 +1687,7 @@ def vit_pe_core_base_patch16_224(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def vit_pe_core_large_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
     model_args = dict(
         patch_size=14,
         embed_dim=1024,
@@ -1686,6 +1711,7 @@ def vit_pe_core_large_patch14_336(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def vit_pe_core_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
     model_args = dict(
         patch_size=14,
         embed_dim=1536,
@@ -1709,6 +1735,7 @@ def vit_pe_core_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def vit_pe_lang_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
     model_args = dict(
         patch_size=14,
         embed_dim=1024,
@@ -1733,6 +1760,7 @@ def vit_pe_lang_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def vit_pe_lang_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
     model_args = dict(
         patch_size=14,
         embed_dim=1536,
@@ -1756,6 +1784,7 @@ def vit_pe_lang_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:

 @register_model
 def vit_pe_spatial_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
     model_args = dict(
         patch_size=14,
         embed_dim=1536,
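To close the loop on the PE additions, a short usage sketch for the smallest PE registration above. The input resolution follows the `_224` suffix in the model name; the output shape depends on head and pooling config not shown in this diff, so the printed shape is informational only:

import timm
import torch

# PE core base: patch_size=16, embed_dim=768 per the hunk above
model = timm.create_model('vit_pe_core_base_patch16_224', pretrained=False)
model.eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # final output; dims depend on head/pooling config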
