""" EVA

- EVA from https://github.com/baaivision/EVA , paper: https://arxiv.org/abs/2211.07636
+ EVA ViT from https://github.com/baaivision/EVA , paper: https://arxiv.org/abs/2211.07636

@article{EVA,
    title={EVA: Exploring the Limits of Masked Visual Representation Learning at Scale},
@@ -18,7 +18,18 @@
    year={2023}
}

- This file contains EVA & EVA02 model implementations evolved from BEiT, additional models in vision_transformer.py.
+ @article{bolya2025perception,
+     title={Perception encoder: The best visual embeddings are not at the output of the network},
+     author={Bolya, Daniel and Huang, Po-Yao and Sun, Peize and Cho, Jang Hyun and Madotto, Andrea and Wei, Chen and Ma,
+     Tengyu and Zhi, Jiale and Rajasegaran, Jathushan and Rasheed, Hanoona and others},
+     journal={arXiv preprint arXiv:2504.13181},
+     year={2025}
+ }
+
+ This file contains a number of ViT variants that utilise ROPE position embeddings, SwiGLU and other additions:
+ * EVA & EVA02 model implementations that evolved from BEiT, additional models in vision_transformer.py.
+ * `timm` original SBB ViT w/ ROPE position embeddings
+ * Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)

Modifications by / Copyright 2023 Ross Wightman, original copyrights below
"""
@@ -1295,30 +1306,31 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:

@register_model
def eva_giant_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
-    """ EVA-g model https://arxiv.org/abs/2211.07636 """
+    """EVA-g model https://arxiv.org/abs/2211.07636"""
    model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
    model = _create_eva('eva_giant_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def eva_giant_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
-    """ EVA-g model https://arxiv.org/abs/2211.07636 """
+    """EVA-g model https://arxiv.org/abs/2211.07636"""
    model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
    model = _create_eva('eva_giant_patch14_336', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def eva_giant_patch14_560(pretrained: bool = False, **kwargs) -> Eva:
-    """ EVA-g model https://arxiv.org/abs/2211.07636 """
+    """EVA-g model https://arxiv.org/abs/2211.07636"""
    model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
    model = _create_eva('eva_giant_patch14_560', pretrained=pretrained, **dict(model_args, **kwargs))
    return model
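
Worth noting: the fractional `mlp_ratio=6144/1408` in the EVA-g configs is deliberate. timm derives the FFN hidden width as `embed_dim * mlp_ratio`, so expressing the ratio as a fraction recovers the exact 6144-wide hidden layer rather than a rounded multiple. A quick arithmetic check in plain Python:

    embed_dim = 1408
    mlp_ratio = 6144 / 1408
    hidden_features = int(embed_dim * mlp_ratio)
    assert hidden_features == 6144  # exact, no rounding drift
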


@register_model
def eva02_tiny_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Tiny https://arxiv.org/abs/2303.11331"""
    model_args = dict(
        img_size=224,
        patch_size=14,
@@ -1336,6 +1348,7 @@ def eva02_tiny_patch14_224(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_small_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Small https://arxiv.org/abs/2303.11331"""
    model_args = dict(
        img_size=224,
        patch_size=14,
@@ -1353,6 +1366,7 @@ def eva02_small_patch14_224(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_base_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Base https://arxiv.org/abs/2303.11331"""
    model_args = dict(
        img_size=224,
        patch_size=14,
@@ -1372,6 +1386,7 @@ def eva02_base_patch14_224(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_large_patch14_224(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Large https://arxiv.org/abs/2303.11331"""
    model_args = dict(
        img_size=224,
        patch_size=14,
@@ -1391,6 +1406,7 @@ def eva02_large_patch14_224(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_tiny_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Tiny https://arxiv.org/abs/2303.11331"""
    model_args = dict(
        img_size=336,
        patch_size=14,
@@ -1408,6 +1424,7 @@ def eva02_tiny_patch14_336(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_small_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Small https://arxiv.org/abs/2303.11331"""
    model_args = dict(
        img_size=336,
        patch_size=14,
@@ -1425,6 +1442,7 @@ def eva02_small_patch14_336(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_base_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Base https://arxiv.org/abs/2303.11331"""
    model_args = dict(
        img_size=448,
        patch_size=14,
@@ -1444,6 +1462,7 @@ def eva02_base_patch14_448(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """EVA02 Large https://arxiv.org/abs/2303.11331"""
    model_args = dict(
        img_size=448,
        patch_size=14,
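
The `img_size`/`patch_size` pairs above fix the token grid each variant sees; for the 448-resolution EVA02 models with patch 14:

    img_size, patch_size = 448, 14
    grid = img_size // patch_size   # 32 patches per side
    num_patches = grid * grid       # 1024 patch tokens, before any prefix tokens
    assert (grid, num_patches) == (32, 1024)
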
@@ -1463,7 +1482,7 @@ def eva02_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva_giant_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:
-    """ EVA-g CLIP model (only difference from non-CLIP is the pooling) """
+    """EVA-g CLIP model (only difference from non-CLIP is the pooling)"""
    model_args = dict(
        patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408,
        global_pool=kwargs.pop('global_pool', 'token'))
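
The `kwargs.pop('global_pool', 'token')` idiom makes class-token pooling the default for the CLIP variant while still letting callers override it at creation time. A sketch of the effect through the standard factory:

    import timm

    # Default: CLS-token pooling, as set by the registration above.
    clip_default = timm.create_model('eva_giant_patch14_clip_224')

    # Override: the popped kwarg replaces 'token', e.g. switching to
    # global average pooling over patch tokens instead.
    clip_avg = timm.create_model('eva_giant_patch14_clip_224', global_pool='avg')
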
@@ -1473,7 +1492,7 @@ def eva_giant_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_base_patch16_clip_224(pretrained: bool = False, **kwargs) -> Eva:
-    """ A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_base """
+    """An EVA-CLIP specific variant that adds additional attn scale layer-norm to eva02_base"""
    model_args = dict(
        img_size=224,
        patch_size=16,
@@ -1495,7 +1514,7 @@ def eva02_base_patch16_clip_224(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_large_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:
-    """ A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_large """
+    """An EVA-CLIP specific variant that adds additional attn scale layer-norm to eva02_large"""
    model_args = dict(
        img_size=224,
        patch_size=14,
@@ -1517,7 +1536,7 @@ def eva02_large_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_large_patch14_clip_336(pretrained: bool = False, **kwargs) -> Eva:
-    """ A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_large """
+    """An EVA-CLIP specific variant that adds additional attn scale layer-norm to eva02_large"""
    model_args = dict(
        img_size=336,
        patch_size=14,
@@ -1539,7 +1558,7 @@ def eva02_large_patch14_clip_336(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def eva02_enormous_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:
-    """ A EVA-CLIP specific variant that uses residual post-norm in blocks """
+    """An EVA-CLIP specific variant that uses residual post-norm in blocks"""
    model_args = dict(
        img_size=224,
        patch_size=14,
@@ -1556,6 +1575,7 @@ def eva02_enormous_patch14_clip_224(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def vit_medium_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:
+    """timm SBB ViT with ROPE"""
    model_args = dict(
        img_size=256,
        patch_size=16,
@@ -1577,6 +1597,7 @@ def vit_medium_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) ->

@register_model
def vit_mediumd_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:
+    """timm SBB ViT with ROPE"""
    model_args = dict(
        img_size=256,
        patch_size=16,
@@ -1598,6 +1619,7 @@ def vit_mediumd_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) ->

@register_model
def vit_betwixt_patch16_rope_reg4_gap_256(pretrained: bool = False, **kwargs) -> Eva:
+    """timm SBB ViT with ROPE"""
    model_args = dict(
        img_size=256,
        patch_size=16,
@@ -1619,6 +1641,7 @@ def vit_betwixt_patch16_rope_reg4_gap_256(pretrained: bool = False, **kwargs) ->

@register_model
def vit_base_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:
+    """timm SBB ViT with ROPE"""
    model_args = dict(
        img_size=256,
        patch_size=16,
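
Because the SBB ROPE variants share a naming scheme, the registry wildcard filter is a convenient way to enumerate them (assuming this module is importable):

    import timm

    # Returns sorted registered names containing 'rope', e.g.
    # 'vit_medium_patch16_rope_reg1_gap_256' from above.
    print(timm.list_models('*rope*'))
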
@@ -1640,6 +1663,7 @@ def vit_base_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Ev

@register_model
def vit_pe_core_base_patch16_224(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
    model_args = dict(
        patch_size=16,
        embed_dim=768,
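
The PE models go through the same factory as everything else in this file; one way to pull image embeddings from the base PE encoder (a sketch using timm's standard `num_classes=0` convention):

    import timm
    import torch

    # num_classes=0 removes the classifier; the forward pass then returns
    # pooled features of width encoder.num_features.
    encoder = timm.create_model('vit_pe_core_base_patch16_224', num_classes=0)
    encoder.eval()

    with torch.no_grad():
        feats = encoder(torch.randn(1, 3, 224, 224))
    print(feats.shape)  # (1, encoder.num_features)
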
@@ -1663,6 +1687,7 @@ def vit_pe_core_base_patch16_224(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def vit_pe_core_large_patch14_336(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
    model_args = dict(
        patch_size=14,
        embed_dim=1024,
@@ -1686,6 +1711,7 @@ def vit_pe_core_large_patch14_336(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def vit_pe_core_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
    model_args = dict(
        patch_size=14,
        embed_dim=1536,
@@ -1709,6 +1735,7 @@ def vit_pe_core_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def vit_pe_lang_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
    model_args = dict(
        patch_size=14,
        embed_dim=1024,
@@ -1733,6 +1760,7 @@ def vit_pe_lang_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def vit_pe_lang_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
    model_args = dict(
        patch_size=14,
        embed_dim=1536,
@@ -1756,6 +1784,7 @@ def vit_pe_lang_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:

@register_model
def vit_pe_spatial_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
    model_args = dict(
        patch_size=14,
        embed_dim=1536,