
Commit 6fb7aaf

Switching to timm-specific weight instances for open_clip image encoders to facilitate hf-hub: use in timm and the new transformers TimmWrapper

1 parent 364c567 · commit 6fb7aaf
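In the pretrained cfgs touched below, hf_hub_id='timm/' is an org-only shorthand: the model name and tag are appended to form the full Hub repo id (e.g. timm/convnext_base.clip_laion2b), so each weight instance resolves to its own timm-hosted repo instead of the multi-weight open_clip checkpoints under laion/. A minimal sketch of that expansion, assuming the repo-id convention is simply '<org>/<model_name>.<tag>' (the helper name is illustrative, not timm's internal API):

# Illustrative helper (hypothetical, not timm's internal API): expand the
# org-only 'timm/' shorthand used in the cfgs below into a full HF Hub repo id.
def expand_hf_hub_id(model_name: str, tag: str = '', hf_hub_id: str = 'timm/') -> str:
    if hf_hub_id.endswith('/'):
        # org-only shorthand: append '<model_name>.<tag>' to get the repo id
        return hf_hub_id + (f'{model_name}.{tag}' if tag else model_name)
    # already a full repo id, e.g. 'laion/CLIP-ViT-B-32-laion2B-s34B-b79K'
    return hf_hub_id

print(expand_hf_hub_id('convnext_base', 'clip_laion2b'))  # -> timm/convnext_base.clip_laion2b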

File tree: 2 files changed (+32, −54 lines)

timm/models/convnext.py

Lines changed: 10 additions & 20 deletions
@@ -916,53 +916,43 @@ def _cfgv2(url='', **kwargs):
 
     # CLIP original image tower weights
     'convnext_base.clip_laion2b': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laion2b_augreg': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laiona': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laiona_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laiona_augreg_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640),
     'convnext_large_mlp.clip_laion2b_augreg': _cfg(
-        hf_hub_id='laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=768),
     'convnext_large_mlp.clip_laion2b_ft_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768),
     'convnext_large_mlp.clip_laion2b_ft_soup_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768),
     'convnext_xxlarge.clip_laion2b_soup': _cfg(
-        hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
     'convnext_xxlarge.clip_laion2b_rewind': _cfg(
-        hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
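With the cfgs above pointing at timm-hosted repos, these ConvNeXt CLIP image towers load like any other timm pretrained weight. A short usage sketch (the model/tag name is taken from the diff, and it assumes the corresponding timm/ Hub repo has been pushed):

import torch
import timm

# Load the ConvNeXt CLIP image tower; num_classes=0 resets the head so the
# model returns pooled image features rather than the CLIP projection output.
model = timm.create_model('convnext_base.clip_laion2b', pretrained=True, num_classes=0)
model.eval()

# Data config comes from the pretrained cfg above (CLIP mean/std, 256x256 input).
data_cfg = timm.data.resolve_model_data_config(model)
transform = timm.data.create_transform(**data_cfg, is_training=False)

# A random tensor stands in for a transformed image here.
with torch.no_grad():
    features = model(torch.randn(1, *data_cfg['input_size']))
print(features.shape)

# Equivalent, using the hf-hub: form named in the commit message:
# timm.create_model('hf-hub:timm/convnext_base.clip_laion2b', pretrained=True)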

timm/models/vision_transformer.py

Lines changed: 22 additions & 34 deletions
@@ -1556,9 +1556,6 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
 
-    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',  # FIXME weight exists, need to push
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
     'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
@@ -1569,9 +1566,6 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
 
-    'vit_base_patch32_clip_224.openai_ft_in12k': _cfg(
-        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',  # FIXME weight exists, need to push
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
     'vit_base_patch16_clip_224.openai_ft_in12k': _cfg(
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
@@ -1580,28 +1574,22 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
 
     'vit_base_patch32_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_base_patch16_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
     'vit_giant_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
     'vit_gigantic_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
 
     'vit_base_patch32_clip_224.laion400m_e32': _cfg(
@@ -1620,21 +1608,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
 
     'vit_base_patch32_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch32_clip_256.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 256, 256), num_classes=512),
     'vit_base_patch16_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
 
     'vit_base_patch16_clip_224.dfn2b': _cfg(
@@ -1659,42 +1643,46 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),
 
     'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_huge_patch14_clip_224.metaclip_altogether': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
     'vit_gigantic_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
     'vit_base_patch32_clip_224.metaclip_400m': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_clip_224.metaclip_400m': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.metaclip_400m': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
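The ViT entries follow the same pattern, and because each weight instance now lives in a standalone timm/ repo, the checkpoints can also be pulled in via the hf-hub: prefix or through the transformers TimmWrapper named in the commit message. A hedged sketch (repo names come from the tags above; TimmWrapper behavior is assumed from its Auto-class interface, and the timm/ repos must already be pushed):

import timm

# timm: the 'hf-hub:' prefix resolves weights directly from the named Hub repo.
vit = timm.create_model('hf-hub:timm/vit_base_patch16_clip_224.laion2b', pretrained=True)

# transformers: the TimmWrapper routes timm checkpoints through the Auto* classes
# (requires a transformers release that includes TimmWrapper; assumed behavior).
from transformers import AutoModel

wrapped = AutoModel.from_pretrained('timm/vit_base_patch16_clip_224.laion2b')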
