
Commit 8bdeeea

Merge pull request #2501 from brianhou0208/grad_checkpointing
Support gradient checkpointing in `forward_intermediates()`
2 parents 2b9840c + b3a8773 commit 8bdeeea
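
For context, a minimal usage sketch (not part of the diff) of what this commit enables: calling forward_intermediates() on a model with gradient checkpointing turned on. The model name and input shape are illustrative assumptions; any timm model that implements set_grad_checkpointing() and forward_intermediates() would do.

import torch
import timm

# 'vit_base_patch16_224' is just an illustrative choice of model.
model = timm.create_model('vit_base_patch16_224', pretrained=False)
model.train()
model.set_grad_checkpointing()  # raises on models that don't support checkpointing

x = torch.randn(2, 3, 224, 224, requires_grad=True)
final, intermediates = model.forward_intermediates(x)
# Before this commit, forward_intermediates() ran the blocks directly and ignored the
# grad_checkpointing flag; now block activations are recomputed during the backward pass.
final.sum().backward()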


53 files changed, +237 -128 lines

tests/test_models.py

Lines changed: 27 additions & 1 deletion
@@ -186,6 +186,18 @@ def test_model_forward(model_name, batch_size):
     assert outputs.shape[0] == batch_size
     assert not torch.isnan(outputs).any(), 'Output included NaNs'
 
+    # Test that grad-checkpointing, if supported, doesn't cause model failures or change in output
+    try:
+        model.set_grad_checkpointing()
+    except:
+        # throws if not supported, that's fine
+        pass
+    else:
+        outputs2 = model(inputs)
+        if isinstance(outputs, tuple):
+            outputs2 = torch.cat(outputs2)
+        assert torch.allclose(outputs, outputs2, rtol=1e-4, atol=1e-5), 'Output does not match'
+
 
 @pytest.mark.base
 @pytest.mark.timeout(timeout120)
@@ -529,6 +541,20 @@ def test_model_forward_intermediates(model_name, batch_size):
     output2 = model.forward_features(inpt)
     assert torch.allclose(output, output2)
 
+    # Test that grad-checkpointing, if supported
+    try:
+        model.set_grad_checkpointing()
+    except:
+        # throws if not supported, that's fine
+        pass
+    else:
+        output3, _ = model.forward_intermediates(
+            inpt,
+            output_fmt=output_fmt,
+        )
+        assert torch.allclose(output, output3, rtol=1e-4, atol=1e-5), 'Output does not match'
+
+
 
 def _create_fx_model(model, train=False):
     # This block of code does a bit of juggling to handle any case where there are multiple outputs in train mode
@@ -717,4 +743,4 @@ def test_model_forward_torchscript_with_features_fx(model_name, batch_size):
 
     for tensor in outputs:
         assert tensor.shape[0] == batch_size
-        assert not torch.isnan(tensor).any(), 'Output included NaNs'
+        assert not torch.isnan(tensor).any(), 'Output included NaNs'
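
The check the new tests perform, as a standalone sketch: outputs with and without grad checkpointing should agree to within floating-point tolerance. The model name is an illustrative assumption; any model whose set_grad_checkpointing() doesn't raise will do.

import torch
import timm

model = timm.create_model('resnet50', pretrained=False).eval()
inputs = torch.randn(2, 3, 224, 224, requires_grad=True)
outputs = model(inputs)

try:
    model.set_grad_checkpointing()
except Exception:
    print('grad checkpointing not supported by this model')
else:
    outputs2 = model(inputs)
    assert torch.allclose(outputs, outputs2, rtol=1e-4, atol=1e-5), 'Output does not match'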

timm/models/beit.py

Lines changed: 4 additions & 1 deletion
@@ -615,7 +615,10 @@ def forward_intermediates(
         else:
             blocks = self.blocks[:max_index + 1]
         for i, blk in enumerate(blocks):
-            x = blk(x, shared_rel_pos_bias=rel_pos_bias)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias)
+            else:
+                x = blk(x, shared_rel_pos_bias=rel_pos_bias)
             if i in take_indices:
                 # normalize intermediates with final norm layer if enabled
                 intermediates.append(self.norm(x) if norm else x)
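
The change above is the pattern repeated in the model files below: wrap the per-block call in a checkpoint helper when grad checkpointing is enabled and the module is not being scripted. A self-contained sketch of that pattern, assuming a recent PyTorch; ToyModel and its fields are illustrative, not timm code, and torch.utils.checkpoint is used here in place of timm's own helper.

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class ToyModel(nn.Module):
    def __init__(self, depth: int = 4, dim: int = 32):
        super().__init__()
        self.blocks = nn.ModuleList(nn.Linear(dim, dim) for _ in range(depth))
        self.grad_checkpointing = False

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable: bool = True):
        self.grad_checkpointing = enable

    def forward_intermediates(self, x, take_indices=(1, 3)):
        intermediates = []
        for i, blk in enumerate(self.blocks):
            if self.grad_checkpointing and not torch.jit.is_scripting():
                # recompute this block's activations during backward instead of storing them
                x = checkpoint(blk, x, use_reentrant=False)
            else:
                x = blk(x)
            if i in take_indices:
                intermediates.append(x)
        return x, intermediates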

timm/models/byobnet.py

Lines changed: 4 additions & 1 deletion
@@ -1508,7 +1508,10 @@ def forward_intermediates(
             stages = self.stages[:max_index]
         for stage in stages:
             feat_idx += 1
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint_seq(stage, x)
+            else:
+                x = stage(x)
             if not exclude_final_conv and feat_idx == last_idx:
                 # default feature_info for this model uses final_conv as the last feature output (if present)
                 x = self.final_conv(x)
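
Stage-based models (byobnet above, efficientnet and efficientvit below) wrap a whole stage with checkpoint_seq rather than a single block with checkpoint. A rough sketch of that call, assuming timm's checkpoint_seq accepts an nn.Sequential as it does in this diff; the toy stage is illustrative.

import torch
import torch.nn as nn
from timm.models._manipulate import checkpoint_seq

stage = nn.Sequential(
    nn.Conv2d(16, 16, 3, padding=1),
    nn.ReLU(),
    nn.Conv2d(16, 16, 3, padding=1),
)
x = torch.randn(2, 16, 32, 32, requires_grad=True)

y = checkpoint_seq(stage, x)  # same result as stage(x), with activations recomputed in backward
y.sum().backward()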

timm/models/cait.py

Lines changed: 5 additions & 2 deletions
@@ -18,7 +18,7 @@
 from timm.layers import PatchEmbed, Mlp, DropPath, trunc_normal_, use_fused_attn
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
-from ._manipulate import checkpoint_seq
+from ._manipulate import checkpoint, checkpoint_seq
 from ._registry import register_model, generate_default_cfgs
 
 __all__ = ['Cait', 'ClassAttn', 'LayerScaleBlockClassAttn', 'LayerScaleBlock', 'TalkingHeadAttn']
@@ -373,7 +373,10 @@ def forward_intermediates(
         else:
             blocks = self.blocks[:max_index + 1]
         for i, blk in enumerate(blocks):
-            x = blk(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(blk, x)
+            else:
+                x = blk(x)
             if i in take_indices:
                 # normalize intermediates with final norm layer if enabled
                 intermediates.append(self.norm(x) if norm else x)

timm/models/crossvit.py

Lines changed: 1 addition & 6 deletions
@@ -14,21 +14,16 @@
 NOTE: model names have been renamed from originals to represent actual input res all *_224 -> *_240 and *_384 -> *_408
 
 Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
+Modified from Timm. https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
 """
 
 # Copyright IBM All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-
-"""
-Modified from Timm. https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
-
-"""
 from functools import partial
 from typing import List, Optional, Tuple
 
 import torch
-import torch.hub
 import torch.nn as nn
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

timm/models/davit.py

Lines changed: 5 additions & 2 deletions
@@ -25,7 +25,7 @@
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
 from ._features_fx import register_notrace_function
-from ._manipulate import checkpoint_seq
+from ._manipulate import checkpoint, checkpoint_seq
 from ._registry import generate_default_cfgs, register_model
 
 __all__ = ['DaVit']
@@ -671,7 +671,10 @@ def forward_intermediates(
             stages = self.stages[:max_index + 1]
 
         for feat_idx, stage in enumerate(stages):
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(stage, x)
+            else:
+                x = stage(x)
             if feat_idx in take_indices:
                 if norm and feat_idx == last_idx:
                     x_inter = self.norm_pre(x)  # applying final norm to last intermediate

timm/models/dla.py

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from timm.layers import create_classifier

timm/models/efficientnet.py

Lines changed: 5 additions & 3 deletions
@@ -259,9 +259,11 @@ def forward_intermediates(
             blocks = self.blocks
         else:
             blocks = self.blocks[:max_index]
-        for blk in blocks:
-            feat_idx += 1
-            x = blk(x)
+        for feat_idx, blk in enumerate(blocks, start=1):
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint_seq(blk, x)
+            else:
+                x = blk(x)
             if feat_idx in take_indices:
                 intermediates.append(x)

timm/models/efficientvit_mit.py

Lines changed: 8 additions & 2 deletions
@@ -789,7 +789,10 @@ def forward_intermediates(
             stages = self.stages[:max_index + 1]
 
         for feat_idx, stage in enumerate(stages):
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint_seq(stages, x)
+            else:
+                x = stage(x)
             if feat_idx in take_indices:
                 intermediates.append(x)
 
@@ -943,7 +946,10 @@ def forward_intermediates(
             stages = self.stages[:max_index + 1]
 
         for feat_idx, stage in enumerate(stages):
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint_seq(stages, x)
+            else:
+                x = stage(x)
             if feat_idx in take_indices:
                 intermediates.append(x)

timm/models/efficientvit_msra.py

Lines changed: 5 additions & 2 deletions
@@ -18,7 +18,7 @@
 from timm.layers import SqueezeExcite, SelectAdaptivePool2d, trunc_normal_, _assert
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
-from ._manipulate import checkpoint_seq
+from ._manipulate import checkpoint, checkpoint_seq
 from ._registry import register_model, generate_default_cfgs
 
 
@@ -510,7 +510,10 @@ def forward_intermediates(
             stages = self.stages[:max_index + 1]
 
         for feat_idx, stage in enumerate(stages):
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(stage, x)
+            else:
+                x = stage(x)
             if feat_idx in take_indices:
                 intermediates.append(x)
