
Commit 5247eb3

Merge pull request #233 from rwightman/torchamp
Native Torch AMP and channels_last support for train.py and validate.py
2 parents 6d158ad + 751b0bb commit 5247eb3

16 files changed: 316 additions, 226 deletions
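
The train.py/validate.py changes themselves are not shown in this excerpt. For context, native AMP plus channels_last training in PyTorch 1.6+ follows the pattern sketched below; this is a minimal illustration using stock torch.cuda.amp APIs, not a copy of the actual train.py code, and model/loader/optimizer/loss_fn are placeholder names.

import torch

# Minimal sketch of a native-AMP + channels_last training step (PyTorch >= 1.6).
# `model`, `loader`, `optimizer`, `loss_fn` are placeholders, not code from this PR.
def train_one_epoch(model, loader, optimizer, loss_fn, device='cuda'):
    model = model.to(device).to(memory_format=torch.channels_last)  # NHWC memory layout
    scaler = torch.cuda.amp.GradScaler()
    for x, y in loader:
        x = x.to(device, memory_format=torch.channels_last)
        y = y.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():      # run forward pass and loss in mixed precision
            loss = loss_fn(model(x), y)
        scaler.scale(loss).backward()        # scale loss to avoid fp16 gradient underflow
        scaler.step(optimizer)               # unscales gradients, then steps the optimizer
        scaler.update()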

tests/test_models.py

Lines changed: 6 additions & 0 deletions
@@ -120,6 +120,12 @@ def test_model_load_pretrained(model_name, batch_size):
     in_chans = 3 if 'pruned' in model_name else 1  # pruning not currently supported with in_chans change
     create_model(model_name, pretrained=True, in_chans=in_chans)
 
+@pytest.mark.timeout(120)
+@pytest.mark.parametrize('model_name', list_models(pretrained=True))
+@pytest.mark.parametrize('batch_size', [1])
+def test_model_features_pretrained(model_name, batch_size):
+    """Create that pretrained weights load when features_only==True."""
+    create_model(model_name, pretrained=True, features_only=True)
 
 
 EXCLUDE_JIT_FILTERS = [
     '*iabn*', 'tresnet*',  # models using inplace abn unlikely to ever be scriptable
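
The new test only asserts that pretrained weights load when features_only==True. A quick interactive check of the same path might look like the sketch below (the model name is an arbitrary example, not taken from this commit):

import torch
from timm import create_model

# Build a feature-extraction variant and run a dummy batch through it.
m = create_model('resnet18', pretrained=True, features_only=True)
feats = m(torch.randn(1, 3, 224, 224))
print([f.shape for f in feats])  # one feature map per extraction stage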

timm/models/efficientnet_blocks.py

Lines changed: 3 additions & 5 deletions
@@ -106,20 +106,18 @@ class SqueezeExcite(nn.Module):
     def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None,
                  act_layer=nn.ReLU, gate_fn=sigmoid, divisor=1, **_):
         super(SqueezeExcite, self).__init__()
-        self.gate_fn = gate_fn
         reduced_chs = make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor)
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
         self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
         self.act1 = act_layer(inplace=True)
         self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
+        self.gate_fn = gate_fn
 
     def forward(self, x):
-        x_se = self.avg_pool(x)
+        x_se = x.mean((2, 3), keepdim=True)
         x_se = self.conv_reduce(x_se)
         x_se = self.act1(x_se)
         x_se = self.conv_expand(x_se)
-        x = x * self.gate_fn(x_se)
-        return x
+        return x * self.gate_fn(x_se)
 
 
 class ConvBnAct(nn.Module):
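
The recurring change in this commit (here and in cbam.py, eca.py, se.py and selective_kernel.py below) replaces a stored nn.AdaptiveAvgPool2d(1) module with a plain mean over the spatial dims. For a global pool down to 1x1 the two are numerically equivalent; a quick standalone check, independent of timm:

import torch
import torch.nn as nn

x = torch.randn(2, 8, 7, 7)
pooled_module = nn.AdaptiveAvgPool2d(1)(x)    # (2, 8, 1, 1) via the module
pooled_mean = x.mean((2, 3), keepdim=True)    # same shape and values via a reduction
assert torch.allclose(pooled_module, pooled_mean, atol=1e-6)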

timm/models/factory.py

Lines changed: 4 additions & 7 deletions
@@ -39,20 +39,17 @@ def create_model(
         kwargs.pop('bn_momentum', None)
         kwargs.pop('bn_eps', None)
 
-    # Parameters that aren't supported by all models should default to None in command line args,
-    # remove them if they are present and not set so that non-supporting models don't break.
-    if kwargs.get('drop_block_rate', None) is None:
-        kwargs.pop('drop_block_rate', None)
-
     # handle backwards compat with drop_connect -> drop_path change
     drop_connect_rate = kwargs.pop('drop_connect_rate', None)
     if drop_connect_rate is not None and kwargs.get('drop_path_rate', None) is None:
         print("WARNING: 'drop_connect' as an argument is deprecated, please use 'drop_path'."
               " Setting drop_path to %f." % drop_connect_rate)
         kwargs['drop_path_rate'] = drop_connect_rate
 
-    if kwargs.get('drop_path_rate', None) is None:
-        kwargs.pop('drop_path_rate', None)
+    # Parameters that aren't supported by all models or are intended to only override model defaults if set
+    # should default to None in command line args/cfg. Remove them if they are present and not set so that
+    # non-supporting models don't break and default args remain in effect.
+    kwargs = {k: v for k, v in kwargs.items() if v is not None}
 
     with set_layer_config(scriptable=scriptable, exportable=exportable, no_jit=no_jit):
         if is_model(model_name):
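
The new dict comprehension drops every kwarg whose value is None, so CLI options that default to None never reach the model constructor and the model's own defaults stay in effect. A small illustration (the argument names are examples only):

kwargs = {'drop_rate': 0.2, 'drop_path_rate': None, 'drop_block_rate': None}
kwargs = {k: v for k, v in kwargs.items() if v is not None}
print(kwargs)  # {'drop_rate': 0.2}, unset options are simply omitted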

timm/models/helpers.py

Lines changed: 20 additions & 9 deletions
@@ -48,30 +48,41 @@ def load_checkpoint(model, checkpoint_path, use_ema=False, strict=True):
     model.load_state_dict(state_dict, strict=strict)
 
 
-def resume_checkpoint(model, checkpoint_path):
-    other_state = {}
+def resume_checkpoint(model, checkpoint_path, optimizer=None, loss_scaler=None, log_info=True):
     resume_epoch = None
     if os.path.isfile(checkpoint_path):
         checkpoint = torch.load(checkpoint_path, map_location='cpu')
         if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
+            if log_info:
+                _logger.info('Restoring model state from checkpoint...')
             new_state_dict = OrderedDict()
             for k, v in checkpoint['state_dict'].items():
                 name = k[7:] if k.startswith('module') else k
                 new_state_dict[name] = v
             model.load_state_dict(new_state_dict)
-            if 'optimizer' in checkpoint:
-                other_state['optimizer'] = checkpoint['optimizer']
-            if 'amp' in checkpoint:
-                other_state['amp'] = checkpoint['amp']
+
+            if optimizer is not None and 'optimizer' in checkpoint:
+                if log_info:
+                    _logger.info('Restoring optimizer state from checkpoint...')
+                optimizer.load_state_dict(checkpoint['optimizer'])
+
+            if loss_scaler is not None and loss_scaler.state_dict_key in checkpoint:
+                if log_info:
+                    _logger.info('Restoring AMP loss scaler state from checkpoint...')
+                loss_scaler.load_state_dict(checkpoint[loss_scaler.state_dict_key])
+
             if 'epoch' in checkpoint:
                 resume_epoch = checkpoint['epoch']
                 if 'version' in checkpoint and checkpoint['version'] > 1:
                     resume_epoch += 1  # start at the next epoch, old checkpoints incremented before save
-            _logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, checkpoint['epoch']))
+
+            if log_info:
+                _logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, checkpoint['epoch']))
         else:
             model.load_state_dict(checkpoint)
-            _logger.info("Loaded checkpoint '{}'".format(checkpoint_path))
-        return other_state, resume_epoch
+            if log_info:
+                _logger.info("Loaded checkpoint '{}'".format(checkpoint_path))
+        return resume_epoch
     else:
         _logger.error("No checkpoint found at '{}'".format(checkpoint_path))
         raise FileNotFoundError()
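
With the new signature, callers hand the optimizer and an optional AMP loss-scaler object to resume_checkpoint and get back only the resume epoch; optimizer and scaler state are restored in place. A hedged usage sketch, assuming a scaler object exposing state_dict_key/load_state_dict (such as the native-AMP scaler wrapper this PR adds) and pre-existing model/optimizer objects; the checkpoint path is an example:

from timm.models import resume_checkpoint

# `model`, `optimizer`, `loss_scaler` are assumed to exist; loss_scaler may be
# None when AMP is not in use, in which case only model/optimizer state is restored.
resume_epoch = resume_checkpoint(
    model, './output/train/checkpoint.pth.tar',
    optimizer=optimizer, loss_scaler=loss_scaler, log_info=True)
start_epoch = resume_epoch if resume_epoch is not None else 0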

timm/models/hrnet.py

Lines changed: 3 additions & 1 deletion
@@ -773,12 +773,14 @@ def forward(self, x) -> List[torch.tensor]:
 
 def _create_hrnet(variant, pretrained, **model_kwargs):
     model_cls = HighResolutionNet
+    strict = True
     if model_kwargs.pop('features_only', False):
         model_cls = HighResolutionNetFeatures
+        strict = False
 
     return build_model_with_cfg(
         model_cls, variant, pretrained, default_cfg=default_cfgs[variant],
-        model_cfg=cfg_cls[variant], **model_kwargs)
+        model_cfg=cfg_cls[variant], pretrained_strict=strict, **model_kwargs)
 
 
 @register_model

timm/models/layers/adaptive_avgmax_pool.py

Lines changed: 14 additions & 1 deletion
@@ -49,6 +49,15 @@ def select_adaptive_pool2d(x, pool_type='avg', output_size=1):
     return x
 
 
+class FastAdaptiveAvgPool2d(nn.Module):
+    def __init__(self, flatten=False):
+        super(FastAdaptiveAvgPool2d, self).__init__()
+        self.flatten = flatten
+
+    def forward(self, x):
+        return x.mean((2, 3)) if self.flatten else x.mean((2, 3), keepdim=True)
+
+
 class AdaptiveAvgMaxPool2d(nn.Module):
     def __init__(self, output_size=1):
         super(AdaptiveAvgMaxPool2d, self).__init__()
@@ -70,12 +79,16 @@ def forward(self, x):
 class SelectAdaptivePool2d(nn.Module):
     """Selectable global pooling layer with dynamic input kernel size
     """
-    def __init__(self, output_size=1, pool_type='avg', flatten=False):
+    def __init__(self, output_size=1, pool_type='fast', flatten=False):
         super(SelectAdaptivePool2d, self).__init__()
         self.pool_type = pool_type or ''  # convert other falsy values to empty string for consistent TS typing
         self.flatten = flatten
         if pool_type == '':
             self.pool = nn.Identity()  # pass through
+        elif pool_type == 'fast':
+            assert output_size == 1
+            self.pool = FastAdaptiveAvgPool2d(self.flatten)
+            self.flatten = False
         elif pool_type == 'avg':
             self.pool = nn.AdaptiveAvgPool2d(output_size)
         elif pool_type == 'avgmax':
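
The new 'fast' pool type (now the constructor default) selects the mean-based pooling by name; it only supports output_size == 1 and folds any requested flatten into the pooling op itself. A short usage sketch, assuming SelectAdaptivePool2d is exported from timm.models.layers as in the rest of the library:

import torch
from timm.models.layers import SelectAdaptivePool2d

pool = SelectAdaptivePool2d(pool_type='fast', flatten=True)
x = torch.randn(2, 512, 7, 7)
print(pool(x).shape)  # torch.Size([2, 512]), pooled and flattened in one reduction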

timm/models/layers/cbam.py

Lines changed: 4 additions & 5 deletions
@@ -10,6 +10,7 @@
 
 import torch
 from torch import nn as nn
+import torch.nn.functional as F
 from .conv_bn_act import ConvBnAct
 
 
@@ -18,15 +19,13 @@ class ChannelAttn(nn.Module):
     """
     def __init__(self, channels, reduction=16, act_layer=nn.ReLU):
         super(ChannelAttn, self).__init__()
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        self.max_pool = nn.AdaptiveMaxPool2d(1)
         self.fc1 = nn.Conv2d(channels, channels // reduction, 1, bias=False)
         self.act = act_layer(inplace=True)
         self.fc2 = nn.Conv2d(channels // reduction, channels, 1, bias=False)
 
     def forward(self, x):
-        x_avg = self.avg_pool(x)
-        x_max = self.max_pool(x)
+        x_avg = x.mean((2, 3), keepdim=True)
+        x_max = F.adaptive_max_pool2d(x, 1)
         x_avg = self.fc2(self.act(self.fc1(x_avg)))
         x_max = self.fc2(self.act(self.fc1(x_max)))
         x_attn = x_avg + x_max
@@ -40,7 +39,7 @@ def __init__(self, channels, reduction=16):
         super(LightChannelAttn, self).__init__(channels, reduction)
 
     def forward(self, x):
-        x_pool = 0.5 * self.avg_pool(x) + 0.5 * self.max_pool(x)
+        x_pool = 0.5 * x.mean((2, 3), keepdim=True) + 0.5 * F.adaptive_max_pool2d(x, 1)
         x_attn = self.fc2(self.act(self.fc1(x_pool)))
         return x * x_attn.sigmoid()
 

timm/models/layers/eca.py

Lines changed: 6 additions & 23 deletions
@@ -52,22 +52,15 @@ class EcaModule(nn.Module):
     def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1):
         super(EcaModule, self).__init__()
         assert kernel_size % 2 == 1
-
         if channels is not None:
             t = int(abs(math.log(channels, 2) + beta) / gamma)
             kernel_size = max(t if t % 2 else t + 1, 3)
 
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
         self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
 
     def forward(self, x):
-        # Feature descriptor on the global spatial information
-        y = self.avg_pool(x)
-        # Reshape for convolution
-        y = y.view(x.shape[0], 1, -1)
-        # Two different branches of ECA module
+        y = x.mean((2, 3)).view(x.shape[0], 1, -1)  # view for 1d conv
         y = self.conv(y)
-        # Multi-scale information fusion
         y = y.view(x.shape[0], -1, 1, 1).sigmoid()
         return x * y.expand_as(x)
 
@@ -95,30 +88,20 @@ class CecaModule(nn.Module):
     def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1):
         super(CecaModule, self).__init__()
         assert kernel_size % 2 == 1
-
         if channels is not None:
             t = int(abs(math.log(channels, 2) + beta) / gamma)
             kernel_size = max(t if t % 2 else t + 1, 3)
 
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        #pytorch circular padding mode is buggy as of pytorch 1.4
-        #see https://github.com/pytorch/pytorch/pull/17240
-
-        #implement manual circular padding
+        # PyTorch circular padding mode is buggy as of pytorch 1.4
+        # see https://github.com/pytorch/pytorch/pull/17240
+        # implement manual circular padding
         self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=False)
         self.padding = (kernel_size - 1) // 2
 
     def forward(self, x):
-        # Feature descriptor on the global spatial information
-        y = self.avg_pool(x)
-
+        y = x.mean((2, 3)).view(x.shape[0], 1, -1)
         # Manually implement circular padding, F.pad does not seemed to be bugged
-        y = F.pad(y.view(x.shape[0], 1, -1), (self.padding, self.padding), mode='circular')
-
-        # Two different branches of ECA module
+        y = F.pad(y, (self.padding, self.padding), mode='circular')
         y = self.conv(y)
-
-        # Multi-scale information fusion
         y = y.view(x.shape[0], -1, 1, 1).sigmoid()
-
         return x * y.expand_as(x)
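
CecaModule keeps its manual circular padding: the pooled channel descriptor is reshaped to (B, 1, C), wrapped with F.pad(mode='circular'), then passed through the 1d conv. The standalone snippet below mirrors that reshape-and-pad step with arbitrary shapes and a padding of 1:

import torch
import torch.nn.functional as F

x = torch.randn(2, 16, 7, 7)                  # (B, C, H, W)
y = x.mean((2, 3)).view(x.shape[0], 1, -1)    # (B, 1, C) channel descriptor for the 1d conv
y = F.pad(y, (1, 1), mode='circular')         # wrap-around padding across the channel dim
print(y.shape)                                # torch.Size([2, 1, 18])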

timm/models/layers/se.py

Lines changed: 11 additions & 15 deletions
@@ -1,40 +1,36 @@
 from torch import nn as nn
-from .create_act import get_act_fn
+from .create_act import create_act_layer
 
 
 class SEModule(nn.Module):
 
     def __init__(self, channels, reduction=16, act_layer=nn.ReLU, min_channels=8, reduction_channels=None,
-                 gate_fn='sigmoid'):
+                 gate_layer='sigmoid'):
         super(SEModule, self).__init__()
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
         reduction_channels = reduction_channels or max(channels // reduction, min_channels)
-        self.fc1 = nn.Conv2d(
-            channels, reduction_channels, kernel_size=1, padding=0, bias=True)
+        self.fc1 = nn.Conv2d(channels, reduction_channels, kernel_size=1, bias=True)
         self.act = act_layer(inplace=True)
-        self.fc2 = nn.Conv2d(
-            reduction_channels, channels, kernel_size=1, padding=0, bias=True)
-        self.gate_fn = get_act_fn(gate_fn)
+        self.fc2 = nn.Conv2d(reduction_channels, channels, kernel_size=1, bias=True)
+        self.gate = create_act_layer(gate_layer)
 
     def forward(self, x):
-        x_se = self.avg_pool(x)
+        x_se = x.mean((2, 3), keepdim=True)
         x_se = self.fc1(x_se)
         x_se = self.act(x_se)
         x_se = self.fc2(x_se)
-        return x * self.gate_fn(x_se)
+        return x * self.gate(x_se)
 
 
 class EffectiveSEModule(nn.Module):
     """ 'Effective Squeeze-Excitation
     From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
     """
-    def __init__(self, channels, gate_fn='hard_sigmoid'):
+    def __init__(self, channels, gate_layer='hard_sigmoid'):
         super(EffectiveSEModule, self).__init__()
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
         self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0)
-        self.gate_fn = get_act_fn(gate_fn)
+        self.gate = create_act_layer(gate_layer, inplace=True)
 
     def forward(self, x):
-        x_se = self.avg_pool(x)
+        x_se = x.mean((2, 3), keepdim=True)
         x_se = self.fc(x_se)
-        return x * self.gate_fn(x_se, inplace=True)
+        return x * self.gate(x_se)
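
Switching from get_act_fn to create_act_layer makes the gate a named submodule rather than a bare function, so it appears in the module tree, and the gate_fn argument becomes gate_layer accordingly. A small usage sketch; the import path assumes SEModule is exported from timm.models.layers, which may vary by timm version:

import torch
from timm.models.layers import SEModule

se = SEModule(64, gate_layer='hard_sigmoid')  # gate resolved by name via create_act_layer
x = torch.randn(2, 64, 14, 14)
print(se(x).shape)  # torch.Size([2, 64, 14, 14]), input re-weighted channel-wise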

timm/models/layers/selective_kernel.py

Lines changed: 1 addition & 3 deletions
@@ -27,16 +27,14 @@ def __init__(self, channels, num_paths=2, attn_channels=32,
         """
         super(SelectiveKernelAttn, self).__init__()
         self.num_paths = num_paths
-        self.pool = nn.AdaptiveAvgPool2d(1)
         self.fc_reduce = nn.Conv2d(channels, attn_channels, kernel_size=1, bias=False)
         self.bn = norm_layer(attn_channels)
         self.act = act_layer(inplace=True)
         self.fc_select = nn.Conv2d(attn_channels, channels * num_paths, kernel_size=1, bias=False)
 
     def forward(self, x):
         assert x.shape[1] == self.num_paths
-        x = torch.sum(x, dim=1)
-        x = self.pool(x)
+        x = x.sum(1).mean((2, 3), keepdim=True)
         x = self.fc_reduce(x)
         x = self.bn(x)
         x = self.act(x)
