
Commit a99ec4e (parent 13746a3)

A bunch more layer reorg, splitting many layers into their own files. Improve TorchScript compatibility.

16 files changed: 479 additions, 396 deletions

timm/models/efficientnet.py
Lines changed: 2 additions & 1 deletion

@@ -27,7 +27,8 @@
 from .feature_hooks import FeatureHooks
 from .registry import register_model
 from .helpers import load_pretrained
-from .layers import SelectAdaptivePool2d, select_conv2d
+from .layers import SelectAdaptivePool2d
+from timm.models.layers import select_conv2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD


timm/models/efficientnet_blocks.py
Lines changed: 19 additions & 16 deletions

@@ -1,11 +1,8 @@
-
-from functools import partial
-
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
+from torch.nn import functional as F
 from .layers.activations import sigmoid
-from .layers.conv2d_layers import *
+from .layers import select_conv2d


 # Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per
@@ -72,7 +69,7 @@ def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None):
     return make_divisible(channels, divisor, channel_min)


-def drop_connect(inputs, training=False, drop_connect_rate=0.):
+def drop_connect(inputs, training: bool = False, drop_connect_rate: float = 0.):
     """Apply drop connect."""
     if not training:
         return inputs
@@ -160,7 +157,7 @@ def __init__(self, in_chs, out_chs, dw_kernel_size=3,
                  norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.):
         super(DepthwiseSeparableConv, self).__init__()
         norm_kwargs = norm_kwargs or {}
-        self.has_se = se_ratio is not None and se_ratio > 0.
+        has_se = se_ratio is not None and se_ratio > 0.
         self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip
         self.has_pw_act = pw_act  # activation after point-wise conv
         self.drop_connect_rate = drop_connect_rate
@@ -171,9 +168,11 @@ def __init__(self, in_chs, out_chs, dw_kernel_size=3,
         self.act1 = act_layer(inplace=True)

         # Squeeze-and-excitation
-        if self.has_se:
+        if has_se:
             se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
             self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs)
+        else:
+            self.se = None

         self.conv_pw = select_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
         self.bn2 = norm_layer(out_chs, **norm_kwargs)
@@ -193,7 +192,7 @@ def forward(self, x):
         x = self.bn1(x)
         x = self.act1(x)

-        if self.has_se:
+        if self.se is not None:
             x = self.se(x)

         x = self.conv_pw(x)
@@ -219,7 +218,7 @@ def __init__(self, in_chs, out_chs, dw_kernel_size=3,
         norm_kwargs = norm_kwargs or {}
         conv_kwargs = conv_kwargs or {}
         mid_chs = make_divisible(in_chs * exp_ratio)
-        self.has_se = se_ratio is not None and se_ratio > 0.
+        has_se = se_ratio is not None and se_ratio > 0.
         self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
         self.drop_connect_rate = drop_connect_rate

@@ -236,9 +235,11 @@ def __init__(self, in_chs, out_chs, dw_kernel_size=3,
         self.act2 = act_layer(inplace=True)

         # Squeeze-and-excitation
-        if self.has_se:
+        if has_se:
             se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
             self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+        else:
+            self.se = None

         # Point-wise linear projection
         self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs)
@@ -269,7 +270,7 @@ def forward(self, x):
         x = self.act2(x)

         # Squeeze-and-excitation
-        if self.has_se:
+        if self.se is not None:
             x = self.se(x)

         # Point-wise linear projection
@@ -323,7 +324,7 @@ def forward(self, x):
         x = self.act2(x)

         # Squeeze-and-excitation
-        if self.has_se:
+        if self.se is not None:
             x = self.se(x)

         # Point-wise linear projection
@@ -350,7 +351,7 @@ def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs
             mid_chs = make_divisible(fake_in_chs * exp_ratio)
         else:
             mid_chs = make_divisible(in_chs * exp_ratio)
-        self.has_se = se_ratio is not None and se_ratio > 0.
+        has_se = se_ratio is not None and se_ratio > 0.
         self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
         self.drop_connect_rate = drop_connect_rate

@@ -360,9 +361,11 @@ def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs
         self.act1 = act_layer(inplace=True)

         # Squeeze-and-excitation
-        if self.has_se:
+        if has_se:
             se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
             self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+        else:
+            self.se = None

         # Point-wise linear projection
         self.conv_pwl = select_conv2d(
@@ -389,7 +392,7 @@ def forward(self, x):
         x = self.act1(x)

         # Squeeze-and-excitation
-        if self.has_se:
+        if self.se is not None:
             x = self.se(x)

         # Point-wise linear projection

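Note on the `self.se = None` pattern above: torch.jit.script compiles every branch of forward(), so guarding `x = self.se(x)` with a bare `self.has_se` flag breaks scripting whenever the block was built without SE, because the `se` attribute never gets assigned at all. Assigning `self.se = None` in the else branch and testing `if self.se is not None:` keeps the attribute present in every configuration. A minimal sketch of the idea, using a toy module rather than the timm blocks themselves:

import torch
import torch.nn as nn


class ToySEBlock(nn.Module):
    """Illustrative only: mirrors the optional-submodule pattern used in the blocks above."""

    def __init__(self, chs: int, use_se: bool = True):
        super(ToySEBlock, self).__init__()
        self.conv = nn.Conv2d(chs, chs, 3, padding=1)
        # Always assign the attribute; None when disabled, so TorchScript can type it.
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d(1), nn.Conv2d(chs, chs, 1), nn.Sigmoid()) if use_se else None

    def forward(self, x):
        x = self.conv(x)
        if self.se is not None:  # scriptable; `if self.has_se:` over a possibly-missing attribute is not
            x = x * self.se(x)
        return x


scripted = torch.jit.script(ToySEBlock(8, use_se=False))  # scripts cleanly with or without SE
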
timm/models/efficientnet_builder.py
Lines changed: 2 additions & 1 deletion

@@ -5,7 +5,8 @@
 from copy import deepcopy

 import torch.nn as nn
-from .layers.activations import sigmoid, HardSwish, Swish
+from .layers import CondConv2d, get_condconv_initializer
+from .layers.activations import HardSwish, Swish
 from .efficientnet_blocks import *


timm/models/layers/__init__.py
Lines changed: 6 additions & 2 deletions

@@ -1,8 +1,12 @@
-from .conv2d_layers import select_conv2d, MixedConv2d, CondConv2d, ConvBnAct, SelectiveKernelConv
+from .conv_bn_act import ConvBnAct
+from .mixed_conv2d import MixedConv2d
+from .cond_conv2d import CondConv2d, get_condconv_initializer
+from .select_conv2d import select_conv2d
+from .selective_kernel import SelectiveKernelConv
 from .eca import EcaModule, CecaModule
 from .activations import *
 from .adaptive_avgmax_pool import \
     adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d
-from .nn_ops import DropBlock2d, DropPath
+from .drop import DropBlock2d, DropPath
 from .test_time_pool import TestTimePoolHead, apply_test_time_pool
 from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model

timm/models/layers/activations.py
Lines changed: 22 additions & 13 deletions

@@ -1,9 +1,18 @@
+""" Activations
+
+A collection of activations fn and modules with a common interface so that they can
+easily be swapped. All have an `inplace` arg even if not used.
+
+Hacked together by Ross Wightman
+"""
+
+
 import torch
 from torch import nn as nn
 from torch.nn import functional as F


-_USE_MEM_EFFICIENT_ISH = True
+_USE_MEM_EFFICIENT_ISH = False
 if _USE_MEM_EFFICIENT_ISH:
     # This version reduces memory overhead of Swish during training by
     # recomputing torch.sigmoid(x) in backward instead of saving it.
@@ -66,20 +75,20 @@ def mish(x, _inplace=False):
         return MishJitAutoFn.apply(x)

 else:
-    def swish(x, inplace=False):
+    def swish(x, inplace: bool = False):
         """Swish - Described in: https://arxiv.org/abs/1710.05941
         """
         return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())


-    def mish(x, _inplace=False):
+    def mish(x, _inplace: bool = False):
         """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
         """
         return x.mul(F.softplus(x).tanh())


 class Swish(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
         super(Swish, self).__init__()
         self.inplace = inplace

@@ -88,65 +97,65 @@ def forward(self, x):


 class Mish(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
         super(Mish, self).__init__()
         self.inplace = inplace

     def forward(self, x):
         return mish(x, self.inplace)


-def sigmoid(x, inplace=False):
+def sigmoid(x, inplace: bool = False):
     return x.sigmoid_() if inplace else x.sigmoid()


 # PyTorch has this, but not with a consistent inplace argmument interface
 class Sigmoid(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
         super(Sigmoid, self).__init__()
         self.inplace = inplace

     def forward(self, x):
         return x.sigmoid_() if self.inplace else x.sigmoid()


-def tanh(x, inplace=False):
+def tanh(x, inplace: bool = False):
     return x.tanh_() if inplace else x.tanh()


 # PyTorch has this, but not with a consistent inplace argmument interface
 class Tanh(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
         super(Tanh, self).__init__()
         self.inplace = inplace

     def forward(self, x):
         return x.tanh_() if self.inplace else x.tanh()


-def hard_swish(x, inplace=False):
+def hard_swish(x, inplace: bool = False):
     inner = F.relu6(x + 3.).div_(6.)
     return x.mul_(inner) if inplace else x.mul(inner)


 class HardSwish(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
         super(HardSwish, self).__init__()
         self.inplace = inplace

     def forward(self, x):
         return hard_swish(x, self.inplace)


-def hard_sigmoid(x, inplace=False):
+def hard_sigmoid(x, inplace: bool = False):
     if inplace:
         return x.add_(3.).clamp_(0., 6.).div_(6.)
     else:
         return F.relu6(x + 3.) / 6.


 class HardSigmoid(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
         super(HardSigmoid, self).__init__()
         self.inplace = inplace

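The added `inplace: bool = False` annotations are also part of the TorchScript work: torch.jit.script assumes unannotated function arguments are Tensors, so a boolean flag needs an explicit annotation before a function that branches on it can be scripted. A small illustration with a hypothetical function (not one from this file):

import torch
from torch.nn import functional as F


@torch.jit.script
def toy_hard_swish(x, inplace: bool = False):
    # Without the `bool` annotation, TorchScript would type `inplace` as Tensor
    # and reject this boolean branch at compile time.
    inner = F.relu6(x + 3.) / 6.
    return x.mul_(inner) if inplace else x.mul(inner)


y = toy_hard_swish(torch.randn(2, 8))
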
timm/models/layers/cond_conv2d.py
Lines changed: 118 additions & 0 deletions

@@ -0,0 +1,118 @@
+""" Conditional Convolution
+
+Hacked together by Ross Wightman
+"""
+
+import math
+from functools import partial
+import numpy as np
+import torch
+from torch import nn as nn
+from torch.nn import functional as F
+
+from .conv2d_same import get_padding_value, conv2d_same
+from .conv_helpers import tup_pair
+
+
+def get_condconv_initializer(initializer, num_experts, expert_shape):
+    def condconv_initializer(weight):
+        """CondConv initializer function."""
+        num_params = np.prod(expert_shape)
+        if (len(weight.shape) != 2 or weight.shape[0] != num_experts or
+                weight.shape[1] != num_params):
+            raise (ValueError(
+                'CondConv variables must have shape [num_experts, num_params]'))
+        for i in range(num_experts):
+            initializer(weight[i].view(expert_shape))
+    return condconv_initializer
+
+
+class CondConv2d(nn.Module):
+    """ Conditional Convolution
+    Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
+
+    Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
+    https://github.com/pytorch/pytorch/issues/17983
+    """
+    __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding']
+
+    def __init__(self, in_channels, out_channels, kernel_size=3,
+                 stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):
+        super(CondConv2d, self).__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = tup_pair(kernel_size)
+        self.stride = tup_pair(stride)
+        padding_val, is_padding_dynamic = get_padding_value(
+            padding, kernel_size, stride=stride, dilation=dilation)
+        self.dynamic_padding = is_padding_dynamic  # if in forward to work with torchscript
+        self.padding = tup_pair(padding_val)
+        self.dilation = tup_pair(dilation)
+        self.groups = groups
+        self.num_experts = num_experts
+
+        self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size
+        weight_num_param = 1
+        for wd in self.weight_shape:
+            weight_num_param *= wd
+        self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))
+
+        if bias:
+            self.bias_shape = (self.out_channels,)
+            self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels))
+        else:
+            self.register_parameter('bias', None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        init_weight = get_condconv_initializer(
+            partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
+        init_weight(self.weight)
+        if self.bias is not None:
+            fan_in = np.prod(self.weight_shape[1:])
+            bound = 1 / math.sqrt(fan_in)
+            init_bias = get_condconv_initializer(
+                partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape)
+            init_bias(self.bias)
+
+    def forward(self, x, routing_weights):
+        B, C, H, W = x.shape
+        weight = torch.matmul(routing_weights, self.weight)
+        new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
+        weight = weight.view(new_weight_shape)
+        bias = None
+        if self.bias is not None:
+            bias = torch.matmul(routing_weights, self.bias)
+            bias = bias.view(B * self.out_channels)
+        # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
+        x = x.view(1, B * C, H, W)
+        if self.dynamic_padding:
+            out = conv2d_same(
+                x, weight, bias, stride=self.stride, padding=self.padding,
+                dilation=self.dilation, groups=self.groups * B)
+        else:
+            out = F.conv2d(
+                x, weight, bias, stride=self.stride, padding=self.padding,
+                dilation=self.dilation, groups=self.groups * B)
+        out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
+
+        # Literal port (from TF definition)
+        # x = torch.split(x, 1, 0)
+        # weight = torch.split(weight, 1, 0)
+        # if self.bias is not None:
+        #     bias = torch.matmul(routing_weights, self.bias)
+        #     bias = torch.split(bias, 1, 0)
+        # else:
+        #     bias = [None] * B
+        # out = []
+        # for xi, wi, bi in zip(x, weight, bias):
+        #     wi = wi.view(*self.weight_shape)
+        #     if bi is not None:
+        #         bi = bi.view(*self.bias_shape)
+        #     out.append(self.conv_fn(
+        #         xi, wi, bi, stride=self.stride, padding=self.padding,
+        #         dilation=self.dilation, groups=self.groups))
+        # out = torch.cat(out, 0)
+        return out

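For context, CondConv2d.forward takes a second argument, routing_weights of shape [batch, num_experts]; the matmul mixes the per-expert kernels into one kernel per sample, and the grouped-convolution trick then runs all samples in a single conv call. A usage sketch under that assumption; the pooling-plus-linear routing head here is a made-up stand-in, not how the efficientnet blocks wire it up:

import torch
import torch.nn as nn

from timm.models.layers import CondConv2d  # exported by the updated layers/__init__.py above

num_experts, in_chs, out_chs = 4, 16, 32
conv = CondConv2d(in_chs, out_chs, kernel_size=3, padding='same', num_experts=num_experts)
routing_fc = nn.Linear(in_chs, num_experts)  # hypothetical per-sample routing head

x = torch.randn(8, in_chs, 14, 14)
pooled = x.mean(dim=(2, 3))                           # [8, in_chs]
routing_weights = torch.sigmoid(routing_fc(pooled))   # [8, num_experts], one weight per expert
y = conv(x, routing_weights)                          # [8, out_chs, 14, 14]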