Add MixNet (https://arxiv.org/abs/1907.09595) with pretrained weights converted from Tensorflow impl

rwightman · rwightman · commit dfa9298b4eeb · 2019-07-25T11:42:01.000-07:00
* refactor 'same' convolution and add helper to use MixedConv2d when needed
* improve performance of 'same' padding for cases that can be handled statically
* add support for extra exp, pw, and dw kernel specs with grouping support to decoder/string defs for MixNet
* shuffle some args for a bit more consistency, a little less clutter overall in gen_efficientnet.py
diff --git a/README.md b/README.md
@@ -31,16 +31,17 @@ I've included a few of my favourite models, but this is not an exhaustive collec
 * PNasNet & NASNet-A (from [Cadene](https://github.com/Cadene/pretrained-models.pytorch))
 * DPN (from [me](https://github.com/rwightman/pytorch-dpn-pretrained), weights hosted by Cadene)
     * DPN-68, DPN-68b, DPN-92, DPN-98, DPN-131, DPN-107
-* Generic EfficientNet (from my standalone [GenMobileNet](https://github.com/rwightman/genmobilenet-pytorch)) - A generic model that implements many of the mobile optimized architecture search derived models that utilize similar DepthwiseSeparable and InvertedResidual blocks
+* Generic EfficientNet (from my standalone [GenMobileNet](https://github.com/rwightman/genmobilenet-pytorch)) - A generic model that implements many of the efficient models that utilize similar DepthwiseSeparable and InvertedResidual blocks
     * EfficientNet (B0-B5) (https://arxiv.org/abs/1905.11946) -- validated, compat with TF weights
+    * MixNet (https://arxiv.org/abs/1907.09595) -- validated, compat with TF weights
     * MNASNet B1, A1 (Squeeze-Excite), and Small (https://arxiv.org/abs/1807.11626)
     * MobileNet-V1 (https://arxiv.org/abs/1704.04861)
     * MobileNet-V2 (https://arxiv.org/abs/1801.04381)
     * MobileNet-V3 (https://arxiv.org/abs/1905.02244) -- pretrained model good, still no official impl to verify against
     * ChamNet (https://arxiv.org/abs/1812.08934) -- specific arch details hard to find, currently an educated guess
     * FBNet-C (https://arxiv.org/abs/1812.03443) -- TODO A/B variants
     * Single-Path NAS (https://arxiv.org/abs/1904.02877) -- pixel1 variant
-    
+
 Use the  `--model` arg to specify model for train, validation, inference scripts. Match the all lowercase
 creation fn for the model you'd like.
 
@@ -118,11 +119,17 @@ I've leveraged the training scripts in this repository to train a few of the mod
 | gluon_resnext50_32x4d    | 79.356 (20.644) | 94.424 (5.576) | 25.03  | bicubic | |
 | gluon_resnet101_v1b      | 79.304 (20.696) | 94.524 (5.476) | 44.55  | bicubic | |
 | gluon_resnet50_v1d       | 79.074 (20.926) | 94.476 (5.524) | 25.58  | bicubic | |
+| tf_mixnet_l *tfp         | 78.846 (21.154) | 94.212 (5.788) | 7.33  | bilinear | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
+| tf_mixnet_l              | 78.770 (21.230) | 94.004 (5.996) | 7.33  | bicubic | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
 | gluon_resnet50_v1s       | 78.712 (21.288) | 94.242 (5.758) | 25.68  | bicubic | |
 | gluon_resnet50_v1c       | 78.010 (21.990) | 93.988 (6.012) | 25.58  | bicubic | |
 | gluon_resnet50_v1b       | 77.578 (22.422) | 93.718 (6.282) | 25.56  | bicubic | |
+| tf_mixnet_m *tfp         | 77.072 (22.928) | 93.368 (6.632) | 5.01  | bilinear | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
+| tf_mixnet_m              | 76.950 (23.050) | 93.156 (6.844) | 5.01  | bicubic | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
 | tf_efficientnet_b0 *tfp  | 76.828 (23.172) | 93.226 (6.774) | 5.29  | bicubic | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
 | tf_efficientnet_b0       | 76.528 (23.472) | 93.010 (6.990) | 5.29  | bicubic | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) |
+| tf_mixnet_s *tfp         | 75.800 (24.200) | 92.788 (7.212) | 4.13  | bilinear | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
+| tf_mixnet_s              | 75.648 (24.352) | 92.636 (7.364) | 4.13  | bicubic | [Google](https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet) |
 | gluon_resnet34_v1b       | 74.580 (25.420) | 91.988 (8.012) | 21.80 | bicubic | |
 | gluon_resnet18_v1b       | 70.830 (29.170) | 89.756 (10.244) | 11.69 | bicubic | |
 
diff --git a/timm/data/loader.py b/timm/data/loader.py
@@ -112,7 +112,8 @@ def create_loader(
 
     if tf_preprocessing and use_prefetcher:
         from timm.data.tf_preprocessing import TfPreprocessTransform
-        transform = TfPreprocessTransform(is_training=is_training, size=img_size)
+        transform = TfPreprocessTransform(
+            is_training=is_training, size=img_size, interpolation=interpolation)
     else:
         if is_training:
             transform = transforms_imagenet_train(
diff --git a/timm/data/tf_preprocessing.py b/timm/data/tf_preprocessing.py
@@ -83,7 +83,7 @@ def _at_least_x_are_equal(a, b, x):
     return tf.greater_equal(tf.reduce_sum(match), x)
 
 
-def _decode_and_random_crop(image_bytes, image_size):
+def _decode_and_random_crop(image_bytes, image_size, resize_method):
     """Make a random crop of image_size."""
     bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
     image = distorted_bounding_box_crop(
@@ -100,13 +100,12 @@ def _decode_and_random_crop(image_bytes, image_size):
     image = tf.cond(
         bad,
-        lambda: _decode_and_center_crop(image_bytes, image_size),
-        lambda: tf.image.resize_bicubic([image],  # pylint: disable=g-long-lambda
-                                        [image_size, image_size])[0])
+        lambda: _decode_and_center_crop(image_bytes, image_size, resize_method),  # fallback needs resize_method too
+        lambda: tf.image.resize([image], [image_size, image_size], resize_method)[0])
 
     return image
 
 
-def _decode_and_center_crop(image_bytes, image_size):
+def _decode_and_center_crop(image_bytes, image_size, resize_method):
     """Crops to center of image with padding then scales image_size."""
     shape = tf.image.extract_jpeg_shape(image_bytes)
     image_height = shape[0]
@@ -122,7 +121,7 @@ def _decode_and_center_crop(image_bytes, image_size):
     crop_window = tf.stack([offset_height, offset_width,
                             padded_center_crop_size, padded_center_crop_size])
     image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
-    image = tf.image.resize_bicubic([image], [image_size, image_size])[0]
+    image = tf.image.resize([image], [image_size, image_size], resize_method)[0]
 
     return image
 
@@ -133,37 +132,41 @@ def _flip(image):
     return image
 
 
-def preprocess_for_train(image_bytes, use_bfloat16, image_size=IMAGE_SIZE):
+def preprocess_for_train(image_bytes, use_bfloat16, image_size=IMAGE_SIZE, interpolation='bicubic'):
-    """Preprocesses the given image for evaluation.
+    """Preprocesses the given image for training.
 
     Args:
       image_bytes: `Tensor` representing an image binary of arbitrary size.
       use_bfloat16: `bool` for whether to use bfloat16.
       image_size: image size.
+      interpolation: image interpolation method
 
     Returns:
       A preprocessed image `Tensor`.
     """
-    image = _decode_and_random_crop(image_bytes, image_size)
+    resize_method = tf.image.ResizeMethod.BICUBIC if interpolation == 'bicubic' else tf.image.ResizeMethod.BILINEAR
+    image = _decode_and_random_crop(image_bytes, image_size, resize_method)
     image = _flip(image)
     image = tf.reshape(image, [image_size, image_size, 3])
     image = tf.image.convert_image_dtype(
         image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32)
     return image
 
 
-def preprocess_for_eval(image_bytes, use_bfloat16, image_size=IMAGE_SIZE):
+def preprocess_for_eval(image_bytes, use_bfloat16, image_size=IMAGE_SIZE, interpolation='bicubic'):
     """Preprocesses the given image for evaluation.
 
     Args:
       image_bytes: `Tensor` representing an image binary of arbitrary size.
       use_bfloat16: `bool` for whether to use bfloat16.
       image_size: image size.
+      interpolation: image interpolation method
 
     Returns:
       A preprocessed image `Tensor`.
     """
-    image = _decode_and_center_crop(image_bytes, image_size)
+    resize_method = tf.image.ResizeMethod.BICUBIC if interpolation == 'bicubic' else tf.image.ResizeMethod.BILINEAR
+    image = _decode_and_center_crop(image_bytes, image_size, resize_method)
     image = tf.reshape(image, [image_size, image_size, 3])
     image = tf.image.convert_image_dtype(
         image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32)
@@ -173,29 +176,32 @@ def preprocess_for_eval(image_bytes, use_bfloat16, image_size=IMAGE_SIZE):
 def preprocess_image(image_bytes,
                      is_training=False,
                      use_bfloat16=False,
-                     image_size=IMAGE_SIZE):
+                     image_size=IMAGE_SIZE,
+                     interpolation='bicubic'):
     """Preprocesses the given image.
 
     Args:
       image_bytes: `Tensor` representing an image binary of arbitrary size.
       is_training: `bool` for whether the preprocessing is for training.
       use_bfloat16: `bool` for whether to use bfloat16.
       image_size: image size.
+      interpolation: image interpolation method
 
     Returns:
       A preprocessed image `Tensor` with value range of [0, 255].
     """
     if is_training:
-        return preprocess_for_train(image_bytes, use_bfloat16, image_size)
+        return preprocess_for_train(image_bytes, use_bfloat16, image_size, interpolation)
     else:
-        return preprocess_for_eval(image_bytes, use_bfloat16, image_size)
+        return preprocess_for_eval(image_bytes, use_bfloat16, image_size, interpolation)
 
 
 class TfPreprocessTransform:
 
-    def __init__(self, is_training=False, size=224):
+    def __init__(self, is_training=False, size=224, interpolation='bicubic'):
         self.is_training = is_training
         self.size = size[0] if isinstance(size, tuple) else size
+        self.interpolation = interpolation
         self._image_bytes = None
         self.process_image = self._build_tf_graph()
         self.sess = None
@@ -206,7 +212,8 @@ def _build_tf_graph(self):
                 shape=[],
                 dtype=tf.string,
             )
-            img = preprocess_image(self._image_bytes, self.is_training, False, self.size)
+            img = preprocess_image(
+                self._image_bytes, self.is_training, False, self.size, self.interpolation)
         return img
 
     def __call__(self, image_bytes):
diff --git a/timm/models/conv2d_helpers.py b/timm/models/conv2d_helpers.py
@@ -0,0 +1,120 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+
+def _is_static_pad(kernel_size, stride=1, dilation=1, **_):
+    """Tell whether 'same' padding can be fixed up front for this conv geometry.
+
+    True only for stride 1 with an even ``dilation * (kernel_size - 1)``, i.e. when
+    the total padding splits evenly between both sides. Extra keyword arguments
+    are accepted and ignored so a conv kwargs dict can be forwarded as-is.
+    """
+    if stride != 1:
+        return False
+    total_pad = dilation * (kernel_size - 1)
+    return total_pad % 2 == 0
+
+
+def _get_padding(kernel_size, stride=1, dilation=1, **_):
+    """Return the symmetric per-side padding for a scalar kernel size.
+
+    Computes ``((stride - 1) + dilation * (kernel_size - 1)) // 2``. Extra keyword
+    arguments are accepted and ignored so a conv kwargs dict can be forwarded as-is.
+    """
+    return ((stride - 1) + dilation * (kernel_size - 1)) // 2
+
+
+def _calc_same_pad(i, k, s, d):
+    """Total padding along one dimension for a 'SAME'-style convolution.
+
+    Args:
+      i: input size along the dimension.
+      k: kernel size along the dimension.
+      s: stride along the dimension.
+      d: dilation along the dimension.
+
+    Returns:
+      The non-negative amount of padding (both sides combined) needed so the
+      output has ``ceil(i / s)`` positions.
+    """
+    out_size = math.ceil(i / s)
+    needed = (out_size - 1) * s + (k - 1) * d + 1
+    return max(needed - i, 0)
+
+
+def _split_channels(num_chan, num_groups):
+    """Divide `num_chan` channels into `num_groups` integer-sized parts.
+
+    Every group gets ``num_chan // num_groups`` channels and the first group
+    additionally receives whatever remainder is left, so the parts always sum
+    to `num_chan`.
+    """
+    split = []
+    for _ in range(num_groups):
+        split.append(num_chan // num_groups)
+    # hand the leftover channels to the first group
+    split[0] = split[0] + (num_chan - sum(split))
+    return split
+
+
+class Conv2dSame(nn.Conv2d):
+    """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions
+
+    The padding amount is computed from the input's spatial size on every
+    forward pass and applied with F.pad before the convolution runs. The
+    `padding` constructor argument is accepted but not forwarded: the base
+    conv is always built with padding 0.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True):
+        # `padding` is not passed on; the parent conv is constructed unpadded.
+        super(Conv2dSame, self).__init__(
+            in_channels, out_channels, kernel_size,
+            stride=stride, padding=0, dilation=dilation, groups=groups, bias=bias)
+
+    def forward(self, x):
+        in_h, in_w = x.size()[-2:]
+        k_h, k_w = self.weight.size()[-2:]
+        total_h = _calc_same_pad(in_h, k_h, self.stride[0], self.dilation[0])
+        total_w = _calc_same_pad(in_w, k_w, self.stride[1], self.dilation[1])
+        if total_h > 0 or total_w > 0:
+            left, top = total_w // 2, total_h // 2
+            # any odd leftover pixel goes to the right / bottom edge
+            x = F.pad(x, [left, total_w - left, top, total_h - top])
+        return F.conv2d(
+            x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
+
+def conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
+    """Build a 2D convolution, resolving the `padding` keyword argument.
+
+    `padding` is removed from `kwargs` (default '') and `bias` defaults to False.
+    Resolution rules:
+      * non-string padding: passed straight to nn.Conv2d (number or pair).
+      * 'valid' (case-insensitive): nn.Conv2d with padding=0.
+      * 'same' (case-insensitive): TF compatible 'SAME' padding. When the padding
+        can be computed statically a plain nn.Conv2d is returned, otherwise a
+        Conv2dSame that pads dynamically (performance / GPU memory impact).
+      * any other string: PyTorch style 'same'-ish symmetric padding via nn.Conv2d.
+
+    All remaining keyword arguments are forwarded to the conv constructor.
+    """
+    padding = kwargs.pop('padding', '')
+    kwargs.setdefault('bias', False)
+
+    if not isinstance(padding, str):
+        # padding was specified as a number or pair
+        return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
+
+    mode = padding.lower()
+    if mode == 'valid':
+        # 'VALID' padding, same as padding=0
+        return nn.Conv2d(in_chs, out_chs, kernel_size, padding=0, **kwargs)
+
+    if mode == 'same' and not _is_static_pad(kernel_size, **kwargs):
+        # TF 'SAME' that depends on the input size: dynamic padding
+        return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
+
+    # static 'same' case and the default both use fixed symmetric padding
+    static_pad = _get_padding(kernel_size, **kwargs)
+    return nn.Conv2d(in_chs, out_chs, kernel_size, padding=static_pad, **kwargs)
+
+
+class MixedConv2d(nn.Module):
+    """ Mixed Grouped Convolution
+    Based on MDConv and GroupedConv in MixNet impl:
+      https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
+
+    The input and output channels are split into one chunk per entry of
+    `kernel_size`; each chunk gets its own conv (registered under the string
+    index of its position) and the per-chunk outputs are concatenated along
+    dim 1. With `dilated` set and stride 1, a kernel of size k is replaced by
+    a 3x3 kernel with dilation (k - 1) // 2.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size=3,
+                 stride=1, padding='', dilated=False, depthwise=False, **kwargs):
+        super(MixedConv2d, self).__init__()
+
+        if not isinstance(kernel_size, list):
+            kernel_size = [kernel_size]
+        num_groups = len(kernel_size)
+        in_splits = _split_channels(in_channels, num_groups)
+        out_splits = _split_channels(out_channels, num_groups)
+        # FIXME make compat with non-square kernel/dilations/strides
+        use_dilation = stride == 1 and dilated
+        for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)):
+            if use_dilation:
+                dilation, k = (k - 1) // 2, 3
+            else:
+                dilation = 1
+            branch = conv2d_pad(
+                in_ch, out_ch, k, stride=stride, padding=padding,
+                dilation=dilation, groups=out_ch if depthwise else 1, **kwargs)
+            # use add_module to keep key space clean
+            self.add_module(str(idx), branch)
+        self.splits = in_splits
+
+    def forward(self, x):
+        chunks = torch.split(x, self.splits, 1)
+        outputs = []
+        for chunk, conv in zip(chunks, self._modules.values()):
+            outputs.append(conv(chunk))
+        return torch.cat(outputs, 1)
+
+
+# helper method
+def select_conv2d(in_chs, out_chs, kernel_size, **kwargs):
+    """Pick the conv implementation based on the type of `kernel_size`.
+
+    A list selects MixedConv2d (one kernel per channel group). Anything else
+    (int, tuple, other iterable) goes to conv2d_pad as a normal conv, with the
+    `depthwise` bool keyword translated into `groups=out_chs`. Callers must not
+    pass `groups` directly.
+    """
+    assert 'groups' not in kwargs  # only use 'depthwise' bool arg
+    if not isinstance(kernel_size, list):
+        depthwise = kwargs.pop('depthwise', False)
+        return conv2d_pad(
+            in_chs, out_chs, kernel_size, groups=out_chs if depthwise else 1, **kwargs)
+    # We're going to use only lists for defining the MixedConv2d kernel groups,
+    # ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
+    return MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
+
diff --git a/timm/models/conv2d_same.py b/timm/models/conv2d_same.py
diff --git a/timm/models/gen_efficientnet.py b/timm/models/gen_efficientnet.py