Commit ace6833

Miscellaneous fixes for MobileNet

1 parent: 099c1a5

File tree

7 files changed: +50 −42 lines changed

src/convnets/convmixer.jl

2 additions & 2 deletions

```diff
@@ -9,7 +9,7 @@ Creates a ConvMixer model.
 
 - `planes`: number of planes in the output of each block
 - `depth`: number of layers
-- `inchannels`: number of channels in the input
+- `inchannels`: The number of channels in the input. The default value is 3.
 - `kernel_size`: kernel size of the convolutional layers
 - `patch_size`: size of the patches
 - `activation`: activation function used after the convolutional layers
@@ -45,7 +45,7 @@ Creates a ConvMixer model.
 # Arguments
 
 - `mode`: the mode of the model, either `:base`, `:small` or `:large`
-- `inchannels`: number of channels in the input
+- `inchannels`: The number of channels in the input. The default value is 3.
 - `activation`: activation function used after the convolutional layers
 - `nclasses`: number of classes in the output
 """
```

src/convnets/convnext.jl

3 additions & 3 deletions

```diff
@@ -33,8 +33,8 @@ Creates the layers for a ConvNeXt model.
 - `depths`: list with configuration for depth of each block
 - `planes`: list with configuration for number of output channels in each block
 - `drop_path_rate`: Stochastic depth rate.
-- `λ`: Initial value for [`LayerScale`](#)
-([reference](https://arxiv.org/abs/2103.17239))
+- `λ`: Initial value for [`LayerScale`](#)
+    ([reference](https://arxiv.org/abs/2103.17239))
 - `nclasses`: number of output classes
 """
 function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6,
@@ -92,7 +92,7 @@ Creates a ConvNeXt model.
 
 # Arguments:
 
-- `inchannels`: number of input channels.
+- `inchannels`: The number of channels in the input. The default value is 3.
 - `drop_path_rate`: Stochastic depth rate.
 - `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239)
 - `nclasses`: number of output classes
```
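The `λ` keyword documented above seeds LayerScale's per-channel scale. As a rough illustration of the idea from the referenced paper (a hypothetical numpy sketch, not Metalhead's Julia `LayerScale` layer), LayerScale multiplies each channel of a residual branch by a learnable vector initialized to a small constant such as `1.0f-6`:

```python
import numpy as np

def layer_scale_init(planes, lam=1e-6):
    """Per-channel scale vector, initialized to the constant `lam`."""
    return np.full(planes, lam, dtype=np.float32)

def apply_layer_scale(x, gamma):
    """Scale a (batch, planes) residual branch channel-wise."""
    return x * gamma  # broadcasts gamma over the batch axis

x = np.ones((2, 4), dtype=np.float32)
gamma = layer_scale_init(4, lam=1e-6)
out = apply_layer_scale(x, gamma)
print(out[0, 0])  # each activation starts scaled down by 1e-6
```

Starting near zero lets deep residual branches begin close to the identity, which is why a tiny default like `1.0f-6` is used.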

src/convnets/inception.jl

7 additions & 7 deletions

```diff
@@ -326,7 +326,7 @@ Creates an Inceptionv4 model.
 # Arguments
 
 - `pretrain`: set to `true` to load the pre-trained weights for ImageNet
-- `inchannels`: number of input channels.
+- `inchannels`: The number of channels in the input. The default value is 3.
 - `dropout`: rate of dropout in classifier head.
 - `nclasses`: the number of output classes.
 
@@ -426,7 +426,7 @@ Creates an InceptionResNetv2 model.
 
 # Arguments
 
-- `inchannels`: number of input channels.
+- `inchannels`: The number of channels in the input. The default value is 3.
 - `dropout`: rate of dropout in classifier head.
 - `nclasses`: the number of output classes.
 """
@@ -459,12 +459,12 @@ Creates an InceptionResNetv2 model.
 # Arguments
 
 - `pretrain`: set to `true` to load the pre-trained weights for ImageNet
-- `inchannels`: number of input channels.
+- `inchannels`: The number of channels in the input. The default value is 3.
 - `dropout`: rate of dropout in classifier head.
 - `nclasses`: the number of output classes.
 
 !!! warning
-    
+
     `InceptionResNetv2` does not currently support pretrained weights.
 """
 struct InceptionResNetv2
@@ -496,7 +496,7 @@ Create an Xception block.
 
 # Arguments
 
-- `inchannels`: number of input channels.
+- `inchannels`: The number of channels in the input. The default value is 3.
 - `outchannels`: number of output channels.
 - `nrepeats`: number of repeats of depthwise separable convolution layers.
 - `stride`: stride by which to downsample the input.
@@ -540,7 +540,7 @@ Creates an Xception model.
 
 # Arguments
 
-- `inchannels`: number of input channels.
+- `inchannels`: The number of channels in the input. The default value is 3.
 - `dropout`: rate of dropout in classifier head.
 - `nclasses`: the number of output classes.
 """
@@ -571,7 +571,7 @@ Creates an Xception model.
 # Arguments
 
 - `pretrain`: set to `true` to load the pre-trained weights for ImageNet.
-- `inchannels`: number of input channels.
+- `inchannels`: The number of channels in the input. The default value is 3.
 - `dropout`: rate of dropout in classifier head.
 - `nclasses`: the number of output classes.
 
```

src/convnets/mobilenet.jl

35 additions & 27 deletions

```diff
@@ -4,8 +4,7 @@
     mobilenetv1(width_mult, config;
                 activation = relu,
                 inchannels = 3,
-                nclasses = 1000,
-                fcsize = 1024)
+                nclasses = 1000)
 
 Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)).
 
@@ -21,23 +20,24 @@ Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)).
   + `s`: The stride of the convolutional kernel
   + `r`: The number of time this configuration block is repeated
 - `activate`: The activation function to use throughout the network
-- `inchannels`: The number of input feature maps``
+- `inchannels`: The number of input channels. The default value is 3.
 - `fcsize`: The intermediate fully-connected size between the convolution and final layers
 - `nclasses`: The number of output classes
 """
 function mobilenetv1(width_mult, config;
                      activation = relu,
                      inchannels = 3,
-                     nclasses = 1000,
-                     fcsize = 1024)
+                     fcsize = 1024,
+                     nclasses = 1000)
     layers = []
     for (dw, outch, stride, nrepeats) in config
         outch = Int(outch * width_mult)
         for _ in 1:nrepeats
             layer = dw ?
                     depthwise_sep_conv_bn((3, 3), inchannels, outch, activation;
                                           stride = stride, pad = 1, bias = false) :
-                    conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1)
+                    conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1,
+                            bias = false)
             append!(layers, layer)
             inchannels = outch
         end
@@ -51,7 +51,7 @@ function mobilenetv1(width_mult, config;
 end
 
 const mobilenetv1_configs = [
-  # dw, c, s, r
+    # dw, c, s, r
     (false, 32, 2, 1),
     (true, 64, 1, 1),
     (true, 128, 2, 1),
@@ -65,7 +65,7 @@ const mobilenetv1_configs = [
 ]
 
 """
-    MobileNetv1(width_mult = 1; pretrain = false, nclasses = 1000)
+    MobileNetv1(width_mult = 1; inchannels = 3, pretrain = false, nclasses = 1000)
 
 Create a MobileNetv1 model with the baseline configuration
 ([reference](https://arxiv.org/abs/1704.04861v1)).
@@ -76,6 +76,7 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet.
 - `width_mult`: Controls the number of output feature maps in each block
   (with 1.0 being the default in the paper;
   this is usually a value between 0.1 and 1.4)
+- `inchannels`: The number of input channels. The default value is 3.
 - `pretrain`: Whether to load the pre-trained weights for ImageNet
 - `nclasses`: The number of output classes
 
@@ -85,10 +86,10 @@ struct MobileNetv1
     layers::Any
 end
 
-function MobileNetv1(width_mult::Number = 1; pretrain = false, nclasses = 1000)
-    layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses)
+function MobileNetv1(width_mult::Number = 1; inchannels = 3, pretrain = false,
+                     nclasses = 1000)
+    layers = mobilenetv1(width_mult, mobilenetv1_configs; inchannels, nclasses)
    pretrain && loadpretrain!(layers, string("MobileNetv1"))
-
    return MobileNetv1(layers)
 end
 
```
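One of the substantive fixes above: the fallback `conv_bn` branch now also passes `bias = false`, matching the depthwise branch. A convolution that feeds straight into batchnorm gains nothing from a bias term, because batchnorm subtracts the per-channel mean and cancels any constant offset. A small numpy sketch of that cancellation (illustrative only, not the Flux/Metalhead code):

```python
import numpy as np

def batchnorm(x, eps=1e-5):
    """Per-channel batchnorm over the batch axis (no learned affine, for clarity)."""
    mu = x.mean(axis=0)
    var = x.var(axis=0)
    return (x - mu) / np.sqrt(var + eps)

rng = np.random.default_rng(0)
pre = rng.normal(size=(8, 16))   # pre-activation conv outputs, (batch, channels)
bias = rng.normal(size=16)       # a hypothetical per-channel conv bias

# The bias shifts both x and its mean, so it drops out of (x - mu):
print(np.allclose(batchnorm(pre), batchnorm(pre + bias)))  # True
```

Dropping the bias therefore saves parameters with no change in the function the conv + BN pair can represent.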
```diff
@@ -102,7 +103,7 @@ classifier(m::MobileNetv1) = m.layers[2]
 # MobileNetv2
 
 """
-    mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000)
+    mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, nclasses = 1000)
 
 Create a MobileNetv2 model.
 ([reference](https://arxiv.org/abs/1801.04381)).
@@ -119,14 +120,15 @@ Create a MobileNetv2 model.
   + `n`: The number of times a block is repeated
   + `s`: The stride of the convolutional kernel
   + `a`: The activation function used in the bottleneck layer
+- `inchannels`: The number of input channels. The default value is 3.
 - `max_width`: The maximum number of feature maps in any layer of the network
 - `nclasses`: The number of output classes
 """
-function mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000)
+function mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, nclasses = 1000)
     # building first layer
     inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8)
     layers = []
-    append!(layers, conv_bn((3, 3), 3, inplanes; stride = 2))
+    append!(layers, conv_bn((3, 3), inchannels, inplanes; pad = 1, stride = 2))
     # building inverted residual blocks
     for (t, c, n, s, a) in configs
         outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8)
@@ -165,7 +167,7 @@ struct MobileNetv2
 end
 
 """
-    MobileNetv2(width_mult = 1.0; pretrain = false, nclasses = 1000)
+    MobileNetv2(width_mult = 1.0; inchannels = 3, pretrain = false, nclasses = 1000)
 
 Create a MobileNetv2 model with the specified configuration.
 ([reference](https://arxiv.org/abs/1801.04381)).
@@ -176,13 +178,15 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet.
 - `width_mult`: Controls the number of output feature maps in each block
   (with 1.0 being the default in the paper;
   this is usually a value between 0.1 and 1.4)
+- `inchannels`: The number of input channels. The default value is 3.
 - `pretrain`: Whether to load the pre-trained weights for ImageNet
 - `nclasses`: The number of output classes
 
 See also [`Metalhead.mobilenetv2`](#).
 """
-function MobileNetv2(width_mult::Number = 1; pretrain = false, nclasses = 1000)
-    layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses)
+function MobileNetv2(width_mult::Number = 1; inchannels = 3, pretrain = false,
+                     nclasses = 1000)
+    layers = mobilenetv2(width_mult, mobilenetv2_configs; inchannels, nclasses)
     pretrain && loadpretrain!(layers, string("MobileNetv2"))
     return MobileNetv2(layers)
 end
```
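The stem above computes `inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8)`: widths are scaled by the multiplier and then snapped to a divisor (8, or 4 for the 0.1 multiplier). `_round_channels` itself is not shown in this diff; the sketch below assumes it follows the standard MobileNet "make divisible" rule, which rounds to the nearest multiple while never dropping more than 10% below the scaled target:

```python
def round_channels(channels, divisor, min_value=None):
    """Assumed behaviour of `_round_channels`: round to the nearest
    multiple of `divisor`, staying within 10% of the requested width."""
    if min_value is None:
        min_value = divisor
    new_ch = max(min_value, int(channels + divisor / 2) // divisor * divisor)
    if new_ch < 0.9 * channels:  # never round down by more than 10%
        new_ch += divisor
    return new_ch

print(round_channels(32 * 1.0, 8))   # 32
print(round_channels(32 * 0.75, 8))  # 24
print(round_channels(32 * 0.1, 4))   # 4  -- the looser divisor keeps tiny widths alive
```

The special-cased divisor of 4 for `width_mult == 0.1` exists because rounding 3.2 channels to a multiple of 8 would distort the width far more than the multiplier intends.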
```diff
@@ -197,7 +201,7 @@ classifier(m::MobileNetv2) = m.layers[2]
 # MobileNetv3
 
 """
-    mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000)
+    mobilenetv3(width_mult, configs; inchannels = 3, max_width = 1024, nclasses = 1000)
 
 Create a MobileNetv3 model.
 ([reference](https://arxiv.org/abs/1905.02244)).
@@ -216,14 +220,17 @@ Create a MobileNetv3 model.
   + `r::Integer` - The reduction factor (`>= 1` or `nothing` to skip) for squeeze and excite layers
   + `s::Integer` - The stride of the convolutional kernel
   + `a` - The activation function used in the bottleneck (typically `hardswish` or `relu`)
+- `inchannels`: The number of input channels. The default value is 3.
 - `max_width`: The maximum number of feature maps in any layer of the network
 - `nclasses`: the number of output classes
 """
-function mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000)
+function mobilenetv3(width_mult, configs; inchannels = 3, max_width = 1024, nclasses = 1000)
     # building first layer
     inplanes = _round_channels(16 * width_mult, 8)
     layers = []
-    append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2))
+    append!(layers,
+            conv_bn((3, 3), inchannels, inplanes, hardswish; pad = 1, stride = 2,
+                    bias = false))
     explanes = 0
     # building inverted residual blocks
     for (k, t, c, r, a, s) in configs
@@ -249,7 +256,7 @@ end
 
 # Configurations for small and large mode for MobileNetv3
 mobilenetv3_configs = Dict(:small => [
-  # k, t, c, SE, a, s
+    # k, t, c, SE, a, s
     (3, 1, 16, 4, relu, 2),
     (3, 4.5, 24, nothing, relu, 2),
     (3, 3.67, 24, nothing, relu, 1),
@@ -263,7 +270,7 @@ mobilenetv3_configs = Dict(:small => [
     (5, 6, 96, 4, hardswish, 1),
 ],
 :large => [
-  # k, t, c, SE, a, s
+    # k, t, c, SE, a, s
     (3, 1, 16, nothing, relu, 1),
     (3, 4, 24, nothing, relu, 2),
     (3, 3, 24, nothing, relu, 1),
@@ -287,7 +294,7 @@ struct MobileNetv3
 end
 
 """
-    MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, nclasses = 1000)
+    MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; inchannels = 3, pretrain = false, nclasses = 1000)
 
 Create a MobileNetv3 model with the specified configuration.
 ([reference](https://arxiv.org/abs/1905.02244)).
@@ -299,17 +306,18 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet.
 - `width_mult`: Controls the number of output feature maps in each block
   (with 1.0 being the default in the paper;
   this is usually a value between 0.1 and 1.4)
+- `inchannels`: The number of channels in the input. The default value is 3.
 - `pretrain`: whether to load the pre-trained weights for ImageNet
 - `nclasses`: the number of output classes
 
 See also [`Metalhead.mobilenetv3`](#).
 """
-function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false,
-                     nclasses = 1000)
+function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; inchannels = 3,
+                     pretrain = false, nclasses = 1000)
     @assert mode in [:large, :small] "`mode` has to be either :large or :small"
     max_width = (mode == :large) ? 1280 : 1024
-    layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width,
-                         nclasses = nclasses)
+    layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; inchannels, max_width,
+                         nclasses)
     pretrain && loadpretrain!(layers, string("MobileNetv3", mode))
    return MobileNetv3(layers)
 end
```
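Besides threading `inchannels` through, the MobileNet stems above gained `pad = 1`. That is a genuine shape fix: a 3×3 stride-2 convolution only halves the spatial size when padded by 1. Using the usual output-size formula, floor((n + 2p − k) / s) + 1:

```python
def conv_out(n, k, s, p):
    """Spatial output size of a convolution: floor((n + 2p - k) / s) + 1."""
    return (n + 2 * p - k) // s + 1

# 224-pixel input, 3x3 kernel, stride 2:
print(conv_out(224, 3, 2, 0))  # 111 -- unpadded, off by one
print(conv_out(224, 3, 2, 1))  # 112 -- pad = 1 gives the expected half size
```

Without the padding, every downstream feature map would be one pixel smaller than the reference architecture expects.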

src/convnets/resnext.jl

1 addition & 1 deletion

```diff
@@ -112,7 +112,7 @@ Create a ResNeXt model with specified configuration. Currently supported values
 Set `pretrain = true` to load the model with pre-trained weights for ImageNet.
 
 !!! warning
-    
+
     `ResNeXt` does not currently support pretrained weights.
 
 See also [`Metalhead.resnext`](#).
```

src/layers/embeddings.jl

1 addition & 1 deletion

```diff
@@ -11,7 +11,7 @@ patches.
 # Arguments:
 
 - `imsize`: the size of the input image
-- `inchannels`: the number of channels in the input image
+- `inchannels`: the number of channels in the input. The default value is 3.
 - `patch_size`: the size of the patches
 - `embedplanes`: the number of channels in the embedding
 - `norm_layer`: the normalization layer - by default the identity function but otherwise takes a
```
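The patch embedding documented here splits the input into non-overlapping `patch_size` patches and projects each one to `embedplanes` channels, so the sequence length follows directly from `imsize`. A shape-only sketch (hypothetical helper names, not the Metalhead API):

```python
def patch_embed_shape(imsize, patch_size, embedplanes):
    """Patch count and embedded sequence shape for non-overlapping patches."""
    h, w = imsize
    ph, pw = patch_size
    assert h % ph == 0 and w % pw == 0, "image must divide evenly into patches"
    npatches = (h // ph) * (w // pw)
    return npatches, (npatches, embedplanes)

npatches, shape = patch_embed_shape((224, 224), (16, 16), 768)
print(npatches)  # 196, the familiar ViT-Base sequence length (14 x 14 patches)
```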

src/vit-based/vit.jl

1 addition & 1 deletion

```diff
@@ -80,7 +80,7 @@ Creates a Vision Transformer (ViT) model.
 # Arguments
 
 - `mode`: the model configuration, one of
-`[:tiny, :small, :base, :large, :huge, :giant, :gigantic]`
+    `[:tiny, :small, :base, :large, :huge, :giant, :gigantic]`
 - `imsize`: image size
 - `inchannels`: number of input channels
 - `patch_size`: size of the patches
```
