
Commit b49fc40

Some more cleanup
1 parent e048d78 commit b49fc40

5 files changed: +61 -73 lines changed

src/convnets/inceptions/xception.jl

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ function xception_block(inchannels::Integer, outchannels::Integer, nrepeats::Int
         end
         push!(layers, relu)
         append!(layers,
-                dwsep_conv_norm((3, 3), inc, outc; pad = 1, use_norm = (false, false)))
+                dwsep_conv_norm((3, 3), inc, outc; pad = 1, norm_layer = identity))
         push!(layers, BatchNorm(outc))
     end
     layers = start_with_relu ? layers : layers[2:end]
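The `use_norm = (false, false)` flag is gone; skipping normalisation is now expressed by passing `norm_layer = identity` and appending the desired normalisation afterwards, as `xception_block` does with `BatchNorm(outc)`. A minimal sketch of that pattern at the `conv_norm` level (illustrative call; the `Metalhead.Layers` import path is an assumption):

```julia
using Flux
using Metalhead.Layers: conv_norm

# With `norm_layer = identity`, conv_norm returns only the convolution,
# so the caller can place its own normalisation after it.
layers = conv_norm((3, 3), 32, 64, relu; pad = 1, norm_layer = identity)
block = Chain(layers..., BatchNorm(64))
```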

src/convnets/resnets/core.jl

Lines changed: 24 additions & 16 deletions
@@ -133,7 +133,7 @@ function bottle2neck(inplanes::Integer, planes::Integer; stride::Integer = 1,
                          norm_layer, revnorm)...,
              attn_fn(outplanes),
     ]
-    return Chain(filter(!=(identity), layers)...)
+    return Chain(filter!(!=(identity), layers)...)
 end
 
 ## Downsample layers
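The switch to `filter!` drops the placeholder `identity` entries in place instead of allocating a filtered copy. A standalone sketch of the pattern (the layers below are illustrative, not the actual `bottle2neck` contents):

```julia
using Flux

# Optional layers are collected as `identity` placeholders, then removed
# in place before the vector is splatted into a Chain.
layers = Any[Conv((3, 3), 16 => 16; pad = 1), identity, BatchNorm(16, relu)]
block = Chain(filter!(!=(identity), layers)...)
```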
@@ -345,34 +345,42 @@ Wide ResNet, ResNeXt and Res2Net. For an _even_ more generic model API, see [`Me
 
 # Arguments
 
-  - `block_type`: The type of block to be used in the model. This can be one of [`Metalhead.basicblock`](@ref),
-    [`Metalhead.bottleneck`](@ref) and [`Metalhead.bottle2neck`](@ref). `basicblock` is used in the
+  - `block_type`: The type of block to be used in the model. This can be one of [`Metalhead.basicblock`](@ref),
+    [`Metalhead.bottleneck`](@ref) and [`Metalhead.bottle2neck`](@ref). `basicblock` is used in the
     original ResNet paper for ResNet-18 and ResNet-34, and `bottleneck` is used in the original ResNet-50
     and ResNet-101 models, as well as for the Wide ResNet and ResNeXt models. `bottle2neck` is introduced in
     the `Res2Net` paper.
-  - `block_repeats`: A `Vector` of integers specifying the number of times each block is repeated
+  - `block_repeats`: A `Vector` of integers specifying the number of times each block is repeated
    in each stage of the ResNet model. For example, `[3, 4, 6, 3]` is the configuration used in
    ResNet-50, which has 3 blocks in the first stage, 4 blocks in the second stage, 6 blocks in the
    third stage and 3 blocks in the fourth stage.
-  - `downsample_opt`: A `NTuple` of two callbacks that are used to determine the downsampling
+  - `downsample_opt`: A `NTuple` of two callbacks that are used to determine the downsampling
    operation to be used in the model. The first callback is used to determine the convolutional
    operation to be used in the downsampling operation and the second callback is used to determine
    the identity operation to be used in the downsampling operation.
-  - `cardinality`: The number of groups to be used in the 3x3 convolutional layer in the bottleneck
+  - `cardinality`: The number of groups to be used in the 3x3 convolutional layer in the bottleneck
    block. This is usually modified from the default value of `1` in the ResNet models to `32` or `64`
    in the `ResNeXt` models.
-  - `base_width`: The base width of the convolutional layer in the blocks of the model.
-  - `inplanes`: The number of input channels in the first convolutional layer.
-  - `reduction_factor`: The reduction factor used in the model.
-  - `connection`: This is a function that determines the residual connection in the model. For
+  - `base_width`: The base width of the convolutional layer in the blocks of the model.
+  - `inplanes`: The number of input channels in the first convolutional layer.
+  - `reduction_factor`: The reduction factor used in the model.
+  - `connection`: This is a function that determines the residual connection in the model. For
    `resnets`, either of [`Metalhead.addact`](@ref) or [`Metalhead.actadd`](@ref) is recommended.
-  - `norm_layer`: The normalisation layer to be used in the model.
-  - `revnorm`: set to `true` to place the normalisation layers before the convolutions
-  - `attn_fn`: A callback that is used to determine the attention function to be used in the model.
+  - `norm_layer`: The normalisation layer to be used in the model.
+  - `revnorm`: set to `true` to place the normalisation layers before the convolutions
+  - `attn_fn`: A callback that is used to determine the attention function to be used in the model.
    See [`Metalhead.Layers.squeeze_excite`](@ref) for an example.
-  - `pool_layer`: A fully-insta
-  - `use_conv`: Set to true to use convolutions instead of identity operations in the model.
-  - `dropblock_prob`: The probability of using DropBlock in the model.
+  - `pool_layer`: A fully-instantiated pooling layer passed in to be used by the classifier head.
+    For example, `AdaptiveMeanPool((1, 1))` is used in the ResNet family by default, but something
+    like `MeanPool((3, 3))` should also work provided the dimensions after applying the pooling
+    layer are compatible with the rest of the classifier head.
+  - `use_conv`: Set to true to use convolutions instead of identity operations in the model.
+  - `dropblock_prob`: `DropBlock` probability to be used in the model. Set to `nothing` to disable
+    DropBlock. See [`Metalhead.DropBlock`](@ref) for more details.
+  - `stochastic_depth_prob`: `StochasticDepth` probability to be used in the model. Set to `nothing` to disable
+    StochasticDepth. See [`Metalhead.StochasticDepth`](@ref) for more details.
+  - `dropout_prob`: `Dropout` probability to be used in the classifier head. Set to `nothing` to
+    disable Dropout.
 """
 function resnet(block_type, block_repeats::AbstractVector{<:Integer},
                 downsample_opt::NTuple{2, Any} = (downsample_conv, downsample_identity);
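For orientation, a hedged sketch of how the documented keyword arguments combine when calling the internal `Metalhead.resnet` builder; the configuration below is illustrative (roughly a ResNeXt-50-style setup) and is not part of this commit:

```julia
using Metalhead

# Illustrative only: bottleneck blocks repeated [3, 4, 6, 3] times with grouped
# 3x3 convolutions, controlled by `cardinality` and `base_width` as documented above.
backbone = Metalhead.resnet(Metalhead.bottleneck, [3, 4, 6, 3];
                            cardinality = 32, base_width = 4,
                            dropout_prob = nothing, stochastic_depth_prob = nothing)
```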

src/layers/conv.jl

Lines changed: 16 additions & 21 deletions
@@ -1,9 +1,8 @@
 """
     conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
               activation = relu; norm_layer = BatchNorm, revnorm::Bool = false,
-              eps::Float32 = 1.0f-5, preact::Bool = false, use_norm::Bool = true,
-              stride::Integer = 1, pad::Integer = 0, dilation::Integer = 1,
-              groups::Integer = 1, [bias, weight, init])
+              preact::Bool = false, stride::Integer = 1, pad::Integer = 0,
+              dilation::Integer = 1, groups::Integer = 1, [bias, weight, init])
 
 Create a convolution + normalisation layer pair with activation.
 
@@ -14,33 +13,27 @@ Create a convolution + normalisation layer pair with activation.
   - `outplanes`: number of output feature maps
   - `activation`: the activation function for the final layer
   - `norm_layer`: the normalisation layer used. Note that using `identity` as the normalisation
-    layer will result in no normalisation being applied i.e. this will be the same as
-    setting `use_norm = false`.
+    layer will result in no normalisation being applied. (This is only compatible with `preact`
+    and `revnorm` both set to `false`.)
   - `revnorm`: set to `true` to place the normalisation layer before the convolution
   - `preact`: set to `true` to place the activation function before the normalisation layer
     (only compatible with `revnorm = false`)
-  - `use_norm`: set to `false` to disable normalisation
-    (only compatible with `revnorm = false` and `preact = false`)
+  - `bias`: bias for the convolution kernel. This is set to `false` by default if
+    `norm_layer` is not `identity` and `true` otherwise.
   - `stride`: stride of the convolution kernel
   - `pad`: padding of the convolution kernel
   - `dilation`: dilation of the convolution kernel
   - `groups`: groups for the convolution kernel
-  - `bias`: bias for the convolution kernel. This is set to `false` by default if
-    `use_norm = true`.
   - `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](@ref))
 """
 function conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
                    activation = relu; norm_layer = BatchNorm, revnorm::Bool = false,
-                   eps::Float32 = 1.0f-5, preact::Bool = false, use_norm::Bool = true,
-                   bias = !use_norm, kwargs...)
-    # no normalization layer (including case where normalization layer is identity)
-    use_norm = use_norm && norm_layer !== identity
-    if !use_norm
+                   preact::Bool = false, bias = !(norm_layer !== identity), kwargs...)
+    # no normalization layer
+    if !(norm_layer !== identity)
         if preact || revnorm
-            throw(ArgumentError("`preact` only supported with `use_norm = true`. Check if
-                `use_norm = false` is intended. Note that it is also possible to trigger this
-                error if you set `norm_layer` to `identity` since that returns the same
-                behaviour as `use_norm`."))
+            throw(ArgumentError("`preact` only supported with `norm_layer !== identity`.
+                Check if a non-`identity` norm layer is intended."))
         else
             # early return if no norm layer is required
             return [Conv(kernel_size, inplanes => outplanes, activation; kwargs...)]
@@ -64,7 +57,7 @@ function conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
     end
     # layers
     layers = [Conv(kernel_size, inplanes => outplanes, activations.conv; bias, kwargs...),
-              norm_layer(normplanes, activations.norm; ϵ = eps)]
+              norm_layer(normplanes, activations.norm)]
     return revnorm ? reverse(layers) : layers
 end
 
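A hedged usage sketch of the reworked `conv_norm` keyword surface (values are illustrative; `conv_norm` is an internal helper, so the import path is an assumption):

```julia
using Flux
using Metalhead.Layers: conv_norm

# Default: a bias-free Conv followed by BatchNorm + relu, returned as a vector
# of layers so the caller can splat it into a Chain.
stem = conv_norm((3, 3), 3, 64, relu; stride = 2, pad = 1)

model = Chain(stem..., AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(64 => 10))
```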
@@ -86,6 +79,8 @@ TensorFlow implementation.
 """
 function basic_conv_bn(kernel_size::Dims{2}, inplanes, outplanes, activation = relu;
                        kwargs...)
-    return conv_norm(kernel_size, inplanes, outplanes, activation; norm_layer = BatchNorm,
-                     eps = 1.0f-3, kwargs...)
+    # TensorFlow uses a default epsilon of 1e-3 for BatchNorm
+    norm_layer = (args...; kwargs...) -> BatchNorm(args...; ϵ = 1.0f-3, kwargs...)
+    return conv_norm(kernel_size, inplanes, outplanes, activation; norm_layer = norm_layer,
+                     kwargs...)
 end
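With the explicit `eps` keyword gone from `conv_norm`, a non-default BatchNorm epsilon is set the same way `basic_conv_bn` now does it: wrap `BatchNorm` in a closure and pass that as `norm_layer`. A sketch, assuming the internal helper is accessible:

```julia
using Flux
using Metalhead.Layers: conv_norm

# Pin a custom epsilon by closing over it, then hand the closure to `norm_layer`.
bn_with_eps(ϵ) = (planes, activation = identity) -> BatchNorm(planes, activation; ϵ)

layers = conv_norm((3, 3), 3, 32, relu; pad = 1, norm_layer = bn_with_eps(1.0f-3))
block = Chain(layers...)
```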

src/layers/mbconv.jl

Lines changed: 11 additions & 18 deletions
@@ -1,17 +1,16 @@
 """
     dwsep_conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
-                    activation = relu; eps::Float32 = 1.0f-5, revnorm::Bool = false,
-                    stride::Integer = 1, use_norm::NTuple{2, Bool} = (true, true),
-                    pad::Integer = 0, [bias, weight, init])
+                    activation = relu; norm_layer = BatchNorm, stride::Integer = 1,
+                    bias::Bool = !(norm_layer !== identity), pad::Integer = 0, [bias, weight, init])
 
 Create a depthwise separable convolution chain as used in MobileNetv1.
 This is sequence of layers:
 
   - a `kernel_size` depthwise convolution from `inplanes => inplanes`
-  - a (batch) normalisation layer + `activation` (if `use_norm[1] == true`; otherwise
+  - a (batch) normalisation layer + `activation` (if `norm_layer !== identity`; otherwise
    `activation` is applied to the convolution output)
   - a `kernel_size` convolution from `inplanes => outplanes`
-  - a (batch) normalisation layer + `activation` (if `use_norm[2] == true`; otherwise
+  - a (batch) normalisation layer + `activation` (if `norm_layer !== identity`; otherwise
    `activation` is applied to the convolution output)
 
 See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1).
@@ -22,25 +21,19 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1).
   - `inplanes`: number of input feature maps
   - `outplanes`: number of output feature maps
   - `activation`: the activation function for the final layer
-  - `revnorm`: set to `true` to place the batch norm before the convolution
-  - `use_norm`: a tuple of two booleans to specify whether to use normalization for the first and
-    second convolution
-  - `bias`: a tuple of two booleans to specify whether to use bias for the first and second
-    convolution. This is set to `(false, false)` by default if `use_norm[0] == true` and
-    `use_norm[1] == true`.
+  - `norm_layer`: the normalisation layer used. Note that using `identity` as the normalisation
+    layer will result in no normalisation being applied.
+  - `bias`: whether to use bias in the convolution layers.
   - `stride`: stride of the first convolution kernel
   - `pad`: padding of the first convolution kernel
   - `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](@ref))
 """
 function dwsep_conv_norm(kernel_size::Dims{2}, inplanes::Integer, outplanes::Integer,
-                         activation = relu; norm_layer = BatchNorm, eps::Float32 = 1.0f-5,
-                         use_norm::NTuple{2, Bool} = (true, true), stride::Integer = 1,
-                         bias::NTuple{2, Bool} = (!use_norm[1], !use_norm[2]), kwargs...)
+                         activation = relu; norm_layer = BatchNorm, stride::Integer = 1,
+                         bias::Bool = !(norm_layer !== identity), kwargs...)
     return vcat(conv_norm(kernel_size, inplanes, inplanes, activation; eps, norm_layer,
-                          use_norm = use_norm[1], stride, bias = bias[1],
-                          groups = inplanes, kwargs...), # depthwise convolution
-                conv_norm((1, 1), inplanes, outplanes, activation; eps, norm_layer,
-                          use_norm = use_norm[2], bias = bias[2])) # pointwise convolution
+                          stride, bias, groups = inplanes, kwargs...), # depthwise convolution
+                conv_norm((1, 1), inplanes, outplanes, activation; eps, norm_layer, bias)) # pointwise convolution
 end
 
 """

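The depthwise separable block that `dwsep_conv_norm` documents is just two `conv_norm` calls: a grouped depthwise convolution followed by a 1x1 pointwise convolution. A standalone sketch of that structure built directly with the reworked `conv_norm` (illustrative, assuming the internal helper is in scope):

```julia
using Flux
using Metalhead.Layers: conv_norm

inplanes, outplanes = 32, 64

# Depthwise 3x3: `groups = inplanes`, so each input channel gets its own filter.
depthwise = conv_norm((3, 3), inplanes, inplanes, relu; pad = 1, groups = inplanes)
# Pointwise 1x1: mixes channels and changes the width to `outplanes`.
pointwise = conv_norm((1, 1), inplanes, outplanes, relu)

block = Chain(vcat(depthwise, pointwise)...)
```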
src/layers/selayers.jl

Lines changed: 9 additions & 17 deletions
@@ -1,38 +1,30 @@
 """
-    squeeze_excite(inplanes::Integer, squeeze_planes::Integer;
-                   norm_layer = planes -> identity, activation = relu,
-                   gate_activation = sigmoid)
-
-    squeeze_excite(inplanes::Integer; reduction::Real = 16,
-                   norm_layer = planes -> identity, activation = relu,
-                   gate_activation = sigmoid)
+    squeeze_excite(inplanes::Integer; reduction::Real = 16, round_fn = _round_channels,
+                   norm_layer = identity, activation = relu, gate_activation = sigmoid)
 
 Creates a squeeze-and-excitation layer used in MobileNets, EfficientNets and SE-ResNets.
 
 # Arguments
 
   - `inplanes`: The number of input feature maps
-  - `squeeze_planes`: The number of feature maps in the intermediate layers. Alternatively,
-    specify the keyword arguments `reduction` and `rd_divisior`, which determine the number
-    of feature maps in the intermediate layers from the number of input feature maps as:
-    `squeeze_planes = _round_channels(inplanes ÷ reduction)`. (See [`_round_channels`](@ref) for details.)
+  - `reduction`: The reduction factor for the number of hidden feature maps in the
+    squeeze and excite layer. The number of hidden feature maps is calculated as
+    `round_fn(inplanes / reduction)`.
+  - `round_fn`: The function to round the number of reduced feature maps.
   - `activation`: The activation function for the first convolution layer
   - `gate_activation`: The activation function for the gate layer
   - `norm_layer`: The normalization layer to be used after the convolution layers
   - `rd_planes`: The number of hidden feature maps in a squeeze and excite layer
 """
-function squeeze_excite(inplanes::Integer, squeeze_planes::Integer; norm_layer = identity,
-                        activation = relu, gate_activation = sigmoid)
+function squeeze_excite(inplanes::Integer; reduction::Real = 16, round_fn = _round_channels,
+                        norm_layer = identity, activation = relu, gate_activation = sigmoid)
+    squeeze_planes = round_fn(inplanes ÷ reduction)
     return SkipConnection(Chain(AdaptiveMeanPool((1, 1)),
                                 conv_norm((1, 1), inplanes, squeeze_planes, activation;
                                           norm_layer)...,
                                 conv_norm((1, 1), squeeze_planes, inplanes,
                                           gate_activation; norm_layer)...), .*)
 end
-function squeeze_excite(inplanes::Integer; reduction::Real = 16,
-                        round_fn = _round_channels, kwargs...)
-    return squeeze_excite(inplanes, round_fn(inplanes / reduction); kwargs...)
-end
 
 """
     effective_squeeze_excite(inplanes, gate_activation = sigmoid)
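A hedged usage sketch of the consolidated single-method `squeeze_excite` (internal layer; the values and import path are illustrative):

```julia
using Flux
using Metalhead.Layers: squeeze_excite

# Squeeze-and-excitation over 64 feature maps; the hidden width is
# round_fn(inplanes ÷ reduction) with the defaults documented above.
se = squeeze_excite(64; reduction = 16)

x = rand(Float32, 32, 32, 64, 1)
y = se(x)           # same shape as x, channels rescaled by the learned gate
size(y) == size(x)  # true
```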
0 commit comments