
Commit e9306c3

Expose inchannels and nclasses for every model
Also: (a) add more type annotations; (b) expose only configurations vital to the model API (with respect to pretraining) at the highest level.
1 parent cd486df commit e9306c3

34 files changed: +363 -347 lines
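
In practice, the unified constructor surface looks like the sketch below (keyword values are illustrative, not taken from the diff; only the signatures are):

using Metalhead

AlexNet(; inchannels = 1, nclasses = 10)
ConvMixer(:base; inchannels = 3, nclasses = 100)
ConvNeXt(:tiny; inchannels = 3, nclasses = 100)
DenseNet([6, 12, 24, 16]; inchannels = 3, nclasses = 10)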

.github/workflows/CI.yml

Lines changed: 1 addition & 2 deletions
@@ -34,8 +34,7 @@ jobs:
           - '"Inception"'
           - '"DenseNet"'
           - '["ConvNeXt", "ConvMixer"]'
-          - 'r"ViTs"'
-          - 'r"Mixers"'
+          - '[r"ViTs", r"Mixers"]'
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1

src/convnets/alexnet.jl

Lines changed: 6 additions & 6 deletions
@@ -1,5 +1,5 @@
 """
-    alexnet(; nclasses = 1000)
+    alexnet(; nclasses::Integer = 1000)
 
 Create an AlexNet model
 ([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)).
@@ -8,8 +8,8 @@ Create an AlexNet model
 
 - `nclasses`: the number of output classes
 """
-function alexnet(; nclasses = 1000)
-    layers = Chain(Chain(Conv((11, 11), 3 => 64, relu; stride = (4, 4), pad = (2, 2)),
+function alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000)
+    layers = Chain(Chain(Conv((11, 11), inchannels => 64, relu; stride = (4, 4), pad = (2, 2)),
                          MaxPool((3, 3); stride = (2, 2)),
                          Conv((5, 5), 64 => 192, relu; pad = (2, 2)),
                          MaxPool((3, 3); stride = (2, 2)),
@@ -28,7 +28,7 @@ function alexnet(; nclasses = 1000)
 end
 
 """
-    AlexNet(; pretrain = false, nclasses = 1000)
+    AlexNet(; pretrain::Bool = false, nclasses::Integer = 1000)
 
 Create a `AlexNet`.
 See also [`alexnet`](#).
@@ -47,8 +47,8 @@ struct AlexNet
 end
 @functor AlexNet
 
-function AlexNet(; pretrain = false, nclasses = 1000)
-    layers = alexnet(; nclasses = nclasses)
+function AlexNet(; pretrain::Bool = false, inchannels::Integer = 3, nclasses::Integer = 1000)
+    layers = alexnet(; inchannels, nclasses)
     if pretrain
         loadpretrain!(layers, "AlexNet")
     end
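
With `inchannels` now threaded through to the stem convolution, AlexNet can be built for non-RGB input. A minimal sketch of the new keyword API (values illustrative):

using Metalhead

model = AlexNet(; inchannels = 1, nclasses = 10)  # single-channel input, 10-class head
x = rand(Float32, 224, 224, 1, 4)                 # WHCN batch of four 224x224 images
size(model(x))                                    # (10, 4)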

src/convnets/convmixer.jl

Lines changed: 10 additions & 10 deletions
@@ -1,6 +1,7 @@
 """
-    convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), patch_size::Dims{2} = 7,
-              activation = gelu, nclasses = 1000)
+    convmixer(planes::Integer, depth::Integer; kernel_size = (9, 9),
+              patch_size::Dims{2} = (7, 7), activation = gelu,
+              inchannels::Integer = 3, nclasses::Integer = 1000)
 
 Creates a ConvMixer model.
 ([reference](https://arxiv.org/abs/2201.09792))
@@ -9,14 +10,15 @@ Creates a ConvMixer model.
 
 - `planes`: number of planes in the output of each block
 - `depth`: number of layers
-- `inchannels`: The number of channels in the input.
 - `kernel_size`: kernel size of the convolutional layers
 - `patch_size`: size of the patches
 - `activation`: activation function used after the convolutional layers
+- `inchannels`: The number of channels in the input.
 - `nclasses`: number of classes in the output
 """
-function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9),
-                   patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000)
+function convmixer(planes::Integer, depth::Integer; kernel_size = (9, 9),
+                   patch_size::Dims{2} = (7, 7), activation = gelu,
+                   inchannels::Integer = 3, nclasses::Integer = 1000)
     stem = conv_norm(patch_size, inchannels, planes, activation; preact = true,
                      stride = patch_size[1])
     blocks = [Chain(SkipConnection(Chain(conv_norm(kernel_size, planes, planes, activation;
@@ -39,7 +41,7 @@ const CONVMIXER_CONFIGS = Dict(:base => Dict(:planes => 1536, :depth => 20,
                                              :patch_size => (7, 7)))
 
 """
-    ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000)
+    ConvMixer(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000)
 
 Creates a ConvMixer model.
 ([reference](https://arxiv.org/abs/2201.09792))
@@ -48,22 +50,20 @@ Creates a ConvMixer model.
 
 - `mode`: the mode of the model, either `:base`, `:small` or `:large`
 - `inchannels`: The number of channels in the input.
-- `activation`: activation function used after the convolutional layers
 - `nclasses`: number of classes in the output
 """
 struct ConvMixer
     layers::Any
 end
 @functor ConvMixer
 
-function ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000)
+function ConvMixer(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000)
     _checkconfig(mode, keys(CONVMIXER_CONFIGS))
     planes = CONVMIXER_CONFIGS[mode][:planes]
     depth = CONVMIXER_CONFIGS[mode][:depth]
     kernel_size = CONVMIXER_CONFIGS[mode][:kernel_size]
     patch_size = CONVMIXER_CONFIGS[mode][:patch_size]
-    layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation,
-                       nclasses)
+    layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, nclasses)
     return ConvMixer(layers)
 end
 
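
Note that `mode` no longer has a default and `activation` is no longer exposed on the high-level constructor (it is fixed by `convmixer`'s `gelu` default). A sketch of the trimmed API (values illustrative):

using Metalhead

model = ConvMixer(:small; inchannels = 3, nclasses = 100)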

src/convnets/convnext.jl

Lines changed: 22 additions & 20 deletions
@@ -1,5 +1,5 @@
 """
-    convnextblock(planes, drop_path_rate = 0., λ = 1f-6)
+    convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6)
 
 Creates a single block of ConvNeXt.
 ([reference](https://arxiv.org/abs/2201.03545))
@@ -8,21 +8,23 @@ Creates a single block of ConvNeXt.
 
 - `planes`: number of input channels.
 - `drop_path_rate`: Stochastic depth rate.
-- `λ`: Initial value for [`LayerScale`](#)
+- `layerscale_init`: Initial value for [`LayerScale`](#)
 """
-function convnextblock(planes, drop_path_rate = 0.0, λ = 1.0f-6)
+function convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6)
     layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3),
                                   swapdims((3, 1, 2, 4)),
                                   LayerNorm(planes; ϵ = 1.0f-6),
                                   mlp_block(planes, 4 * planes),
-                                  LayerScale(planes, λ),
+                                  LayerScale(planes, layerscale_init),
                                   swapdims((2, 3, 1, 4)),
                                   DropPath(drop_path_rate)), +)
     return layers
 end
 
 """
-    convnext(depths, planes; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000)
+    convnext(depths::Vector{<:Integer}, planes::Vector{<:Integer};
+             drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3,
+             nclasses::Integer = 1000)
 
 Creates the layers for a ConvNeXt model.
 ([reference](https://arxiv.org/abs/2201.03545))
@@ -33,12 +35,13 @@ Creates the layers for a ConvNeXt model.
 - `depths`: list with configuration for depth of each block
 - `planes`: list with configuration for number of output channels in each block
 - `drop_path_rate`: Stochastic depth rate.
-- `λ`: Initial value for [`LayerScale`](#)
+- `layerscale_init`: Initial value for [`LayerScale`](#)
   ([reference](https://arxiv.org/abs/2103.17239))
 - `nclasses`: number of output classes
 """
-function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6,
-                  nclasses = 1000)
+function convnext(depths::Vector{<:Integer}, planes::Vector{<:Integer};
+                  drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3,
+                  nclasses::Integer = 1000)
     @assert length(depths) == length(planes)
     "`planes` should have exactly one value for each block"
     downsample_layers = []
@@ -54,7 +57,9 @@ function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0
     dp_rates = linear_scheduler(drop_path_rate; depth = sum(depths))
     cur = 0
     for i in eachindex(depths)
-        push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]])
+        push!(stages,
+              [convnextblock(planes[i], dp_rates[cur + j], layerscale_init)
+               for j in 1:depths[i]])
         cur += depths[i]
     end
     backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages))))
@@ -72,30 +77,27 @@ const CONVNEXT_CONFIGS = Dict(:tiny => ([3, 3, 9, 3], [96, 192, 384, 768]),
                               :large => ([3, 3, 27, 3], [192, 384, 768, 1536]),
                               :xlarge => ([3, 3, 27, 3], [256, 512, 1024, 2048]))
 
-struct ConvNeXt
-    layers::Any
-end
-@functor ConvNeXt
-
 """
-    ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000)
+    ConvNeXt(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000)
 
 Creates a ConvNeXt model.
 ([reference](https://arxiv.org/abs/2201.03545))
 
 # Arguments
 
 - `inchannels`: The number of channels in the input.
-- `drop_path_rate`: Stochastic depth rate.
-- `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239)
 - `nclasses`: number of output classes
 
 See also [`Metalhead.convnext`](#).
 """
-function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6,
-                  nclasses = 1000)
+struct ConvNeXt
+    layers::Any
+end
+@functor ConvNeXt
+
+function ConvNeXt(mode::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000)
     _checkconfig(mode, keys(CONVNEXT_CONFIGS))
-    layers = convnext(CONVNEXT_CONFIGS[mode]...; inchannels, drop_path_rate, λ, nclasses)
+    layers = convnext(CONVNEXT_CONFIGS[mode]...; inchannels, nclasses)
     return ConvNeXt(layers)
 end
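
`drop_path_rate` and the LayerScale init are likewise dropped from the high-level constructor; anyone who needs them can still call the lower-level `Metalhead.convnext` directly. A sketch (the `:tiny` depths/planes come from `CONVNEXT_CONFIGS` above; the `drop_path_rate` value is illustrative):

using Metalhead

# High-level: only the pretraining-relevant keywords remain
model = ConvNeXt(:tiny; inchannels = 3, nclasses = 1000)

# Low-level: full control, e.g. over stochastic depth
layers = Metalhead.convnext([3, 3, 9, 3], [96, 192, 384, 768];
                            drop_path_rate = 0.1, layerscale_init = 1.0f-6,
                            inchannels = 3, nclasses = 1000)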

src/convnets/densenet.jl

Lines changed: 24 additions & 25 deletions
@@ -10,7 +10,7 @@ Create a Densenet bottleneck layer
 - `outplanes`: number of output feature maps on bottleneck branch
   (and scaling factor for inner feature maps; see ref)
 """
-function dense_bottleneck(inplanes, outplanes)
+function dense_bottleneck(inplanes::Integer, outplanes::Integer)
     inner_channels = 4 * outplanes
     return SkipConnection(Chain(conv_norm((1, 1), inplanes, inner_channels; bias = false,
                                           revnorm = true)...,
@@ -30,7 +30,7 @@ Create a DenseNet transition sequence
 - `inplanes`: number of input feature maps
 - `outplanes`: number of output feature maps
 """
-function transition(inplanes, outplanes)
+function transition(inplanes::Integer, outplanes::Integer)
     return Chain(conv_norm((1, 1), inplanes, outplanes; bias = false, revnorm = true)...,
                  MeanPool((2, 2)))
 end
@@ -48,14 +48,14 @@ the number of output feature maps by `growth_rates` with each block
 - `growth_rates`: the growth (additive) rates of output feature maps
   after each block (a vector of `k`s from the ref)
 """
-function dense_block(inplanes, growth_rates)
+function dense_block(inplanes::Integer, growth_rates)
     return [dense_bottleneck(i, o)
             for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]),
                               growth_rates)]
 end
 
 """
-    densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000)
+    densenet(inplanes, growth_rates; reduction = 0.5, nclasses::Integer = 1000)
 
 Create a DenseNet model
 ([reference](https://arxiv.org/abs/1608.06993)).
@@ -68,9 +68,11 @@ Create a DenseNet model
 - `reduction`: the factor by which the number of feature maps is scaled across each transition
 - `nclasses`: the number of output classes
 """
-function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000)
+function densenet(inplanes::Integer, growth_rates; reduction = 0.5, inchannels::Integer = 3,
+                  nclasses::Integer = 1000)
     layers = []
-    append!(layers, conv_norm((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false))
+    append!(layers,
+            conv_norm((7, 7), inchannels, inplanes; stride = 2, pad = (3, 3), bias = false))
     push!(layers, MaxPool((3, 3); stride = 2, pad = (1, 1)))
     outplanes = 0
     for (i, rates) in enumerate(growth_rates)
@@ -88,7 +90,7 @@ function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000)
 end
 
 """
-    densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000)
+    densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses::Integer = 1000)
 
 Create a DenseNet model
 ([reference](https://arxiv.org/abs/1608.06993)).
@@ -100,15 +102,15 @@ Create a DenseNet model
 - `reduction`: the factor by which the number of feature maps is scaled across each transition
 - `nclasses`: the number of output classes
 """
-function densenet(nblocks::NTuple{N, <:Integer}; growth_rate = 32, reduction = 0.5,
-                  nclasses = 1000) where {N}
+function densenet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5,
+                  inchannels::Integer = 3, nclasses::Integer = 1000)
     return densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks];
-                    reduction = reduction, nclasses = nclasses)
+                    reduction, inchannels, nclasses)
 end
 
 """
-    DenseNet(nblocks::NTuple{N, <:Integer};
-             growth_rate = 32, reduction = 0.5, nclasses = 1000)
+    DenseNet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5,
+             inchannels = 3, nclasses::Integer = 1000)
 
 Create a DenseNet model
 ([reference](https://arxiv.org/abs/1608.06993)).
@@ -124,29 +126,26 @@ See also [`densenet`](#).
 struct DenseNet
     layers::Any
 end
+@functor DenseNet
 
-function DenseNet(nblocks::NTuple{N, <:Integer};
-                  growth_rate = 32, reduction = 0.5, nclasses = 1000) where {N}
-    layers = densenet(nblocks; growth_rate = growth_rate,
-                      reduction = reduction,
-                      nclasses = nclasses)
+function DenseNet(nblocks::Vector{<:Integer}; growth_rate::Integer = 32, reduction = 0.5,
+                  inchannels = 3, nclasses::Integer = 1000)
+    layers = densenet(nblocks; growth_rate, reduction, inchannels, nclasses)
     return DenseNet(layers)
 end
 
-@functor DenseNet
-
 (m::DenseNet)(x) = m.layers(x)
 
 backbone(m::DenseNet) = m.layers[1]
 classifier(m::DenseNet) = m.layers[2]
 
-const DENSENET_CONFIGS = Dict(121 => (6, 12, 24, 16),
-                              161 => (6, 12, 36, 24),
-                              169 => (6, 12, 32, 32),
-                              201 => (6, 12, 48, 32))
+const DENSENET_CONFIGS = Dict(121 => [6, 12, 24, 16],
+                              161 => [6, 12, 36, 24],
+                              169 => [6, 12, 32, 32],
+                              201 => [6, 12, 48, 32])
 
 """
-    DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000)
+    DenseNet(config::Integer = 121; pretrain::Bool = false, nclasses::Integer = 1000)
     DenseNet(transition_configs::NTuple{N,Integer})
 
 Create a DenseNet model with specified configuration. Currently supported values are (121, 161, 169, 201)
@@ -159,7 +158,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet.
 
 See also [`Metalhead.densenet`](#).
 """
-function DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000)
+function DenseNet(config::Integer = 121; pretrain::Bool = false, nclasses::Integer = 1000)
     _checkconfig(config, keys(DENSENET_CONFIGS))
     model = DenseNet(DENSENET_CONFIGS[config]; nclasses = nclasses)
     if pretrain
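
Since the block configuration type changed from `NTuple` to `Vector`, custom configurations must now be passed as vectors. A sketch (the custom block layout and keyword values are illustrative):

using Metalhead

model = DenseNet(121; nclasses = 10)                  # standard config selected by depth
custom = DenseNet([4, 8, 16, 12]; growth_rate = 24,   # custom block layout, now a Vector
                  inchannels = 3, nclasses = 10)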
