Commit edf83e0

Merge pull request #159 from theabhirath/repeat-fix-2
2 parents 2f39fd3 + f53fd94 commit edf83e0

13 files changed: +49 -42 lines changed


Project.toml
Lines changed: 2 additions & 4 deletions

@@ -10,7 +10,6 @@ Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
 LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-NeuralAttentionlib = "12afc1b8-fad6-47e1-9132-84abc478905f"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

 [compat]
@@ -20,15 +19,14 @@ Functors = "0.2"
 MLUtils = "0.2"
 NNlib = "0.7.34, 0.8"
 julia = "1.6"
-NeuralAttentionlib = "0.0"

 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

 [publish]
-title = "Metalhead.jl"
-theme = "_flux-theme"
 ignore = ["^(gh-pages|juliamnt|julia.dmg)$"]
+theme = "_flux-theme"
+title = "Metalhead.jl"

 [targets]
 test = ["Test"]

src/Metalhead.jl
Lines changed: 0 additions & 1 deletion

@@ -7,7 +7,6 @@ using BSON
 using Artifacts, LazyArtifacts
 using Statistics
 using MLUtils
-using NeuralAttentionlib

 import Functors

src/convnets/densenet.jl
Lines changed: 4 additions & 3 deletions

@@ -114,7 +114,7 @@ struct DenseNet
 end

 function DenseNet(nblocks::NTuple{N, <:Integer};
-                  growth_rate = 32, reduction = 0.5, nclasses = 1000) where N
+                  growth_rate = 32, reduction = 0.5, nclasses = 1000) where {N}
     layers = densenet(nblocks; growth_rate = growth_rate,
                       reduction = reduction,
                       nclasses = nclasses)
@@ -135,7 +135,8 @@ const densenet_config = Dict(121 => (6, 12, 24, 16),
                              201 => (6, 12, 48, 32))

 """
-    DenseNet(config::Int = 121; pretrain = false, nclasses = 1000)
+    DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000)
+    DenseNet(transition_config::NTuple{N,Integer})

 Create a DenseNet model with specified configuration. Currently supported values are (121, 161, 169, 201)
 ([reference](https://arxiv.org/abs/1608.06993)).
@@ -146,7 +147,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet.

 See also [`Metalhead.densenet`](#).
 """
-function DenseNet(config::Int = 121; pretrain = false, nclasses = 1000)
+function DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000)
     @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))."
     model = DenseNet(densenet_config[config]; nclasses = nclasses)
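For context, a minimal usage sketch of the two constructors documented above (not part of the diff): the integer form looks up one of the standard configurations, while the tuple form passes the per-block sizes directly.

```julia
using Metalhead

model  = DenseNet(121)               # standard DenseNet-121 via the config lookup
custom = DenseNet((6, 12, 24, 16))   # the same block counts passed explicitly as an NTuple
```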

src/convnets/mobilenet.jl
Lines changed: 6 additions & 6 deletions

@@ -28,9 +28,9 @@ function mobilenetv1(width_mult, config;
                      nclasses = 1000,
                      fcsize = 1024)
     layers = []
-    for (dw, outch, stride, repeats) in config
+    for (dw, outch, stride, nrepeats) in config
         outch = Int(outch * width_mult)
-        for _ in 1:repeats
+        for _ in 1:nrepeats
             layer = dw ? depthwise_sep_conv_bn((3, 3), inchannels, outch, activation;
                                                stride = stride, pad = 1) :
                          conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1)
@@ -198,11 +198,11 @@ Create a MobileNetv3 model.
   (with 1.0 being the default in the paper;
   this is usually a value between 0.1 and 1.4)
 - `configs`: a "list of tuples" configuration for each layer that details:
-  - `k::Int` - The size of the convolutional kernel
+  - `k::Integer` - The size of the convolutional kernel
   - `c::Float` - The multiplier factor for deciding the number of feature maps in the hidden layer
-  - `t::Int` - The number of output feature maps for a given block
-  - `r::Int` - The reduction factor (`>= 1` or `nothing` to skip) for squeeze and excite layers
-  - `s::Int` - The stride of the convolutional kernel
+  - `t::Integer` - The number of output feature maps for a given block
+  - `r::Integer` - The reduction factor (`>= 1` or `nothing` to skip) for squeeze and excite layers
+  - `s::Integer` - The stride of the convolutional kernel
   - `a` - The activation function used in the bottleneck (typically `hardswish` or `relu`)
 - `max_width`: The maximum number of feature maps in any layer of the network
 - `nclasses`: the number of output classes
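As a hypothetical illustration of the `(dw, outch, stride, nrepeats)` tuples that the `mobilenetv1` loop in the first hunk iterates over (the values below are made up, not the package defaults):

```julia
# Each entry: use a depthwise-separable block?, output channels, stride, repetitions.
example_config = [
    (false,  64, 1, 1),   # plain conv_bn block
    (true,  128, 2, 2),   # depthwise_sep_conv_bn block, repeated twice
]
```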

src/convnets/resnet.jl
Lines changed: 1 addition & 1 deletion

@@ -236,7 +236,7 @@ as shown below:
 resnet50_v1 = ResNet([1, 1, 4], [3, 4, 6, 3], :B; block = Metalhead.bottleneck_v1)
 ```
 """
-function ResNet(depth::Int = 50; pretrain = false, nclasses = 1000)
+function ResNet(depth::Integer = 50; pretrain = false, nclasses = 1000)
     @assert depth in keys(resnet_config) "`depth` must be one of $(sort(collect(keys(resnet_config))))"

     config, block = resnet_config[depth]
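The `Int` to `Integer` widening here (and in the other constructors touched by this commit) lets the depth argument be any integer type rather than only the machine `Int`, for example:

```julia
m1 = ResNet(50)          # machine Int, as before
m2 = ResNet(Int32(50))   # now also dispatches, since Int32 <: Integer
```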

src/convnets/resnext.jl
Lines changed: 2 additions & 2 deletions

@@ -99,7 +99,7 @@ const resnext_config = Dict(
 )

 """
-    ResNeXt(config::Int = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000)
+    ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000)

 Create a ResNeXt model with specified configuration. Currently supported values for `config` are (50, 101).
 ([reference](https://arxiv.org/abs/1611.05431)).
@@ -110,7 +110,7 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet.

 See also [`Metalhead.resnext`](#).
 """
-function ResNeXt(config::Int = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000)
+function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000)
     @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))"

     model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses)

src/convnets/vgg.jl
Lines changed: 4 additions & 4 deletions

@@ -115,9 +115,9 @@ Construct a VGG model with the specified input image size. Typically, the image

 ## Keyword Arguments:
 - `config` : VGG convolutional block configuration. It is defined as a vector of tuples `(output_channels, num_convolutions)` for each block
-- `inchannels`::Int : number of input channels
+- `inchannels`::Integer : number of input channels
 - `batchnorm`::Bool : set to `true` to use batch normalization after each convolution
-- `nclasses`::Int : number of output classes
+- `nclasses`::Integer : number of output classes
 - `fcsize`: intermediate fully connected layer size
   (see [`Metalhead.vgg_classifier_layers`](#))
 - `dropout`: dropout level between fully connected layers
@@ -142,7 +142,7 @@ backbone(m::VGG) = m.layers[1]
 classifier(m::VGG) = m.layers[2]

 """
-    VGG(depth::Int = 16; pretrain = false, batchnorm = false)
+    VGG(depth::Integer = 16; pretrain = false, batchnorm = false)

 Create a VGG style model with specified `depth`. Available values include (11, 13, 16, 19).
 ([reference](https://arxiv.org/abs/1409.1556v6)).
@@ -154,7 +154,7 @@ See also [`VGG`](#).
 # Arguments
 - `pretrain`: set to `true` to load pre-trained model weights for ImageNet
 """
-function VGG(depth::Int = 16; pretrain = false, batchnorm = false, nclasses = 1000)
+function VGG(depth::Integer = 16; pretrain = false, batchnorm = false, nclasses = 1000)
     @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))"

     model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]],
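A one-line usage sketch for the constructor above, using only keyword arguments that appear in this diff:

```julia
model = VGG(16; batchnorm = true, nclasses = 1000)   # VGG-16 with batch normalization
```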

src/layers/Layers.jl
Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@ using Flux: outputsize, Zygote
 using Functors
 using Statistics
 using MLUtils
-using NeuralAttentionlib

 include("../utilities.jl")

src/layers/attention.jl
Lines changed: 19 additions & 9 deletions

@@ -1,5 +1,5 @@
 """
-    MHAttention(nheads::Int, qkv_layer, attn_drop, projection)
+    MHAttention(nheads::Integer, qkv_layer, attn_drop, projection)

 Multi-head self-attention layer.

@@ -17,7 +17,7 @@ struct MHAttention{P, Q, R}
 end

 """
-    MHAttention(planes, nheads = 8; qkv_bias = false, attn_drop = 0., proj_drop = 0.)
+    MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, attn_drop = 0., proj_drop = 0.)

 Multi-head self-attention layer.

@@ -28,7 +28,7 @@ Multi-head self-attention layer.
 - `attn_drop`: dropout rate after the self-attention layer
 - `proj_drop`: dropout rate after the projection layer
 """
-function MHAttention(planes, nheads = 8; qkv_bias = false, attn_drop = 0., proj_drop = 0.)
+function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, attn_drop = 0., proj_drop = 0.)
     @assert planes % nheads == 0 "planes should be divisible by nheads"
     qkv_layer = Dense(planes, planes * 3; bias = qkv_bias)
     attn_drop = Dropout(attn_drop)
@@ -39,10 +39,20 @@ end

 @functor MHAttention

-function (m::MHAttention)(x::AbstractArray{T, 3}) where T
-    features, len_seq, batch_size = size(x)
-    q, k, v = chunk(reshape(m.qkv_layer(x), features ÷ m.nheads, m.nheads, len_seq, 3 * batch_size), 3; dims = 4)
-    scale = convert(T, sqrt(size(q, 1) / m.nheads))
-    attn = m.attn_drop(softmax(NeuralAttentionlib.matmul(q, permutedims(k, (2, 1, 3, 4))) * scale))
-    x = m.projection(reshape(NeuralAttentionlib.matmul(attn, v), (features, len_seq, batch_size)))
+function (m::MHAttention)(x::AbstractArray{T, 3}) where {T}
+    nfeatures, seq_len, batch_size = size(x)
+    x_reshaped = reshape(x, nfeatures, seq_len * batch_size)
+    qkv = m.qkv_layer(x_reshaped)
+    qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size)
+    query, key, value = chunk(qkv_reshaped, 3; dims = 4)
+    scale = convert(T, sqrt(size(query, 1) / m.nheads))
+    key_reshaped = reshape(
+        permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, seq_len * batch_size
+    )
+    query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size)
+    attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale))
+    value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size)
+    pre_projection = reshape(batched_mul(attention, value_reshaped), (nfeatures, seq_len, batch_size))
+    y = m.projection(reshape(pre_projection, size(pre_projection, 1), :))
+    return reshape(y, :, seq_len, batch_size)
 end
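The rewritten forward pass swaps `NeuralAttentionlib.matmul` on 4-d arrays for NNlib's `batched_mul` on reshaped 3-d arrays. For reference, `batched_mul` multiplies the leading two dimensions of its arguments slice-by-slice along the third (batch) dimension; a minimal sketch with made-up sizes:

```julia
using NNlib: batched_mul

A = rand(Float32, 2, 3, 5)   # 5 slices of 2×3 matrices
B = rand(Float32, 3, 4, 5)   # 5 slices of 3×4 matrices
C = batched_mul(A, B)        # 5 slices of 2×4 matrices

@assert size(C) == (2, 4, 5)
@assert C[:, :, 1] ≈ A[:, :, 1] * B[:, :, 1]   # each slice is an ordinary matrix product
```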

src/layers/embeddings.jl
Lines changed: 5 additions & 5 deletions

@@ -17,7 +17,7 @@ patches.
   single argument constructor for a normalization layer like LayerNorm or BatchNorm
 - `flatten`: set true to flatten the input spatial dimensions after the embedding
 """
-function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels = 3,
+function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3,
                         patch_size::Dims{2} = (16, 16), embedplanes = 768,
                         norm_layer = planes -> identity, flatten = true)

@@ -33,15 +33,15 @@ function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels = 3,
 end

 """
-    ViPosEmbedding(embedsize, npatches; init = (dims::Dims{2}) -> rand(Float32, dims))
+    ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims))

 Positional embedding layer used by many vision transformer-like models.
 """
 struct ViPosEmbedding{T}
     vectors::T
 end

-ViPosEmbedding(embedsize, npatches; init = (dims::Dims{2}) -> rand(Float32, dims)) =
+ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) =
     ViPosEmbedding(init((embedsize, npatches)))

 (p::ViPosEmbedding)(x) = x .+ p.vectors
@@ -59,8 +59,8 @@ end

 ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1))

-function (m::ClassTokens)(x)
-    tokens = repeat(m.token, 1, 1, size(x, 3))
+function (m::ClassTokens)(x::AbstractArray{T, 3}) where {T}
+    tokens = m.token .* fill!(similar(x, 1, 1, size(x, 3)), one(T))
     return hcat(tokens, x)
 end
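The `ClassTokens` change replaces `repeat` with a broadcast against a ones-buffer built by `fill!(similar(x, ...), one(T))`; since `similar` mirrors the array type of `x`, the expanded tokens stay on the same kind of array as the input. A small sketch of the same trick with plain arrays (sizes made up):

```julia
token = reshape(Float32[1, 2, 3], 3, 1, 1)   # stands in for a learned class token
x = rand(Float32, 3, 7, 4)                   # features × sequence length × batch

tokens = token .* fill!(similar(x, 1, 1, size(x, 3)), one(Float32))   # 3×1×4: one token per batch element
y = hcat(tokens, x)                          # token prepended along the sequence dimension

@assert size(y) == (3, 8, 4)
```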
