# MobileNetv1

"""
    mobilenetv1(width_mult, config;
                activation = relu,
                inchannels = 3,
                nclasses = 1000)

Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)).

# Arguments

  - `width_mult`: Controls the number of output feature maps in each block
    (with 1.0 being the default in the paper)

  - `config`: A "list of tuples" configuration for each layer that details:

      + `dw`: Set to `true` to use a depthwise separable convolution or `false` for a regular convolution
      + `o`: The number of output feature maps
      + `s`: The stride of the convolutional kernel
      + `r`: The number of times this configuration block is repeated
  - `activation`: The activation function to use throughout the network
  - `inchannels`: The number of input channels. The default value is 3.
  - `nclasses`: The number of output classes
"""
function mobilenetv1(width_mult, config;
                     activation = relu,
                     inchannels = 3,
                     nclasses = 1000)
    layers = []
    for (dw, outch, stride, nrepeats) in config
        outch = Int(outch * width_mult)
        for _ in 1:nrepeats
            layer = dw ?
                    depthwise_sep_conv_bn((3, 3), inchannels, outch, activation;
                                          stride = stride, pad = 1, bias = false) :
                    conv_bn((3, 3), inchannels, outch, activation; stride = stride,
                            pad = 1, bias = false)
            append!(layers, layer)
            inchannels = outch
        end
    end

    return Chain(Chain(layers),
                 Chain(GlobalMeanPool(),
                       MLUtils.flatten,
                       Dense(inchannels, nclasses)))
end

const mobilenetv1_configs = [
    # dw, c, s, r
    (false, 32, 2, 1),
    (true, 64, 1, 1),
    (true, 128, 2, 1),
    (true, 128, 1, 1),
    (true, 256, 2, 1),
    (true, 256, 1, 1),
    (true, 512, 2, 1),
    (true, 512, 1, 5),
    (true, 1024, 2, 1),
    (true, 1024, 1, 1),
]
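
# A minimal usage sketch (hypothetical, not part of the package API): calling
# `mobilenetv1` directly with the baseline configuration. Assumes Flux and the
# `conv_bn`/`depthwise_sep_conv_bn` helpers defined elsewhere in this package
# are in scope.
#
#   model = mobilenetv1(1.0, mobilenetv1_configs)
#   x = rand(Float32, 224, 224, 3, 1)    # WHCN layout expected by Flux
#   size(model(x))                       # (1000, 1) class logits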

"""
    MobileNetv1(width_mult = 1; inchannels = 3, pretrain = false, nclasses = 1000)

Create a MobileNetv1 model with the baseline configuration
([reference](https://arxiv.org/abs/1704.04861v1)).
Set `pretrain` to `true` to load the pretrained weights for ImageNet.

# Arguments

  - `width_mult`: Controls the number of output feature maps in each block
    (with 1.0 being the default in the paper;
    this is usually a value between 0.1 and 1.4)
  - `inchannels`: The number of input channels. The default value is 3.
  - `pretrain`: Whether to load the pre-trained weights for ImageNet
  - `nclasses`: The number of output classes

See also [`Metalhead.mobilenetv1`](#).
"""
struct MobileNetv1
    layers::Any
end

function MobileNetv1(width_mult::Number = 1; inchannels = 3, pretrain = false,
                     nclasses = 1000)
    layers = mobilenetv1(width_mult, mobilenetv1_configs; inchannels, nclasses)
    pretrain && loadpretrain!(layers, "MobileNetv1")
    return MobileNetv1(layers)
end

@functor MobileNetv1

(m::MobileNetv1)(x) = m.layers(x)

backbone(m::MobileNetv1) = m.layers[1]
classifier(m::MobileNetv1) = m.layers[2]
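
# Hypothetical usage sketch: `backbone` and `classifier` split the wrapper into
# its feature extractor and classification head, which is handy for transfer
# learning. The shapes below assume `width_mult = 0.75` and a 224×224 input.
#
#   m = MobileNetv1(0.75)
#   feats = backbone(m)(rand(Float32, 224, 224, 3, 1))
#   size(feats)                          # (7, 7, 768, 1): 1024 * 0.75 channels
#   size(classifier(m)(feats))           # (1000, 1)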

# MobileNetv2

"""
    mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, nclasses = 1000)

Create a MobileNetv2 model
([reference](https://arxiv.org/abs/1801.04381)).

# Arguments

  - `width_mult`: Controls the number of output feature maps in each block
    (with 1.0 being the default in the paper)

  - `configs`: A "list of tuples" configuration for each layer that details:

      + `t`: The expansion factor that controls the number of feature maps in the bottleneck layer
      + `c`: The number of output feature maps
      + `n`: The number of times a block is repeated
      + `s`: The stride of the convolutional kernel
      + `a`: The activation function used in the bottleneck layer
  - `inchannels`: The number of input channels. The default value is 3.
  - `max_width`: The maximum number of feature maps in any layer of the network
  - `nclasses`: The number of output classes
"""
function mobilenetv2(width_mult, configs; inchannels = 3, max_width = 1280, nclasses = 1000)
    # building first layer
    inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8)
    layers = []
    append!(layers, conv_bn((3, 3), inchannels, inplanes; pad = 1, stride = 2))
    # building inverted residual blocks
    for (t, c, n, s, a) in configs
        outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8)
        for i in 1:n
            push!(layers,
                  invertedresidual(3, inplanes, inplanes * t, outplanes, a;
                                   stride = i == 1 ? s : 1))
            inplanes = outplanes
        end
    end
    # building last several layers
    outplanes = (width_mult > 1) ?
                _round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) :
                max_width
    return Chain(Chain(Chain(layers),
                       conv_bn((1, 1), inplanes, outplanes, relu6; bias = false)...),
                 Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten,
                       Dense(outplanes, nclasses)))
end

# Layer configurations for MobileNetv2
const mobilenetv2_configs = [
    # t, c, n, s, a
    (1, 16, 1, 1, relu6),
    (6, 24, 2, 2, relu6),
    (6, 32, 3, 2, relu6),
    (6, 64, 4, 2, relu6),
    (6, 96, 3, 1, relu6),
    (6, 160, 3, 2, relu6),
    (6, 320, 1, 1, relu6),
]
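
# Sketch of how one config row above unrolls (width_mult = 1): the row
# (6, 24, 2, 2, relu6) creates two inverted residual blocks. The first expands
# 16 -> 16 * 6 = 96 hidden channels with stride 2; the second expands
# 24 -> 144 with stride 1; both project down to 24 output channels.
#
#   model = mobilenetv2(1.0, mobilenetv2_configs)     # hypothetical call
#   size(model(rand(Float32, 224, 224, 3, 1)))        # (1000, 1)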

# Model definition for MobileNetv2
struct MobileNetv2
    layers::Any
end

"""
    MobileNetv2(width_mult = 1; inchannels = 3, pretrain = false, nclasses = 1000)

Create a MobileNetv2 model with the specified configuration
([reference](https://arxiv.org/abs/1801.04381)).
Set `pretrain` to `true` to load the pretrained weights for ImageNet.

# Arguments

  - `width_mult`: Controls the number of output feature maps in each block
    (with 1.0 being the default in the paper;
    this is usually a value between 0.1 and 1.4)
  - `inchannels`: The number of input channels. The default value is 3.
  - `pretrain`: Whether to load the pre-trained weights for ImageNet
  - `nclasses`: The number of output classes

See also [`Metalhead.mobilenetv2`](#).
"""
function MobileNetv2(width_mult::Number = 1; inchannels = 3, pretrain = false,
                     nclasses = 1000)
    layers = mobilenetv2(width_mult, mobilenetv2_configs; inchannels, nclasses)
    pretrain && loadpretrain!(layers, "MobileNetv2")
    return MobileNetv2(layers)
end

@functor MobileNetv2

(m::MobileNetv2)(x) = m.layers(x)

backbone(m::MobileNetv2) = m.layers[1]
classifier(m::MobileNetv2) = m.layers[2]
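
# Hypothetical usage sketch (assumes Flux is loaded):
#
#   m = MobileNetv2()                          # width_mult = 1, random weights
#   size(m(rand(Float32, 224, 224, 3, 1)))     # (1000, 1)
#   backbone(m)                                # feature extractor only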

# MobileNetv3

"""
    mobilenetv3(width_mult, configs; inchannels = 3, max_width = 1024, nclasses = 1000)

Create a MobileNetv3 model
([reference](https://arxiv.org/abs/1905.02244)).

# Arguments

  - `width_mult`: Controls the number of output feature maps in each block
    (with 1.0 being the default in the paper;
    this is usually a value between 0.1 and 1.4)

  - `configs`: a "list of tuples" configuration for each layer that details:

      + `k::Integer` - The size of the convolutional kernel
      + `t::Number` - The expansion factor that decides the number of feature maps in the hidden layer
      + `c::Integer` - The number of output feature maps for a given block
      + `r` - The reduction factor (`>= 1` or `nothing` to skip) for squeeze and excite layers
      + `a` - The activation function used in the bottleneck (typically `hardswish` or `relu`)
      + `s::Integer` - The stride of the convolutional kernel
  - `inchannels`: The number of input channels. The default value is 3.
  - `max_width`: The maximum number of feature maps in any layer of the network
  - `nclasses`: The number of output classes
"""
function mobilenetv3(width_mult, configs; inchannels = 3, max_width = 1024, nclasses = 1000)
    # building first layer
    inplanes = _round_channels(16 * width_mult, 8)
    layers = []
    append!(layers,
            conv_bn((3, 3), inchannels, inplanes, hardswish; pad = 1, stride = 2,
                    bias = false))
    explanes = 0
    # building inverted residual blocks
    for (k, t, c, r, a, s) in configs
        # inverted residual layers
        outplanes = _round_channels(c * width_mult, 8)
        explanes = _round_channels(inplanes * t, 8)
        push!(layers,
              invertedresidual(k, inplanes, explanes, outplanes, a;
                               stride = s, reduction = r))
        inplanes = outplanes
    end
    # building last several layers
    output_channel = max_width
    output_channel = width_mult > 1.0 ? _round_channels(output_channel * width_mult, 8) :
                     output_channel
    classifier = Chain(Dense(explanes, output_channel, hardswish),
                       Dropout(0.2),
                       Dense(output_channel, nclasses))
    return Chain(Chain(Chain(layers),
                       conv_bn((1, 1), inplanes, explanes, hardswish; bias = false)...),
                 Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier))
end

# Configurations for small and large modes for MobileNetv3
const mobilenetv3_configs = Dict(:small => [
                                     # k, t, c, SE, a, s
                                     (3, 1, 16, 4, relu, 2),
                                     (3, 4.5, 24, nothing, relu, 2),
                                     (3, 3.67, 24, nothing, relu, 1),
                                     (5, 4, 40, 4, hardswish, 2),
                                     (5, 6, 40, 4, hardswish, 1),
                                     (5, 6, 40, 4, hardswish, 1),
                                     (5, 3, 48, 4, hardswish, 1),
                                     (5, 3, 48, 4, hardswish, 1),
                                     (5, 6, 96, 4, hardswish, 2),
                                     (5, 6, 96, 4, hardswish, 1),
                                     (5, 6, 96, 4, hardswish, 1),
                                 ],
                                 :large => [
                                     # k, t, c, SE, a, s
                                     (3, 1, 16, nothing, relu, 1),
                                     (3, 4, 24, nothing, relu, 2),
                                     (3, 3, 24, nothing, relu, 1),
                                     (5, 3, 40, 4, relu, 2),
                                     (5, 3, 40, 4, relu, 1),
                                     (5, 3, 40, 4, relu, 1),
                                     (3, 6, 80, nothing, hardswish, 2),
                                     (3, 2.5, 80, nothing, hardswish, 1),
                                     (3, 2.3, 80, nothing, hardswish, 1),
                                     (3, 2.3, 80, nothing, hardswish, 1),
                                     (3, 6, 112, 4, hardswish, 1),
                                     (3, 6, 112, 4, hardswish, 1),
                                     (5, 6, 160, 4, hardswish, 2),
                                     (5, 6, 160, 4, hardswish, 1),
                                     (5, 6, 160, 4, hardswish, 1),
                                 ])
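
# Sketch of why `t` is allowed to be fractional (assuming `_round_channels`
# rounds to a multiple of its divisor, as used above): for the :small row
# (3, 4.5, 24, nothing, relu, 2), the block expands
# 16 -> _round_channels(16 * 4.5, 8) = 72 hidden channels, projects to 24
# outputs, and skips the squeeze-and-excite branch since `r` is `nothing`.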

# Model definition for MobileNetv3
struct MobileNetv3
    layers::Any
end

"""
    MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; inchannels = 3, pretrain = false, nclasses = 1000)

Create a MobileNetv3 model with the specified configuration
([reference](https://arxiv.org/abs/1905.02244)).
Set `pretrain = true` to load the model with pre-trained weights for ImageNet.

# Arguments

  - `mode`: `:small` or `:large` for the size of the model (see paper).
  - `width_mult`: Controls the number of output feature maps in each block
    (with 1.0 being the default in the paper;
    this is usually a value between 0.1 and 1.4)
  - `inchannels`: The number of input channels. The default value is 3.
  - `pretrain`: Whether to load the pre-trained weights for ImageNet
  - `nclasses`: The number of output classes

See also [`Metalhead.mobilenetv3`](#).
"""
function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; inchannels = 3,
                     pretrain = false, nclasses = 1000)
    @assert mode in [:large, :small] "`mode` has to be either :large or :small"
    max_width = (mode == :large) ? 1280 : 1024
    layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; inchannels, max_width,
                         nclasses)
    pretrain && loadpretrain!(layers, string("MobileNetv3", mode))
    return MobileNetv3(layers)
end

@functor MobileNetv3

(m::MobileNetv3)(x) = m.layers(x)

backbone(m::MobileNetv3) = m.layers[1]
classifier(m::MobileNetv3) = m.layers[2]
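
# Hypothetical usage sketch (assumes Flux is loaded):
#
#   m = MobileNetv3(:large, 1.0)               # large variant, random weights
#   size(m(rand(Float32, 224, 224, 3, 1)))     # (1000, 1)
#   MobileNetv3()                              # defaults to the :small variant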