
Commit 97c3911 (1 parent: 4f487a0)

Add StochasticDepth to EfficientNets

More cleanup

11 files changed: +60 -68 lines changed

src/convnets/builders/irmodel.jl

Lines changed: 5 additions & 3 deletions
@@ -2,8 +2,9 @@ function irmodelbuilder(scalings::NTuple{2, Real}, block_configs::AbstractVector
                         inplanes::Integer = 32, connection = +, activation = relu,
                         norm_layer = BatchNorm, divisor::Integer = 8,
                         tail_conv::Bool = true, expanded_classifier::Bool = false,
-                        headplanes::Integer, dropout_prob = nothing,
-                        inchannels::Integer = 3, nclasses::Integer = 1000, kwargs...)
+                        stochastic_depth_prob = nothing, headplanes::Integer,
+                        dropout_prob = nothing, inchannels::Integer = 3,
+                        nclasses::Integer = 1000, kwargs...)
     width_mult, _ = scalings
     # building first layer
     inplanes = _round_channels(inplanes * width_mult, divisor)
@@ -13,7 +14,8 @@ function irmodelbuilder(scalings::NTuple{2, Real}, block_configs::AbstractVector
                    norm_layer))
     # building inverted residual blocks
     get_layers, block_repeats = mbconv_stage_builder(block_configs, inplanes, scalings;
-                                                     norm_layer, divisor, kwargs...)
+                                                     stochastic_depth_prob, norm_layer,
+                                                     divisor, kwargs...)
     append!(layers, cnn_stages(get_layers, block_repeats, connection))
     # building last layers
     outplanes = _round_channels(block_configs[end][3] * width_mult, divisor)
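The change here is pure plumbing: `irmodelbuilder` grows a `stochastic_depth_prob` keyword and forwards it untouched to `mbconv_stage_builder`. For readers unfamiliar with the Julia idiom, a minimal self-contained sketch of this kind of keyword forwarding (toy functions, not the code in this commit):

    stage_builder(; stochastic_depth_prob = nothing, divisor = 8) =
        (stochastic_depth_prob, divisor)
    model_builder(; stochastic_depth_prob = nothing, kwargs...) =
        stage_builder(; stochastic_depth_prob, kwargs...)  # `; x` is shorthand for `x = x`

    model_builder(stochastic_depth_prob = 0.2)  # returns (0.2, 8)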

src/convnets/builders/mbconv.jl

Lines changed: 22 additions & 17 deletions
@@ -23,10 +23,10 @@ end
 
 function _get_builder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple},
                       inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real};
-                      norm_layer = BatchNorm, divisor::Integer = 8,
-                      se_from_explanes::Bool = false, kwargs...)
+                      stochastic_depth_prob = nothing, norm_layer = BatchNorm,
+                      divisor::Integer = 8, se_from_explanes::Bool = false, kwargs...)
     width_mult, depth_mult = scalings
-    block_repeats = [ceil(Int, block_configs[idx][end - 3] * depth_mult)
+    block_repeats = [ceil(Int, block_configs[idx][end - 2] * depth_mult)
                      for idx in eachindex(block_configs)]
     block_fn, k, outplanes, expansion, stride, _, reduction, activation = block_configs[stage_idx]
     # calculate number of reduced channels for squeeze-excite layer from explanes instead of inplanes
@@ -37,48 +37,53 @@ function _get_builder(::typeof(mbconv), block_configs::AbstractVector{<:Tuple},
         inplanes = _round_channels(block_configs[stage_idx - 1][3] * width_mult, divisor)
     end
     outplanes = _round_channels(outplanes * width_mult, divisor)
-    pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
+    sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
     function get_layers(block_idx::Integer)
         inplanes = block_idx == 1 ? inplanes : outplanes
         explanes = _round_channels(inplanes * expansion, divisor)
         stride = block_idx == 1 ? stride : 1
         block = block_fn((k, k), inplanes, explanes, outplanes, activation; norm_layer,
                          stride, reduction, kwargs...)
-        schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx
-        drop_path = StochasticDepth(pathschedule(schedule_idx))
-        return stride == 1 && inplanes == outplanes ? (drop_path, block) : (block,)
+        use_skip = stride == 1 && inplanes == outplanes
+        if use_skip
+            schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx
+
+            drop_path = StochasticDepth(sdschedule[schedule_idx])
+            return (drop_path, block)
+        else
+            return (block,)
+        end
     end
-    return get_layers, ceil(Int, nrepeats * depth_mult)
+    return get_layers, block_repeats[stage_idx]
 end
 
 function _get_builder(::typeof(fused_mbconv), block_configs::AbstractVector{<:Tuple},
                       inplanes::Integer, stage_idx::Integer, scalings::NTuple{2, Real};
-                      norm_layer = BatchNorm, divisor::Integer = 8, kwargs...)
-    width_mult, depth_mult = scaling
-    block_repeats = [ceil(Int, block_configs[idx][end - 2] * depth_mult)
+                      stochastic_depth_prob = nothing, norm_layer = BatchNorm,
+                      divisor::Integer = 8, kwargs...)
+    width_mult, depth_mult = scalings
+    block_repeats = [ceil(Int, block_configs[idx][end - 1] * depth_mult)
                      for idx in eachindex(block_configs)]
     block_fn, k, outplanes, expansion, stride, _, activation = block_configs[stage_idx]
     inplanes = stage_idx == 1 ? inplanes : block_configs[stage_idx - 1][3]
     outplanes = _round_channels(outplanes * width_mult, divisor)
-    block_repeats = sum(block_configs[idx][4] for idx in 1:stage_idx)
-    pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
+    sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
     function get_layers(block_idx::Integer)
         inplanes = block_idx == 1 ? inplanes : outplanes
         explanes = _round_channels(inplanes * expansion, divisor)
         stride = block_idx == 1 ? stride : 1
         block = block_fn((k, k), inplanes, explanes, outplanes, activation;
                          norm_layer, stride, kwargs...)
         schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx
-        drop_path = StochasticDepth(pathschedule(schedule_idx))
+        drop_path = StochasticDepth(sdschedule[schedule_idx])
         return stride == 1 && inplanes == outplanes ? (drop_path, block) : (block,)
     end
     return get_layers, block_repeats[stage_idx]
 end
 
 function mbconv_stage_builder(block_configs::AbstractVector{<:Tuple}, inplanes::Integer,
                               scalings::NTuple{2, Real}; kwargs...)
-    bxs = [_get_builder(block_configs[1], block_configs, inplanes, idx, scalings;
-                        kwargs...)
-           for idx in eachindex(block_configs)]
+    bxs = [_get_builder(block_configs[idx][1], block_configs, inplanes, idx, scalings;
+                        kwargs...) for idx in eachindex(block_configs)]
     return (stage_idx, block_idx) -> first.(bxs)[stage_idx](block_idx), last.(bxs)
 end
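Beyond the `sdschedule` rename, this file fixes genuine bugs visible in the deleted lines: the schedule was being called like a function (`pathschedule(schedule_idx)`) instead of indexed, the `fused_mbconv` method referenced an undefined `scaling`, the repeat count was read from the wrong tuple position, and the old return value used an undefined `nrepeats`. The new guard also avoids constructing a `StochasticDepth` layer that would immediately be discarded. A self-contained toy of the guard (illustrative only, not the code above):

    # Only blocks with an identity shortcut (stride 1 and matching channel counts)
    # can be dropped, so only those get a stochastic depth probability.
    function layers_for_block(stride, inplanes, outplanes, p)
        use_skip = stride == 1 && inplanes == outplanes
        return use_skip ? ((:drop_path, p), :block) : (:block,)
    end

    layers_for_block(1, 64, 64, 0.1)   # ((:drop_path, 0.1), :block)
    layers_for_block(2, 64, 128, 0.1)  # (:block,)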

src/convnets/builders/resblocks.jl

Lines changed: 10 additions & 10 deletions
@@ -6,14 +6,14 @@ function basicblock_builder(block_repeats::AbstractVector{<:Integer};
                             dropblock_prob = nothing, stochastic_depth_prob = nothing,
                             stride_fn = resnet_stride, planes_fn = resnet_planes,
                             downsample_tuple = (downsample_conv, downsample_identity))
-    # DropBlock, StochasticDepth both take in rates based on a linear scaling schedule
+    # DropBlock, StochasticDepth both take in probabilities based on a linear scaling schedule
     # Also get `planes_vec` needed for block `inplanes` and `planes` calculations
-    pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
-    blockschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats))
+    sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
+    dbschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats))
     planes_vec = collect(planes_fn(block_repeats))
     # closure over `idxs`
     function get_layers(stage_idx::Integer, block_idx::Integer)
-        # DropBlock, StochasticDepth both take in rates based on a linear scaling schedule
+        # DropBlock, StochasticDepth both take in probabilities based on a linear scaling schedule
         # This is also needed for block `inplanes` and `planes` calculations
         schedule_idx = sum(block_repeats[1:(stage_idx - 1)]) + block_idx
         planes = planes_vec[schedule_idx]
@@ -23,8 +23,8 @@ function basicblock_builder(block_repeats::AbstractVector{<:Integer};
         stride = stride_fn(stage_idx, block_idx)
         downsample_fn = stride != 1 || inplanes != planes * expansion ?
                         downsample_tuple[1] : downsample_tuple[2]
-        drop_path = StochasticDepth(pathschedule[schedule_idx])
-        drop_block = DropBlock(blockschedule[schedule_idx])
+        drop_path = StochasticDepth(sdschedule[schedule_idx])
+        drop_block = DropBlock(dbschedule[schedule_idx])
         block = basicblock(inplanes, planes; stride, reduction_factor, activation,
                            norm_layer, revnorm, attn_fn, drop_path, drop_block)
         downsample = downsample_fn(inplanes, planes * expansion; stride, norm_layer,
@@ -43,8 +43,8 @@ function bottleneck_builder(block_repeats::AbstractVector{<:Integer};
                             dropblock_prob = nothing, stochastic_depth_prob = nothing,
                             stride_fn = resnet_stride, planes_fn = resnet_planes,
                             downsample_tuple = (downsample_conv, downsample_identity))
-    pathschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
-    blockschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats))
+    sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(block_repeats))
+    dbschedule = linear_scheduler(dropblock_prob; depth = sum(block_repeats))
     planes_vec = collect(planes_fn(block_repeats))
     # closure over `idxs`
     function get_layers(stage_idx::Integer, block_idx::Integer)
@@ -58,8 +58,8 @@ function bottleneck_builder(block_repeats::AbstractVector{<:Integer};
         stride = stride_fn(stage_idx, block_idx)
         downsample_fn = stride != 1 || inplanes != planes * expansion ?
                         downsample_tuple[1] : downsample_tuple[2]
-        drop_path = StochasticDepth(pathschedule[schedule_idx])
-        drop_block = DropBlock(blockschedule[schedule_idx])
+        drop_path = StochasticDepth(sdschedule[schedule_idx])
+        drop_block = DropBlock(dbschedule[schedule_idx])
         block = bottleneck(inplanes, planes; stride, cardinality, base_width,
                            reduction_factor, activation, norm_layer, revnorm,
                            attn_fn, drop_path, drop_block)
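The `schedule_idx` arithmetic shared by both builders flattens `(stage_idx, block_idx)` into a single index, so one linear schedule spans the whole network rather than restarting in each stage. A quick self-contained check of that arithmetic:

    block_repeats = [2, 2, 2, 2]  # a ResNet-18-style stage layout
    schedule_idx(s, b) = sum(block_repeats[1:(s - 1)]) + b

    schedule_idx(1, 1)  # 1 (the sum over an empty slice is 0)
    schedule_idx(3, 2)  # 6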

src/convnets/convnext.jl

Lines changed: 2 additions & 2 deletions
@@ -56,11 +56,11 @@ function convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:In
                                   norm_layer = ChannelLayerNorm, revnorm = true)...))
     end
     stages = []
-    dp_rates = linear_scheduler(stochastic_depth_prob; depth = sum(depths))
+    sdschedule = linear_scheduler(stochastic_depth_prob; depth = sum(depths))
     cur = 0
     for i in eachindex(depths)
         push!(stages,
-              [convnextblock(planes[i], dp_rates[cur + j], layerscale_init)
+              [convnextblock(planes[i], sdschedule[cur + j], layerscale_init)
               for j in 1:depths[i]])
         cur += depths[i]
     end

src/convnets/efficientnets/efficientnet.jl

Lines changed: 2 additions & 2 deletions
@@ -31,13 +31,13 @@ const EFFICIENTNET_GLOBAL_CONFIGS = Dict(:b0 => (224, (1.0, 1.0)),
                                          :b7 => (600, (2.0, 3.1)),
                                          :b8 => (672, (2.2, 3.6)))
 
-function efficientnet(config::Symbol; norm_layer = BatchNorm,
+function efficientnet(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2,
                       dropout_prob = nothing, inchannels::Integer = 3,
                       nclasses::Integer = 1000)
     _checkconfig(config, keys(EFFICIENTNET_GLOBAL_CONFIGS))
     scalings = EFFICIENTNET_GLOBAL_CONFIGS[config][2]
     return irmodelbuilder(scalings, EFFICIENTNET_BLOCK_CONFIGS; inplanes = 32,
-                          norm_layer, activation = swish,
+                          norm_layer, stochastic_depth_prob, activation = swish,
                           headplanes = EFFICIENTNET_BLOCK_CONFIGS[end][3] * 4,
                           dropout_prob, inchannels, nclasses)
 end
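A hedged usage sketch for the new keyword. `efficientnet` here is an internal, non-exported builder, and the repository is not named on this page (the file layout matches Metalhead.jl, which is assumed below); the exported model wrapper is assumed to forward these keywords:

    using Metalhead  # assumption: this commit is against Metalhead.jl
    m = Metalhead.efficientnet(:b0; stochastic_depth_prob = 0.1)  # milder drop-path
    b = Metalhead.efficientnet(:b0)  # picks up the new default of 0.2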

src/convnets/efficientnets/efficientnetv2.jl

Lines changed: 4 additions & 3 deletions
@@ -35,13 +35,14 @@ const EFFNETV2_CONFIGS = Dict(:small => [(fused_mbconv, 3, 24, 1, 1, 2, swish),
                               (mbconv, 3, 512, 6, 2, 32, 4, swish),
                               (mbconv, 3, 768, 6, 1, 8, 4, swish)])
 
-function efficientnetv2(config::Symbol; norm_layer = BatchNorm, dropout_prob = nothing,
-                        inchannels::Integer = 3, nclasses::Integer = 1000)
+function efficientnetv2(config::Symbol; norm_layer = BatchNorm, stochastic_depth_prob = 0.2,
+                        dropout_prob = nothing, inchannels::Integer = 3,
+                        nclasses::Integer = 1000)
     _checkconfig(config, keys(EFFNETV2_CONFIGS))
     block_configs = EFFNETV2_CONFIGS[config]
     return irmodelbuilder((1, 1), block_configs; activation = swish, norm_layer,
                           inplanes = block_configs[1][3], headplanes = 1280,
-                          dropout_prob, inchannels, nclasses)
+                          stochastic_depth_prob, dropout_prob, inchannels, nclasses)
 end
 
 """

src/layers/drop.jl

Lines changed: 0 additions & 18 deletions
@@ -62,15 +62,6 @@ It can be used in two ways: either with all blocks having the same survival prob
 or with a linear scaling rule across the blocks. This is performed only at training time.
 At test time, the `DropBlock` layer is equivalent to `identity`.
 
-!!! warning
-
-    In the case of the linear scaling rule, the calculations of survival probabilities for each
-    block may lead to a survival probability > 1 for a given block. This will lead to
-    `DropBlock` erroring. This usually happens with a low number of blocks and a high base
-    survival probability, so in such cases it is recommended to use a fixed base survival
-    probability across blocks. If this is not desired, then a lower base survival probability
-    is recommended.
-
 ([reference](https://arxiv.org/abs/1810.12890))
 
 # Arguments
@@ -141,15 +132,6 @@ all blocks having the same survival probability or with a linear scaling rule ac
 blocks. This is performed only at training time. At test time, the `StochasticDepth` layer is
 equivalent to `identity`.
 
-!!! warning
-
-    In the case of the linear scaling rule, the calculations of survival probabilities for each
-    block may lead to a survival probability > 1 for a given block. This will lead to
-    `StochasticDepth` erroring. This usually happens with a low number of blocks and a high base
-    survival probability, so in such cases it is recommended to use a fixed base survival
-    probability across blocks. If this is not desired, then a lower base survival probability
-    is recommended.
-
 # Arguments
 
 - `p`: probability of Stochastic Depth.
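Deleting these warnings is justified by the scheduler rework in src/utilities.jl later in this commit: the schedule now runs from `start_value` up to but excluding `drop_prob`, so a scheduled probability can never exceed the base value, let alone 1. A quick self-contained check of the new schedule body:

    probs = LinRange(0.0, 0.2, 8 + 1)[1:(end - 1)]  # linear_scheduler(0.2; depth = 8)
    all(0.0 .<= probs .< 0.2)  # true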

src/layers/mlp.jl

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# TODO @theabhirath figure out consistent behaviour for dropout rates - 0.0 vs `nothing`
+# TODO @theabhirath figure out consistent behaviour for dropout probs - 0.0 vs `nothing`
 """
     mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes;
               dropout_prob = 0., activation = gelu)

src/mixers/core.jl

Lines changed: 3 additions & 2 deletions
@@ -27,9 +27,10 @@ function mlpmixer(block, imsize::Dims{2} = (224, 224); norm_layer = LayerNorm,
                   depth::Integer = 12, inchannels::Integer = 3, nclasses::Integer = 1000,
                   kwargs...)
     npatches = prod(imsize .÷ patch_size)
-    dp_rates = linear_scheduler(stochastic_depth_prob; depth)
+    sdschedule = linear_scheduler(stochastic_depth_prob; depth)
     layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes),
-                   Chain([block(embedplanes, npatches; stochastic_depth_prob = dp_rates[i],
+                   Chain([block(embedplanes, npatches;
+                                stochastic_depth_prob = sdschedule[i],
                                 kwargs...)
                           for i in 1:depth]...))
     classifier = Chain(norm_layer(embedplanes), seconddimmean, Dense(embedplanes, nclasses))

src/utilities.jl

Lines changed: 9 additions & 8 deletions
@@ -66,17 +66,18 @@ function _maybe_big_show(io, model)
 end
 
 """
-    linear_scheduler(drop_rate = 0.0; start_value = 0.0, depth)
-    linear_scheduler(drop_rate::Nothing; depth::Integer)
+    linear_scheduler(drop_prob = 0.0; start_value = 0.0, depth)
+    linear_scheduler(drop_prob::Nothing; depth::Integer)
 
-Returns the dropout rates for a given depth using the linear scaling rule. If the
-`drop_rate` is `nothing`, it returns a `Vector` of length `depth` with all values
-equal to `nothing`.
+Returns the dropout probabilities for a given depth using the linear scaling rule. Note
+that this returns evenly spaced values between `start_value` and `drop_prob`, not including
+`drop_prob`. If `drop_prob` is `nothing`, it returns a `Vector` of length `depth` with all
+values equal to `nothing`.
 """
-function linear_scheduler(drop_rate = 0.0; depth::Integer, start_value = 0.0)
-    return LinRange(start_value, drop_rate, depth)
+function linear_scheduler(drop_prob = 0.0; depth::Integer, start_value = 0.0)
+    return LinRange(start_value, drop_prob, depth + 1)[1:(end - 1)]
 end
-linear_scheduler(drop_rate::Nothing; depth::Integer) = fill(drop_rate, depth)
+linear_scheduler(drop_prob::Nothing; depth::Integer) = fill(drop_prob, depth)
 
 # Utility function for depth and configuration checks in models
 function _checkconfig(config, configs)
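The docstring's new caveat (the endpoint is excluded) is easy to verify. A self-contained reimplementation matching the diff body above:

    linear_scheduler(drop_prob = 0.0; depth::Integer, start_value = 0.0) =
        LinRange(start_value, drop_prob, depth + 1)[1:(end - 1)]
    linear_scheduler(drop_prob::Nothing; depth::Integer) = fill(drop_prob, depth)

    collect(linear_scheduler(0.2; depth = 4))  # [0.0, 0.05, 0.1, 0.15]
    linear_scheduler(nothing; depth = 3)       # [nothing, nothing, nothing]

Because the endpoint is dropped, the deepest block gets probability `drop_prob * (depth - 1) / depth` rather than `drop_prob` itself.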
