
Commit 4e28377

bors[bot] and ToucheSir authored
Merge #1781
1781: Fix AlphaDropout implementation and add tests r=CarloLucibello a=ToucheSir

AFAICT, the original implementation never behaved as expected, even pre-Zygote. This was likely not caught because the original PR didn't come with tests, so this PR should remedy that. Behaviour and outputs are adapted from the PyTorch and TF implementations. Some points of note:

1. We have to special-case `p = 1` to avoid propagating NaNs when calculating `A` and `B`. TF just returns the input in this case, but I think the PyTorch approach of returning all zeros (+/- depending on the input sign) is more in line with `Dropout`.
2. Likewise, `p = 0` is special-cased and simply returns the input unchanged.
3. `ifelse` is used instead of something like https://github.com/keras-team/keras/blob/v2.7.0/keras/layers/noise.py#L200. I think it better reflects the conditional nature of the operation, and it was also faster in local benchmarking.

### PR Checklist

- [x] Tests are added
- [ ] Entry in NEWS.md

Co-authored-by: Brian Chen <ToucheSir@users.noreply.github.com>
2 parents 0a1ad37 + e249c5c commit 4e28377
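For context, the math described above can be written as a standalone sketch (the function name `alpha_dropout_sketch` is illustrative only; the actual change is the `(a::AlphaDropout)(x)` method in the `src/layers/normalise.jl` diff below):

```julia
using Random: rand!

# Rough sketch of the rescaled dropout from this PR: dropped entries are set to
# α′ = selu(-Inf) = -λα, then everything is rescaled by A and shifted by B so
# that a zero-mean, unit-variance input keeps those statistics in expectation.
function alpha_dropout_sketch(x::AbstractArray{T}, p) where T
  iszero(p) && return x                  # nothing dropped: pass through
  isone(p)  && return sign.(x) .* T(0)   # everything dropped: signed zeros;
                                         # also avoids A = inv(sqrt(0)) = Inf
  α′ = T(-1.7580993408473766)
  A  = T(inv(sqrt((1 - p) * (1 + p * α′^2))))
  B  = T(-A * α′ * p)
  noise = rand!(similar(x))              # uniform noise in [0, 1)
  return A .* ifelse.(noise .> p, x, α′) .+ B
end

# e.g. alpha_dropout_sketch(randn(10_000), 0.5) has mean ≈ 0 and var ≈ 1
```

The broadcasted `ifelse` keeps the kept/dropped choice a single fused elementwise operation, which is what point 3 above refers to.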

File tree: 4 files changed, +47 −20 lines

NEWS.md

Lines changed: 6 additions & 3 deletions
@@ -1,5 +1,8 @@
 # Flux Release Notes
 
+## v0.12.9
+* Fixed incorrect output and added GPU compatibility for [AlphaDropout](https://github.com/FluxML/Flux.jl/pull/1781).
+
 ## v0.12.8
 * Optimized inference and gradient calculation of OneHotMatrix[pr](https://github.com/FluxML/Flux.jl/pull/1756)
 
@@ -12,7 +15,7 @@
 * REPL printing via [`show`](https://github.com/FluxML/Flux.jl/pull/1467) displays parameter counts.
 
 ## v0.12.4
-* Implemented an [`Embedding layer`](https://github.com/FluxML/Flux.jl/pull/1516)
+* Implemented an [`Embedding layer`](https://github.com/FluxML/Flux.jl/pull/1516)
   based on `NNlib.gather` and `NNlib.scatter`.
 
 ## v0.12.1 - v0.12.3
@@ -37,8 +40,8 @@
 * New [`Parallel` layer](https://github.com/FluxML/Flux.jl/pull/1462) adds inception module-like building blocks.
 * Feature additions and bug fixes for BatchNorm, LayerNorm, InstanceNorm, and GroupNorm [normalization layers](https://github.com/FluxML/Flux.jl/pull/1397)
 * Added [Upsample and PixelShuffle layers](https://github.com/FluxML/Flux.jl/pull/1468)
-* End of deprecation cycle: loss functions cannot be accessed directly from `Flux` anymore, they live in the `Flux.Losses` module.
-  All loss functions perform `mean` aggregation by default.
+* End of deprecation cycle: loss functions cannot be accessed directly from `Flux` anymore, they live in the `Flux.Losses` module.
+  All loss functions perform `mean` aggregation by default.
 
 ## v0.11.2

src/layers/normalise.jl

Lines changed: 11 additions & 10 deletions
@@ -101,17 +101,18 @@ mutable struct AlphaDropout{F}
   end
 end
 
-function (a::AlphaDropout)(x)
+function (a::AlphaDropout)(x::AbstractArray{T}) where T
   _isactive(a) || return x
-  λ = eltype(x)(1.0507009873554804934193349852946)
-  α = eltype(x)(1.6732632423543772848170429916717)
-  α1 = eltype(x)(-λ*α)
-  noise = randn(eltype(x), size(x))
-  x = @. x*(noise > (1 - a.p)) + α1 * (noise < (1 - a.p))
-  A = sqrt(a.p + a.p * (1 - a.p) * α1^2)
-  B = -A * α1 * (1 - a.p)
-  x = @. A * x + B
-  return x
+  p = a.p
+  iszero(p) && return x
+  isone(p) && return sign.(x) .* T(0)
+
+  α′ = T(-1.7580993408473766) # selu(-Inf) == -λα
+  A = T(inv(sqrt((1 - p) * (1 + p * α′^2))))
+  B = T(-A * α′ * p)
+
+  noise = rand!(similar(x))
+  return A .* ifelse.(noise .> p, x, α′) .+ B
 end
 
 testmode!(m::AlphaDropout, mode=true) =
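A quick usage sketch of the updated layer (hypothetical model and sizes, assuming a Flux version with this fix, i.e. v0.12.9+; `AlphaDropout` is normally paired with `selu` activations):

```julia
using Flux

# Hypothetical SELU network; AlphaDropout preserves the self-normalizing
# activation statistics that plain Dropout would break.
m = Chain(Dense(784, 128, selu), AlphaDropout(0.2), Dense(128, 10))

x = randn(Float32, 784, 64)
Flux.trainmode!(m)        # activate dropout outside of a gradient call
y_train = m(x)

Flux.testmode!(m)         # dropout becomes a no-op
@assert m(x) == m(x)      # deterministic once dropout is off
```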

test/cuda/layers.jl

Lines changed: 2 additions & 7 deletions
@@ -10,13 +10,8 @@
   @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple
 end
 
-# TODO: These layers get into scalar indexing
-# `AlphaDropout` throws a compilation error on GPUs,
-# whereas, the rest are scalar indexing issues.
-# The norm layers behave differently on the CPU and
-# the GPU too.
-const BROKEN_LAYERS = Union{DepthwiseConv,
-                            AlphaDropout}
+# TODO: These layers get into scalar indexing issues.
+const BROKEN_LAYERS = Union{DepthwiseConv}
 
 const ACTIVATIONS = [identity, relu, tanh,
                      sigmoid, exp, softplus,
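With `AlphaDropout` removed from `BROKEN_LAYERS`, a GPU smoke test along these lines should now run without scalar indexing (a sketch only, assuming a working CUDA setup; not part of this diff):

```julia
using Flux, CUDA, Statistics

m = AlphaDropout(0.5)
Flux.trainmode!(m)                 # apply dropout outside of a gradient call

x = CUDA.randn(Float32, 10_000)
y = m(x)                           # rand!(similar(x)) and the broadcast stay on the GPU
@show mean(y) var(y)               # expect roughly 0 and 1, matching the CPU tests
```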

test/layers/normalisation.jl

Lines changed: 28 additions & 0 deletions
@@ -57,6 +57,34 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
   @test count(a->a == 0, y) == 0
 end
 
+@testset "AlphaDropout" begin
+  x = [1., 2., 3.]
+  @test x == AlphaDropout(0.1)(x)
+  @test x == evalwgrad(AlphaDropout(0), x)
+  @test zero(x) == evalwgrad(AlphaDropout(1), x)
+
+  x = randn(1000) # large enough to prevent flaky test
+  m = AlphaDropout(0.5)
+
+  y = evalwgrad(m, x)
+  # Should preserve unit mean and variance
+  @test mean(y) ≈ 0 atol=0.1
+  @test var(y) ≈ 1 atol=0.1
+
+  testmode!(m, true) # should override istraining
+  @test evalwgrad(m, x) == x
+
+  testmode!(m, false)
+  y = evalwgrad(m, x)
+  @test mean(y) ≈ 0 atol=0.1
+  @test var(y) ≈ 1 atol=0.1
+
+  # Known good value ranges
+  # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338
+  x = ones(100)
+  @test 40 < sum(evalwgrad(m, x)) < 130
+end
+
 @testset "BatchNorm" begin
   let m = BatchNorm(2), x = [1.0 3.0 5.0;
                              2.0 4.0 6.0]
