@@ -8,28 +8,31 @@ const RCR = RuleConfig{>:HasReverseMode}
@inline only_derivative(y, f::F, x) where F = only(only(ChainRulesCore.derivatives_given_output(y, f, x)))

# This has no methods, used for testing whether `derivatives_given_output(Ω, f, x)`
- # is independent of `x`, as `_return_type` says `Union{}` when calling is an error.
+ # is independent of `x`, as `_return_type` says `Union{}` when calling is an error.
struct NotaNumber <: Real end

"""
    bias_act!(σ, x, b)

- This is equivalent to `σ.(x .+ b)`, but faster because
- it will overwrite `x` to save memory (when possible) and
- replace `sigmoid` & `tanh` with `sigmoid_fast` & `tanh_fast`.
+ This is equivalent to `x .= σ.(x .+ b)`, also replacing `sigmoid` & `tanh`
+ with `sigmoid_fast` & `tanh_fast`.
+ It will only overwrite `x` when `x isa StridedArray{<:AbstractFloat}`.

- The best case requires `x isa StridedArray{<:AbstractFloat}`,
- and that the activation has a method of `derivatives_given_output`
- which does not need the input at all (such as `relu`, `tanh`).
+ When used within a gradient, it will overwrite only when `σ` has
+ a method of `derivatives_given_output` which does not need the input at all.
+ Such methods are defined by e.g. `@scalar_rule relu(x) Ω > 0` where the derivative
+ contains only `Ω` (the output), not `x`.

!!! warning
    This is not safe to use if `x` is still needed for the gradient
    of some other function. Incorrect use will give silently wrong answers.
+     It is intended mainly for Flux layers, in which the previous operation is
+     known to be safe, e.g. `bias_act!(σ, weight * input, bias)` for a `Dense` layer.
"""
bias_act!(σ::Function, x::AbstractArray, b) = fast_act(σ, x).(x .+ b)  # fallback

bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::AbstractArray{<:Union{Bool, AbstractFloat}}) =
-     fast_broadcast_plus!(fast_act(σ, x), x, b)  # hand-written version below.
+     _fast_broadcast!(fast_act(σ, x)∘(+), x, b)  # works around a SIMD bug

bias_act!(::typeof(identity), x::StridedArray{<:AbstractFloat}, b::Bool) =
    (@assert !b "bias=true is not accepted; layer constructors should guarantee this"; x)
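A minimal sketch (not from the PR) of the inference test that the `NotaNumber` comment above and the commented-out `add_act` rrule further down describe; the helper name `only_needs_output` is hypothetical, and `NNlib.NotaNumber` is the internal type defined in this hunk:

    using NNlib, ChainRulesCore

    # Hypothetical helper: if `derivatives_given_output` still infers a concrete type when
    # the primal input is the method-less `NotaNumber`, the rule cannot be reading `x`,
    # only the output `Ω`; otherwise inference returns `Union{}` and this gives `false`.
    only_needs_output(f, Ω::T) where {T<:Number} =
        isconcretetype(Core.Compiler._return_type(
            ChainRulesCore.derivatives_given_output, Tuple{T, typeof(f), NNlib.NotaNumber}))

    only_needs_output(relu, 1f0)   # expected `true`, since per the docstring relu's rule uses only `Ω > 0`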
@@ -89,197 +92,10 @@ function rrule(cfg::RCR, ::typeof(bias_act!), ::typeof(identity), x::AbstractArr
    end
    return bias_act!(identity, x, b), bias_act!_idback
end
+
function rrule(cfg::RCR, ::typeof(bias_act!), ::typeof(identity), x::AbstractArray{T,N}, b::Bool) where {T,N}
    bias_act!_trivial(Δ) = (NoTangent(), NoTangent(), Δ, NoTangent())
    return x, bias_act!_trivial
end

- """
99
- NNlib.fast_broadcast_plus!(f, x, b)
100
-
101
- This is equivalent to `x .= f.(x .+ b)`, but works around
102
- an issue with broadcasting that prevents SIMD in such cases.
103
-
104
- That can be removed once https://github.com/JuliaLang/julia/issues/43153 is fixed.
105
-
106
- Also has an `rrule` to prevent mutation inside 2nd-order AD.
107
-
108
- !!! warning
109
- Does not allow for derivatives with respect to `f`.
110
- """
111
- function fast_broadcast_plus! (f:: F , x:: Array{<:AbstractFloat} , b) where {F<: Function }
112
- if b === false
113
- @simd ivdep for I in eachindex (x)
114
- @inbounds x[I] = f (x[I])
115
- end
116
- else
117
- xplus = Broadcast. instantiate (Broadcast. broadcasted (+ , x, b))
118
- @simd ivdep for I in eachindex (xplus)
119
- @inbounds x[I] = f (xplus[I])
120
- end
121
- end
122
- return x
123
- end
124
- function fast_broadcast_plus! (f:: F , x:: StridedArray{<:AbstractFloat} , b) where {F<: Function }
125
- # CuArray has its own broadcasting.
126
- x .= f .(x .+ b)
127
- return x
128
- end
129
- function fast_broadcast_plus! (f:: F , x:: AbstractArray , b) where {F<: Function }
130
- # Don't try to write into weird arrays
131
- return f .(x .+ b)
132
- end
133
-
134
- function rrule (cfg:: RCR , :: typeof (fast_broadcast_plus!), f:: F , x:: AbstractArray{T,N} , b:: B ) where {F,T,N,B}
135
- rrule_via_ad (cfg, broadcast, (x,b) -> f .(x .+ b), x, b)
136
- end
137
-
138
-
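The hunk above replaces this hand-written helper with `_fast_broadcast!(fast_act(σ, x)∘(+), x, b)`. A hedged sketch of why that composition gives the same values, assuming the replacement `_fast_broadcast!(f, x, b)` computes `x .= f.(x, b)` in the same SIMD-friendly way this removed helper did; the test values are illustrative only:

    using NNlib

    σ = NNlib.fast_act(tanh, ones(Float32, 2))       # expected to pick `tanh_fast` for CPU float arrays
    @assert (σ ∘ (+))(1f0, 2f0) == σ(3f0)            # ∘ applies + first, then σ

    x, b = rand(Float32, 4, 3), rand(Float32, 4)
    y = NNlib._fast_broadcast!(σ ∘ (+), copy(x), b)  # the call used in the hunk above
    @assert y ≈ σ.(x .+ b)                           # same values as the fused broadcast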
- # """
- #     add_act(σ, x, y...)
- #     add_act!(σ, x, y, z...)
-
- # Equivalent to `σ.(x .+ y .+ z)`. The mutating method `add_act!`
- # """
- # add_act(σ::Function, x::AbstractArray, yz::AbstractArray...) = σ.(.+(x, yz...)) # fused
-
-
- # function ChainRulesCore.rrule(::typeof(add_act), σ::F, x::AbstractArray, yz::AbstractArray...) where {F,T,N}
- #     if isconcretetype(Core.Compiler._return_type(
- #             derivatives_given_output, Tuple{T, F, NotaNumber}))
-
- #     end
-
-
- # bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::Bool) =
- #     # b ? (x .= fast_act(σ, x).(x .+ b)) : (x .= fast_act(σ, x).(x))
- #     (@assert !b "bias=true is not accepted"; (x .= fast_act(σ, x).(x)))
-
-
- # using NNlib, BenchmarkTools
-
- #=
-
- ## M1 mac, 1.10
-
- julia> w, b = rand(Float32, 100, 10000), rand(Float32, 100);
-
- julia> @btime bias_act!(relu, $w, $b);
-   min 19.500 μs, mean 21.375 μs (0 allocations)
-
- julia> @btime relu.($w .+ $b);
-   min 17.208 μs, mean 62.826 μs (2 allocations, 390.67 KiB)
-
- julia> @btime bias_act!(tanh, $w, $b);
-   min 63.792 μs, mean 65.052 μs (0 allocations)
-
- julia> @btime tanh_fast.($w .+ $b);
-   min 63.583 μs, mean 102.004 μs (2 allocations, 390.67 KiB)
-
- julia> using Zygote
-
- julia> @btime gradient((w,b) -> sum(bias_act!(relu, w, b)), $w, $b);
-   min 145.166 μs, mean 150.785 μs (51 allocations, 2.18 KiB)
-
- julia> @btime gradient((w,b) -> sum(relu.(w .+ b)), $w, $b);
-   min 165.583 μs, mean 314.267 μs (32 allocations, 1.15 MiB)
-
- julia> @btime gradient((w,b) -> sum(bias_act!(tanh, w, b)), $w, $b);
-   min 191.917 μs, mean 195.956 μs (51 allocations, 2.18 KiB)
-
- julia> @btime gradient((w,b) -> sum(tanh_fast.(w .+ b)), $w, $b);
-   min 209.458 μs, mean 338.652 μs (32 allocations, 1.15 MiB)
-
-
-
- ## Cyclops
-
- julia> using CUDA # 10x bigger
-
- julia> cw, cb = CUDA.rand(Float32, 100, 100_00), CUDA.rand(Float32, 100);
-
- julia> @btime CUDA.@sync bias_act!(relu, $cw, $cb);
-   22.546 μs (27 allocations: 1.45 KiB)
-
- julia> @btime CUDA.@sync relu.($cw .+ $cb); # faster, that's odd?
-   31.282 μs (38 allocations: 1.81 KiB)
-
- julia> @btime CUDA.@sync bias_act!(tanh, $cw, $cb);
-   27.030 μs (27 allocations: 1.45 KiB)
-
- julia> @btime CUDA.@sync tanh_fast.($cw .+ $cb);
-   36.421 μs (38 allocations: 1.81 KiB)
-
- julia> using Zygote
-
- julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(relu, w, b)), $cw, $cb);
-   204.507 μs (382 allocations: 18.15 KiB)
-
- julia> @btime CUDA.@sync gradient((w,b) -> sum(relu.(w .+ b)), $cw, $cb);
-   204.458 μs (409 allocations: 19.19 KiB)
-
- julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(tanh, w, b)), $cw, $cb);
-   224.545 μs (382 allocations: 18.15 KiB)
-
- julia> @btime CUDA.@sync gradient((w,b) -> sum(tanh_fast.(w .+ b)), $cw, $cb);
-   204.793 μs (411 allocations: 19.30 KiB)
-
-
- =#
-
- #=
-
- (jl_fuwIi8) pkg> add https://github.com/mcabbott/NNlib.jl/tree/bias_act_23
-
- julia> using NNlib, Zygote, BenchmarkTools
-
- julia> w, b, x = rand(Float32, 50, 50), rand(Float32, 50), randn(Float32, 50, 100);
-
- julia> @btime bias_act!(relu, $w * $x, $b);
-   min 5.243 μs, mean 8.600 μs (2 allocations, 19.61 KiB)
-
- julia> @btime relu.($w * $x .+ $b);
-   min 5.160 μs, mean 10.863 μs (4 allocations, 39.22 KiB)
-
- julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b);
-   min 21.042 μs, mean 40.476 μs (43 allocations, 89.83 KiB)
-
- julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b);
-   min 21.542 μs, mean 43.947 μs (41 allocations, 128.91 KiB)
-
- julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x);
-   min 14.708 μs, mean 26.450 μs (28 allocations, 69.41 KiB)
-
- julia> @btime gradient(x -> sum(abs2, x), $x);
-   min 1.938 μs, mean 4.160 μs (2 allocations, 19.61 KiB)
-
-
- # Cyclops
-
- julia> @btime bias_act!(relu, $w * $x, $b);
-   24.786 μs (2 allocations: 19.61 KiB)
-
- julia> @btime relu.($w * $x .+ $b);
-   25.501 μs (4 allocations: 39.22 KiB)
-
- julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b);
-   91.847 μs (43 allocations: 89.83 KiB)
-
- julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b);
-   98.054 μs (41 allocations: 128.91 KiB)
-
- julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x);
-   80.464 μs (28 allocations: 69.41 KiB)
-
- julia> @btime gradient(x -> sum(abs2, x), $x);
-   4.604 μs (2 allocations: 19.61 KiB)
-
- julia> @time using CUDA; @time cu(ones(3)) .+ 1;
-
- julia> w, b, x = CUDA.rand(Float32, 1000, 1000), CUDA.rand(Float32, 1000), CUDA.rand(Float32, 1000, 1000);
-
-
-
- =#
-
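For reference, a usage sketch exercising only what the `bias_act!` docstring above promises (not part of the PR; array sizes are arbitrary):

    using NNlib

    x, b = randn(Float32, 4, 3), randn(Float32, 4)
    y = bias_act!(relu, copy(x), b)                    # may overwrite the (copied) strided input
    @assert y ≈ relu.(x .+ b)                          # the documented equivalence
    @assert bias_act!(identity, copy(x), false) == x   # identity with bias=false is a no-op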