From 554f33987fcb66ef23c29005ff672545d3d779d6 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 2 Jan 2023 14:56:55 -0500 Subject: [PATCH 01/11] sometimes-in-place bias_act --- src/NNlib.jl | 3 + src/bias_act.jl | 285 +++++++++++++++++++++++++++++++++++++++++++++++ test/bias_act.jl | 41 +++++++ test/runtests.jl | 2 + 4 files changed, 331 insertions(+) create mode 100644 src/bias_act.jl create mode 100644 test/bias_act.jl diff --git a/src/NNlib.jl b/src/NNlib.jl index 8b0d3d5d5..8450a0261 100644 --- a/src/NNlib.jl +++ b/src/NNlib.jl @@ -72,6 +72,9 @@ export conv, conv!, ∇conv_data, ∇conv_data!, ∇conv_filter, include("conv_bias_act.jl") export conv_bias_act, conv_bias_act! +include("bias_act.jl") +export bias_act! + include("fold.jl") include("ctc.jl") diff --git a/src/bias_act.jl b/src/bias_act.jl new file mode 100644 index 000000000..46a6355a1 --- /dev/null +++ b/src/bias_act.jl @@ -0,0 +1,285 @@ + +using NNlib: fast_act, tanh_fast +using ChainRulesCore + +const RCR = RuleConfig{>:HasReverseMode} + +# This just saves typing `only.(only.(` many times: +@inline only_derivative(y,f::F,x) where F = only(only(ChainRulesCore.derivatives_given_output(y, f, x))) + +# This has no methods, used for testing whether `derivatives_given_output(Ω, f, x)` +# is independent of `x`, as `_return_type` says `Union{}` when calling is an error. +struct NotaNumber <: Real end + +""" + bias_act!(σ, x, b) + +This is equivalent to `σ.(x .+ b)`, but faster because +it will overwrite `x` to save memory (when possible) and +replace `sigmoid` & `tanh` with `sigmoid_fast` & `tanh_fast`. + +The best case requires `x isa StridedArray{<:AbstractFloat}`, +and that the activation has a method of `derivatives_given_output` +which does not need the input at all (such as `relu`, `tanh`). + +!!! warning + This is not safe to use if `x` is still needed for the gradient + of some other function. Incorrect use will give silently wrong answers. +""" +bias_act!(σ::Function, x::AbstractArray, b) = fast_act(σ, x).(x .+ b) # fallback + +bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::AbstractArray{<:Union{Bool, AbstractFloat}}) = + fast_broadcast_plus!(fast_act(σ, x), x, b) # hand-written version below. + +bias_act!(::typeof(identity), x::StridedArray{<:AbstractFloat}, b::Bool) = + (@assert !b "bias=true is not accepted; layer constructors shoud guarantee this"; x) + + +function ChainRulesCore.rrule(cfg::RCR, ::typeof(bias_act!), σ::F, x::AbstractArray{T,N}, b::B) where {F,T,N,B} + if eltype(B) !== Bool + b_dims = ntuple(d -> size(b,d)==1 ? d : N+1, N) + size_b = size(b) + end + + # Fast path: it is now safe to overwrite x, since this is not needed for gradient of σ + if isconcretetype(Core.Compiler._return_type(only_derivative, Tuple{T, F, NotaNumber})) + Ω = bias_act!(σ, x, b) # now x === Ω, when x isa StridedArray{<:AbstractFloat} + function bias_act!_fastback(Δ) + # Tempting to overwrite x again, but only safe if you call pullback at most once, + # TODO with e.g. https://github.com/FluxML/Zygote.jl/pull/1340 + # https://github.com/JuliaDiff/ChainRulesCore.jl/pull/592 + dx = only_derivative.(Ω, σ, NotaNumber()) .* unthunk(Δ) + db = eltype(B) === Bool ? 
NoTangent() : reshape(sum(dx; dims = b_dims), size_b) + return (NoTangent(), NoTangent(), dx, db) + end + return Ω, bias_act!_fastback + + # # Slower path: can't overwrite x, but can use derivatives_given_output + # # This case is WRONG and tests fail, but not sure why + # elseif isconcretetype(Core.Compiler._return_type(only_derivative, Tuple{T, F, T})) + # Ω2 = fast_act(σ, x).(x) .+ b + # @show σ b + # function bias_act!_back2(Δ) + # dx = only_derivative.(Ω2, σ, x .+ b) .* unthunk(Δ) + # db = eltype(B) === Bool ? NoTangent() : reshape(sum(dx; dims = b_dims), size_b) + # return (NoTangent(), NoTangent(), dx, db) + # end + # return Ω2, bias_act!_back2 + + # Fallback path: let AD handle the broadcast + else + Ω3, back = rrule_via_ad(cfg, broadcast, fast_act(σ, x), bias_act!(identity, x, b)) + @inline function bias_act!_slowback(Δ) + _, _, dx = back(Δ) + db = eltype(B) === Bool ? NoTangent() : reshape(sum(dx; dims = b_dims), size_b) + return (NoTangent(), NoTangent(), dx, db) + end + return Ω3, bias_act!_slowback + end +end + +# Two easy cases +function rrule(cfg::RCR, ::typeof(bias_act!), ::typeof(identity), x::AbstractArray{T,N}, b::B) where {T,N,B} + b_dims = ntuple(d -> size(b,d)==1 ? d : N+1, N) + size_b = size(b) + function bias_act!_idback(Δ) + dx = unthunk(Δ) + db = reshape(sum(dx; dims = b_dims), size_b) + return (NoTangent(), NoTangent(), dx, db) + end + return bias_act!(identity, x, b), bias_act!_idback +end +function rrule(cfg::RCR, ::typeof(bias_act!), ::typeof(identity), x::AbstractArray{T,N}, b::Bool) where {T,N} + bias_act!_trivial(Δ) = (NoTangent(), NoTangent(), Δ, NoTangent()) + return x, bias_act!_trivial +end + + +""" + NNlib.fast_broadcast_plus!(f, x, b) + +This is equivalent to `x .= f.(x .+ b)`, but works around +an issue with broadcasting that prevents SIMD in such cases. + +That can be removed once https://github.com/JuliaLang/julia/issues/43153 is fixed. + +Also has an `rrule` to prevent mutation inside 2nd-order AD. + +!!! warning + Does not allow for derivatives with respect to `f`. +""" +function fast_broadcast_plus!(f::F, x::Array{<:AbstractFloat}, b) where {F<:Function} + if b === false + @simd ivdep for I in eachindex(x) + @inbounds x[I] = f(x[I]) + end + else + xplus = Broadcast.instantiate(Broadcast.broadcasted(+, x, b)) + @simd ivdep for I in eachindex(xplus) + @inbounds x[I] = f(xplus[I]) + end + end + return x +end +function fast_broadcast_plus!(f::F, x::StridedArray{<:AbstractFloat}, b) where {F<:Function} + # CuArray has its own broadcasting. + x .= f.(x .+ b) + return x +end +function fast_broadcast_plus!(f::F, x::AbstractArray, b) where {F<:Function} + # Don't try to write into weird arrays + return f.(x .+ b) +end + +function rrule(cfg::RCR, ::typeof(fast_broadcast_plus!), f::F, x::AbstractArray{T,N}, b::B) where {F,T,N,B} + rrule_via_ad(cfg, broadcast, (x,b) -> f.(x .+ b), x, b) +end + + +# """ +# add_act(σ, x, y...) +# add_act!(σ, x, y, z...) + +# Equivalent to `σ.(x .+ y .+ z)`. The mutating method `add_act!` +# """ +# add_act(σ::Function, x::AbstractArray, yz::AbstractArray...) = σ.(.+(x, yz...)) # fused + + +# function ChainRulesCore.rrule(::typeof(add_act), σ::F, x::AbstractArray, yz::AbstractArray...) where {F,T,N} +# if isconcretetype(Core.Compiler._return_type( +# derivatives_given_output, Tuple{T, F, NotaNumber})) + +# end + + +# bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::Bool) = +# # b ? 
(x .= fast_act(σ, x).(x .+ b)) : (x .= fast_act(σ, x).(x)) +# (@assert !b "bias=true is not accepted"; (x .= fast_act(σ, x).(x))) + + +# using NNlib, BenchmarkTools + +#= + +## M1 mac, 1.10 + +julia> w, b = rand(Float32, 100, 10000), rand(Float32, 100); + +julia> @btime bias_act!(relu, $w, $b); + min 19.500 μs, mean 21.375 μs (0 allocations) + +julia> @btime relu.($w .+ $b); + min 17.208 μs, mean 62.826 μs (2 allocations, 390.67 KiB) + +julia> @btime bias_act!(tanh, $w, $b); + min 63.792 μs, mean 65.052 μs (0 allocations) + +julia> @btime tanh_fast.($w .+ $b); + min 63.583 μs, mean 102.004 μs (2 allocations, 390.67 KiB) + +julia> using Zygote + +julia> @btime gradient((w,b) -> sum(bias_act!(relu, w, b)), $w, $b); + min 145.166 μs, mean 150.785 μs (51 allocations, 2.18 KiB) + +julia> @btime gradient((w,b) -> sum(relu.(w .+ b)), $w, $b); + min 165.583 μs, mean 314.267 μs (32 allocations, 1.15 MiB) + +julia> @btime gradient((w,b) -> sum(bias_act!(tanh, w, b)), $w, $b); + min 191.917 μs, mean 195.956 μs (51 allocations, 2.18 KiB) + +julia> @btime gradient((w,b) -> sum(tanh_fast.(w .+ b)), $w, $b); + min 209.458 μs, mean 338.652 μs (32 allocations, 1.15 MiB) + + + +## Cyclops + +julia> using CUDA # 10x bigger + +julia> cw, cb = CUDA.rand(Float32, 100, 100_00), CUDA.rand(Float32, 100); + +julia> @btime CUDA.@sync bias_act!(relu, $cw, $cb); + 22.546 μs (27 allocations: 1.45 KiB) + +julia> @btime CUDA.@sync relu.($cw .+ $cb); # faster, that's odd? + 31.282 μs (38 allocations: 1.81 KiB) + +julia> @btime CUDA.@sync bias_act!(tanh, $cw, $cb); + 27.030 μs (27 allocations: 1.45 KiB) + +julia> @btime CUDA.@sync tanh_fast.($cw .+ $cb); + 36.421 μs (38 allocations: 1.81 KiB) + +julia> using Zygote + +julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(relu, w, b)), $cw, $cb); + 204.507 μs (382 allocations: 18.15 KiB) + +julia> @btime CUDA.@sync gradient((w,b) -> sum(relu.(w .+ b)), $cw, $cb); + 204.458 μs (409 allocations: 19.19 KiB) + +julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(tanh, w, b)), $cw, $cb); + 224.545 μs (382 allocations: 18.15 KiB) + +julia> @btime CUDA.@sync gradient((w,b) -> sum(tanh_fast.(w .+ b)), $cw, $cb); + 204.793 μs (411 allocations: 19.30 KiB) + + +=# + +#= + +(jl_fuwIi8) pkg> add https://github.com/mcabbott/NNlib.jl/tree/bias_act_23 + +julia> using NNlib, Zygote, BenchmarkTools + +julia> w, b, x = rand(Float32, 50, 50), rand(Float32, 50), randn(Float32, 50, 100); + +julia> @btime bias_act!(relu, $w * $x, $b); + min 5.243 μs, mean 8.600 μs (2 allocations, 19.61 KiB) + +julia> @btime relu.($w * $x .+ $b); + min 5.160 μs, mean 10.863 μs (4 allocations, 39.22 KiB) + +julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b); + min 21.042 μs, mean 40.476 μs (43 allocations, 89.83 KiB) + +julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b); + min 21.542 μs, mean 43.947 μs (41 allocations, 128.91 KiB) + +julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x); + min 14.708 μs, mean 26.450 μs (28 allocations, 69.41 KiB) + +julia> @btime gradient(x -> sum(abs2, x), $x); + min 1.938 μs, mean 4.160 μs (2 allocations, 19.61 KiB) + + +# Cyclops + +julia> @btime bias_act!(relu, $w * $x, $b); + 24.786 μs (2 allocations: 19.61 KiB) + +julia> @btime relu.($w * $x .+ $b); + 25.501 μs (4 allocations: 39.22 KiB) + +julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b); + 91.847 μs (43 allocations: 89.83 KiB) + +julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b); + 98.054 μs (41 
allocations: 128.91 KiB) + +julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x); + 80.464 μs (28 allocations: 69.41 KiB) + +julia> @btime gradient(x -> sum(abs2, x), $x); + 4.604 μs (2 allocations: 19.61 KiB) + +julia> @time using CUDA; @time cu(ones(3)) .+ 1; + +julia> w, b, x = CUDA.rand(Float32, 1000, 1000), CUDA.rand(Float32, 1000), CUDA.rand(Float32, 1000, 1000); + + + +=# + diff --git a/test/bias_act.jl b/test/bias_act.jl new file mode 100644 index 000000000..2c955ff74 --- /dev/null +++ b/test/bias_act.jl @@ -0,0 +1,41 @@ +using NNlib, Zygote, Test +using Zygote: ForwardDiff + +ACTIVATION_FUNCTIONS = + [@eval($a) for a in NNlib.ACTIVATIONS] + +@testset "bias_act!" begin + x = randn(3,4) + b = randn(3) + @test bias_act!(identity, copy(x), b) ≈ (x .+ b) + @test bias_act!(relu, copy(x), b) ≈ relu.(x .+ b) + @test bias_act!(tanh, copy(x), b) ≈ tanh.(x .+ b) + + @testset "gradient with $fun" for fun in vcat([identity, tanh, cbrt], + ACTIVATION_FUNCTIONS, + [x->x, x -> 1/(x^2+2), x -> leakyrelu(x, 0.33)]) + # Only some of these go the fast path, `cbrt` is an example of a function NNlib knows nothing about. + fun == rrelu && continue # this one is randomised! + + @test bias_act!(fun, copy(x), b) ≈ fun.(x .+ b) + @test bias_act!(fun, copy(x), false) ≈ fun.(x) + + gx = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), b)), x) + @test gx ≈ Zygote.gradient(x -> sum(bias_act!(fun, copy(x), b)), x)[1] + + gx2 = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), false)), x) + @test gx2 ≈ Zygote.gradient(x -> sum(bias_act!(fun, copy(x), false)), x)[1] + + gb = ForwardDiff.gradient(b -> sum(bias_act!(fun, copy(x), b)), b) + @test gb ≈ Zygote.gradient(b -> sum(bias_act!(fun, copy(x), b)), b)[1] + + @test Zygote.gradient(b -> sum(bias_act!(fun, copy(x), b)), false) == (nothing,) + @test Zygote.gradient(b -> sum(bias_act!(fun, copy(x), b)), b .> 0) == (nothing,) + end + + @testset "gradient for fast_broadcast_plus!" 
begin + # Gradient definition is just to disable mutation inside 2nd order AD + gx = ForwardDiff.gradient(x -> sum(NNlib.fast_broadcast_plus!(cbrt, copy(x), b)), x) + @test gx ≈ Zygote.gradient(x -> sum(NNlib.fast_broadcast_plus!(cbrt, copy(x), b)), x)[1] + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 31db3a84f..0f52934c9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -101,6 +101,7 @@ end @info "Skipping CUDA tests, set NNLIB_TEST_CUDA=true to run them" end +<<<<<<< HEAD if get(ENV, "NNLIB_TEST_AMDGPU", "false") == "true" import Pkg test_info = Pkg.project() @@ -127,6 +128,7 @@ end @testset "Activation Functions" begin include("activations.jl") + include("bias_act.jl") end @testset "Attention" begin From 13ab2e7f3d322de100babc6d61f44b820fdd931c Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 5 Jan 2023 10:56:48 -0500 Subject: [PATCH 02/11] update after dropout PR --- src/bias_act.jl | 208 +++-------------------------------------------- src/dropout.jl | 21 ----- src/utils.jl | 30 ++++++- test/bias_act.jl | 39 +++++++-- 4 files changed, 75 insertions(+), 223 deletions(-) diff --git a/src/bias_act.jl b/src/bias_act.jl index 46a6355a1..2a7b93805 100644 --- a/src/bias_act.jl +++ b/src/bias_act.jl @@ -8,28 +8,31 @@ const RCR = RuleConfig{>:HasReverseMode} @inline only_derivative(y,f::F,x) where F = only(only(ChainRulesCore.derivatives_given_output(y, f, x))) # This has no methods, used for testing whether `derivatives_given_output(Ω, f, x)` -# is independent of `x`, as `_return_type` says `Union{}` when calling is an error. +# is independent of `x`, as `_return_type` says `Union{}` when calling is an error. struct NotaNumber <: Real end """ bias_act!(σ, x, b) -This is equivalent to `σ.(x .+ b)`, but faster because -it will overwrite `x` to save memory (when possible) and -replace `sigmoid` & `tanh` with `sigmoid_fast` & `tanh_fast`. +This is equivalent to `x .= σ.(x .+ b)`, also replacing `sigmoid` & `tanh` +with `sigmoid_fast` & `tanh_fast`. +It will only overwrite `x` when `x isa StridedArray{<:AbstractFloat}`. -The best case requires `x isa StridedArray{<:AbstractFloat}`, -and that the activation has a method of `derivatives_given_output` -which does not need the input at all (such as `relu`, `tanh`). +When used within a gradient, it will overwrite only when `σ` has +a method of `derivatives_given_output` which does not need the input at all. +Such methods are defined by e.g. `@scalar_rule relu(x) Ω > 0` where the derivative +contains only `Ω` (the output) not `x`. !!! warning This is not safe to use if `x` is still needed for the gradient of some other function. Incorrect use will give silently wrong answers. + It is intended mainly for Flux layers, in which the previous operation is + known to be safe, e.g. `bias_act!(σ, weight * input, bias)` for a `Dense` layer. """ bias_act!(σ::Function, x::AbstractArray, b) = fast_act(σ, x).(x .+ b) # fallback bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::AbstractArray{<:Union{Bool, AbstractFloat}}) = - fast_broadcast_plus!(fast_act(σ, x), x, b) # hand-written version below. 
+ _fast_broadcast!(fast_act(σ, x)∘(+), x, b) # works around a SIMD bug bias_act!(::typeof(identity), x::StridedArray{<:AbstractFloat}, b::Bool) = (@assert !b "bias=true is not accepted; layer constructors shoud guarantee this"; x) @@ -89,197 +92,10 @@ function rrule(cfg::RCR, ::typeof(bias_act!), ::typeof(identity), x::AbstractArr end return bias_act!(identity, x, b), bias_act!_idback end + function rrule(cfg::RCR, ::typeof(bias_act!), ::typeof(identity), x::AbstractArray{T,N}, b::Bool) where {T,N} bias_act!_trivial(Δ) = (NoTangent(), NoTangent(), Δ, NoTangent()) return x, bias_act!_trivial end -""" - NNlib.fast_broadcast_plus!(f, x, b) - -This is equivalent to `x .= f.(x .+ b)`, but works around -an issue with broadcasting that prevents SIMD in such cases. - -That can be removed once https://github.com/JuliaLang/julia/issues/43153 is fixed. - -Also has an `rrule` to prevent mutation inside 2nd-order AD. - -!!! warning - Does not allow for derivatives with respect to `f`. -""" -function fast_broadcast_plus!(f::F, x::Array{<:AbstractFloat}, b) where {F<:Function} - if b === false - @simd ivdep for I in eachindex(x) - @inbounds x[I] = f(x[I]) - end - else - xplus = Broadcast.instantiate(Broadcast.broadcasted(+, x, b)) - @simd ivdep for I in eachindex(xplus) - @inbounds x[I] = f(xplus[I]) - end - end - return x -end -function fast_broadcast_plus!(f::F, x::StridedArray{<:AbstractFloat}, b) where {F<:Function} - # CuArray has its own broadcasting. - x .= f.(x .+ b) - return x -end -function fast_broadcast_plus!(f::F, x::AbstractArray, b) where {F<:Function} - # Don't try to write into weird arrays - return f.(x .+ b) -end - -function rrule(cfg::RCR, ::typeof(fast_broadcast_plus!), f::F, x::AbstractArray{T,N}, b::B) where {F,T,N,B} - rrule_via_ad(cfg, broadcast, (x,b) -> f.(x .+ b), x, b) -end - - -# """ -# add_act(σ, x, y...) -# add_act!(σ, x, y, z...) - -# Equivalent to `σ.(x .+ y .+ z)`. The mutating method `add_act!` -# """ -# add_act(σ::Function, x::AbstractArray, yz::AbstractArray...) = σ.(.+(x, yz...)) # fused - - -# function ChainRulesCore.rrule(::typeof(add_act), σ::F, x::AbstractArray, yz::AbstractArray...) where {F,T,N} -# if isconcretetype(Core.Compiler._return_type( -# derivatives_given_output, Tuple{T, F, NotaNumber})) - -# end - - -# bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::Bool) = -# # b ? 
(x .= fast_act(σ, x).(x .+ b)) : (x .= fast_act(σ, x).(x)) -# (@assert !b "bias=true is not accepted"; (x .= fast_act(σ, x).(x))) - - -# using NNlib, BenchmarkTools - -#= - -## M1 mac, 1.10 - -julia> w, b = rand(Float32, 100, 10000), rand(Float32, 100); - -julia> @btime bias_act!(relu, $w, $b); - min 19.500 μs, mean 21.375 μs (0 allocations) - -julia> @btime relu.($w .+ $b); - min 17.208 μs, mean 62.826 μs (2 allocations, 390.67 KiB) - -julia> @btime bias_act!(tanh, $w, $b); - min 63.792 μs, mean 65.052 μs (0 allocations) - -julia> @btime tanh_fast.($w .+ $b); - min 63.583 μs, mean 102.004 μs (2 allocations, 390.67 KiB) - -julia> using Zygote - -julia> @btime gradient((w,b) -> sum(bias_act!(relu, w, b)), $w, $b); - min 145.166 μs, mean 150.785 μs (51 allocations, 2.18 KiB) - -julia> @btime gradient((w,b) -> sum(relu.(w .+ b)), $w, $b); - min 165.583 μs, mean 314.267 μs (32 allocations, 1.15 MiB) - -julia> @btime gradient((w,b) -> sum(bias_act!(tanh, w, b)), $w, $b); - min 191.917 μs, mean 195.956 μs (51 allocations, 2.18 KiB) - -julia> @btime gradient((w,b) -> sum(tanh_fast.(w .+ b)), $w, $b); - min 209.458 μs, mean 338.652 μs (32 allocations, 1.15 MiB) - - - -## Cyclops - -julia> using CUDA # 10x bigger - -julia> cw, cb = CUDA.rand(Float32, 100, 100_00), CUDA.rand(Float32, 100); - -julia> @btime CUDA.@sync bias_act!(relu, $cw, $cb); - 22.546 μs (27 allocations: 1.45 KiB) - -julia> @btime CUDA.@sync relu.($cw .+ $cb); # faster, that's odd? - 31.282 μs (38 allocations: 1.81 KiB) - -julia> @btime CUDA.@sync bias_act!(tanh, $cw, $cb); - 27.030 μs (27 allocations: 1.45 KiB) - -julia> @btime CUDA.@sync tanh_fast.($cw .+ $cb); - 36.421 μs (38 allocations: 1.81 KiB) - -julia> using Zygote - -julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(relu, w, b)), $cw, $cb); - 204.507 μs (382 allocations: 18.15 KiB) - -julia> @btime CUDA.@sync gradient((w,b) -> sum(relu.(w .+ b)), $cw, $cb); - 204.458 μs (409 allocations: 19.19 KiB) - -julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(tanh, w, b)), $cw, $cb); - 224.545 μs (382 allocations: 18.15 KiB) - -julia> @btime CUDA.@sync gradient((w,b) -> sum(tanh_fast.(w .+ b)), $cw, $cb); - 204.793 μs (411 allocations: 19.30 KiB) - - -=# - -#= - -(jl_fuwIi8) pkg> add https://github.com/mcabbott/NNlib.jl/tree/bias_act_23 - -julia> using NNlib, Zygote, BenchmarkTools - -julia> w, b, x = rand(Float32, 50, 50), rand(Float32, 50), randn(Float32, 50, 100); - -julia> @btime bias_act!(relu, $w * $x, $b); - min 5.243 μs, mean 8.600 μs (2 allocations, 19.61 KiB) - -julia> @btime relu.($w * $x .+ $b); - min 5.160 μs, mean 10.863 μs (4 allocations, 39.22 KiB) - -julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b); - min 21.042 μs, mean 40.476 μs (43 allocations, 89.83 KiB) - -julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b); - min 21.542 μs, mean 43.947 μs (41 allocations, 128.91 KiB) - -julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x); - min 14.708 μs, mean 26.450 μs (28 allocations, 69.41 KiB) - -julia> @btime gradient(x -> sum(abs2, x), $x); - min 1.938 μs, mean 4.160 μs (2 allocations, 19.61 KiB) - - -# Cyclops - -julia> @btime bias_act!(relu, $w * $x, $b); - 24.786 μs (2 allocations: 19.61 KiB) - -julia> @btime relu.($w * $x .+ $b); - 25.501 μs (4 allocations: 39.22 KiB) - -julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b); - 91.847 μs (43 allocations: 89.83 KiB) - -julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b); - 98.054 μs (41 
allocations: 128.91 KiB) - -julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x); - 80.464 μs (28 allocations: 69.41 KiB) - -julia> @btime gradient(x -> sum(abs2, x), $x); - 4.604 μs (2 allocations: 19.61 KiB) - -julia> @time using CUDA; @time cu(ones(3)) .+ 1; - -julia> w, b, x = CUDA.rand(Float32, 1000, 1000), CUDA.rand(Float32, 1000), CUDA.rand(Float32, 1000, 1000); - - - -=# - diff --git a/src/dropout.jl b/src/dropout.jl index 86bcb6c6f..02673cf03 100644 --- a/src/dropout.jl +++ b/src/dropout.jl @@ -125,27 +125,6 @@ end # and seems about as fast, but needs a method `copy(::CUDA.RNG)` and careful checking. # https://github.com/FluxML/NNlib.jl/pull/454#issuecomment-1369357402 -""" - _fast_broadcast!(f, x, y, z...) - -This does `x .= f.(x, y, z...)`, but works around -an issue with broadcasting that prevents SIMD in such cases. -Can be removed once https://github.com/JuliaLang/julia/issues/43153 is fixed. - -Not intended for general use. Does not check sizes! -""" -function _fast_broadcast!(f::F, x::Array, yz...) where {F<:Function} - bc = Broadcast.instantiate(Broadcast.broadcasted(f, x, yz...)) - @simd ivdep for I in eachindex(bc) - @inbounds x[I] = bc[I] - end - return x -end -function _fast_broadcast!(f::F, x::AbstractArray, yz...) where {F<:Function} - # CUDA does not suffer from this bug - broadcast!(f, x, x, yz...) -end - """ _rng_from_array(x) diff --git a/src/utils.jl b/src/utils.jl index 6d16b1cb1..e244022de 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -105,4 +105,32 @@ function reverse_indices(idx::AbstractArray{<:Any,N}) where N return reverse_indices!(rev, idx) end -unsqueeze(x) = reshape(x, 1, size(x)...) +unsqueeze(x) = reshape(x, 1, size(x)...) + + +""" + _fast_broadcast!(f, x, y, z...) + +This does `x .= f.(x, y, z...)`, but works around +an issue with broadcasting that prevents SIMD in such cases. +Can be removed once https://github.com/JuliaLang/julia/issues/43153 is fixed. + +Not intended for general use. Uses `@inbounds` but does not check sizes! + +Has an `rrule` to avoid mutation within derivatives. This assumes that `f` has no derivative! +""" +function _fast_broadcast!(f::F, x::Array, yz...) where {F<:Function} + bc = Broadcast.instantiate(Broadcast.broadcasted(f, x, yz...)) + @simd ivdep for I in eachindex(bc) + @inbounds x[I] = bc[I] + end + return x +end +function _fast_broadcast!(f::F, x::AbstractArray, yz...) where {F<:Function} + # CUDA does not suffer from this bug + broadcast!(f, x, x, yz...) +end + +function rrule(cfg::RuleConfig{>:HasReverseMode}, ::typeof(_fast_broadcast!), f::F, x::AbstractArray, ys...) where {F<:Function} + rrule_via_ad(cfg, broadcast, f, x, ys...) +end diff --git a/test/bias_act.jl b/test/bias_act.jl index 2c955ff74..c4d682559 100644 --- a/test/bias_act.jl +++ b/test/bias_act.jl @@ -1,7 +1,7 @@ using NNlib, Zygote, Test using Zygote: ForwardDiff -ACTIVATION_FUNCTIONS = +ACTIVATION_FUNCTIONS = [@eval($a) for a in NNlib.ACTIVATIONS] @testset "bias_act!" begin @@ -11,7 +11,7 @@ ACTIVATION_FUNCTIONS = @test bias_act!(relu, copy(x), b) ≈ relu.(x .+ b) @test bias_act!(tanh, copy(x), b) ≈ tanh.(x .+ b) - @testset "gradient with $fun" for fun in vcat([identity, tanh, cbrt], + @testset "gradient with $fun" for fun in vcat([identity, tanh, cbrt], ACTIVATION_FUNCTIONS, [x->x, x -> 1/(x^2+2), x -> leakyrelu(x, 0.33)]) # Only some of these go the fast path, `cbrt` is an example of a function NNlib knows nothing about. 
@@ -33,9 +33,38 @@ ACTIVATION_FUNCTIONS = @test Zygote.gradient(b -> sum(bias_act!(fun, copy(x), b)), b .> 0) == (nothing,) end - @testset "gradient for fast_broadcast_plus!" begin + @testset "gradient for fast_broadcast!" begin # Gradient definition is just to disable mutation inside 2nd order AD - gx = ForwardDiff.gradient(x -> sum(NNlib.fast_broadcast_plus!(cbrt, copy(x), b)), x) - @test gx ≈ Zygote.gradient(x -> sum(NNlib.fast_broadcast_plus!(cbrt, copy(x), b)), x)[1] + gx = ForwardDiff.gradient(x -> sum(NNlib._fast_broadcast!(cbrt∘(+), copy(x), b)), x) + @test gx ≈ Zygote.gradient(x -> sum(NNlib._fast_broadcast!(cbrt∘(+), copy(x), b)), x)[1] + + # relu should take the fast path + g2 = ForwardDiff.gradient(x) do x + sum(abs2, Zygote.gradient(x -> sum(abs2, bias_act!(relu, copy(x), b)), x)[1]) + end + @test_broken gx ≈ Zygote.gradient(x) do x + sum(abs2,Zygote. gradient(x -> sum(abs2, bias_act!(relu, copy(x), b)), x)[1]) + end + # Can't differentiate foreigncall expression $(Expr(:foreigncall, :(:jl_eqtable_get), Any, svec(Any, Any, Any), 0, :(:ccall), %5, %3, %4)). + # [5] (::typeof(∂(accum_global)))(Δ::Nothing) + @test g2 ≈ Zygote.gradient(x, b) do x, b + sum(abs2, Zygote.gradient((x, b) -> sum(abs2, bias_act!(relu, copy(x), b)), x, b)[1]) + end[1] + + g3 = ForwardDiff.gradient(x) do x + sum(abs2, Zygote.gradient((x, b) -> sum(abs2, bias_act!(swish, copy(x), b)), x, b)[1]) + end + @test g3 ≈ Zygote.gradient(x, b) do x, b + sum(abs2, Zygote.gradient((x, b) -> sum(abs2, bias_act!(swish, copy(x), b)), x, b)[1]) + end[1] + + # Anon function sure to take the generic path + g4 = ForwardDiff.gradient(x) do x + sum(abs2, Zygote.gradient((x, b) -> sum(abs2, bias_act!(y -> cbrt(y/3), copy(x), b)), x, b)[1]) + end + @test g4 ≈ Zygote.gradient(x, b) do x, b + sum(abs2, Zygote.gradient((x, b) -> sum(abs2, bias_act!(y -> cbrt(y/3), copy(x), b)), x, b)[1]) + end[1] end end + From f08fc0bfebdd00163b8c13c59348dc092c2c9640 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 5 Jan 2023 11:27:50 -0500 Subject: [PATCH 03/11] add to docs --- docs/src/reference.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/reference.md b/docs/src/reference.md index fe0ad2d8e..e45273977 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -152,4 +152,5 @@ ctc_loss logsumexp NNlib.glu NNlib.within_gradient +bias_act! ``` From a9136e722cbc0ad594aa68a5d412e228be07a081 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 5 Jan 2023 12:02:47 -0500 Subject: [PATCH 04/11] also fix two unrelated docstring which just told you what the function was called without explaining anything --- src/utils.jl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index e244022de..41264ada5 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -53,15 +53,21 @@ ChainRulesCore.rrule(::typeof(within_gradient), x) = true, _ -> (NoTangent(), No """ safe_div(x, y) -Safely divide `x` by `y`. If `y` is zero, return `x` directly. +Returns `x/y` unless `y==0`, in which case it just returns `x`. +(Used internally by `scatter`.) """ safe_div(x, y) = ifelse(iszero(y), x, x/y) """ maximum_dims(dims) -Return the maximum value for each dimension. An array of dimensions `dims` is accepted. -The maximum of each dimension in the element is computed. 
+Given an array of `CartesianIndex{N}` or `NTuple{N,Int}`, +returns a tuple containing the maximum of the 1st entries, +the 2nd, and so on up to `N`. + +Given an array of integers, returns `(maximum(dims),)`. + +(These arguments are what [`scatter`](@ref NNlib.scatter) understands.) """ maximum_dims(dims::AbstractArray{<:Integer}) = (maximum(dims), ) maximum_dims(dims::AbstractArray{NTuple{N, T}}) where {N,T} = ntuple(i -> maximum(x->x[i], dims), N) From 83882ded8cf2cc60f4bcac2461e3ae90ea2e9495 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 6 Jan 2023 20:14:15 -0500 Subject: [PATCH 05/11] tidy & un-comment --- docs/src/reference.md | 1 + src/bias_act.jl | 196 ++++++++++++++++++++++++++++++++++++------ src/utils.jl | 4 +- 3 files changed, 175 insertions(+), 26 deletions(-) diff --git a/docs/src/reference.md b/docs/src/reference.md index e45273977..c01db6b24 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -75,6 +75,7 @@ pad_zeros ## Convolution `Flux`'s `Conv` and `CrossCor` layers use `NNlib.DenseConvDims` and `NNlib.conv` internally. + `NNlib.conv` supports complex datatypes on CPU and CUDA devices. !!! AMDGPU MIOpen supports only cross-correlation (flipkernel=true). diff --git a/src/bias_act.jl b/src/bias_act.jl index 2a7b93805..715e5fcd2 100644 --- a/src/bias_act.jl +++ b/src/bias_act.jl @@ -39,9 +39,12 @@ bias_act!(::typeof(identity), x::StridedArray{<:AbstractFloat}, b::Bool) = function ChainRulesCore.rrule(cfg::RCR, ::typeof(bias_act!), σ::F, x::AbstractArray{T,N}, b::B) where {F,T,N,B} - if eltype(B) !== Bool - b_dims = ntuple(d -> size(b,d)==1 ? d : N+1, N) - size_b = size(b) + biasgrad = if eltype(B) !== Bool + # Summing over ndims(x)+1 is a trick to make b_dims type-stable + dims = ntuple(d -> size(b,d)==1 ? d : N+1, N) + _biasgrad(dx) = reshape(sum(dx; dims), size(b)) + else + Returns(NoTangent()) end # Fast path: it is now safe to overwrite x, since this is not needed for gradient of σ @@ -52,50 +55,195 @@ function ChainRulesCore.rrule(cfg::RCR, ::typeof(bias_act!), σ::F, x::AbstractA # TODO with e.g. https://github.com/FluxML/Zygote.jl/pull/1340 # https://github.com/JuliaDiff/ChainRulesCore.jl/pull/592 dx = only_derivative.(Ω, σ, NotaNumber()) .* unthunk(Δ) - db = eltype(B) === Bool ? NoTangent() : reshape(sum(dx; dims = b_dims), size_b) - return (NoTangent(), NoTangent(), dx, db) + return (NoTangent(), NoTangent(), dx, biasgrad(dx)) end return Ω, bias_act!_fastback - # # Slower path: can't overwrite x, but can use derivatives_given_output - # # This case is WRONG and tests fail, but not sure why - # elseif isconcretetype(Core.Compiler._return_type(only_derivative, Tuple{T, F, T})) - # Ω2 = fast_act(σ, x).(x) .+ b - # @show σ b - # function bias_act!_back2(Δ) - # dx = only_derivative.(Ω2, σ, x .+ b) .* unthunk(Δ) - # db = eltype(B) === Bool ? 
NoTangent() : reshape(sum(dx; dims = b_dims), size_b) - # return (NoTangent(), NoTangent(), dx, db) - # end - # return Ω2, bias_act!_back2 + # Slower path: can't overwrite x, but can use derivatives_given_output + # This case is WRONG and tests fail, but not sure why + elseif isconcretetype(Core.Compiler._return_type(only_derivative, Tuple{T, F, T})) + Ω2 = fast_act(σ, x).(x) .+ b + @show σ b + function bias_act!_back2(Δ) + dx = only_derivative.(Ω2, σ, x .+ b) .* unthunk(Δ) + return (NoTangent(), NoTangent(), dx, biasgrad(dx)) + end + return Ω2, bias_act!_back2 # Fallback path: let AD handle the broadcast else Ω3, back = rrule_via_ad(cfg, broadcast, fast_act(σ, x), bias_act!(identity, x, b)) @inline function bias_act!_slowback(Δ) _, _, dx = back(Δ) - db = eltype(B) === Bool ? NoTangent() : reshape(sum(dx; dims = b_dims), size_b) - return (NoTangent(), NoTangent(), dx, db) + return (NoTangent(), NoTangent(), dx, biasgrad(dx)) end return Ω3, bias_act!_slowback end end -# Two easy cases +# Two easy cases with identity function rrule(cfg::RCR, ::typeof(bias_act!), ::typeof(identity), x::AbstractArray{T,N}, b::B) where {T,N,B} - b_dims = ntuple(d -> size(b,d)==1 ? d : N+1, N) - size_b = size(b) + dims = ntuple(d -> size(b,d)==1 ? d : N+1, N) + biasgrad(dx) = reshape(sum(dx; dims), size(b)) function bias_act!_idback(Δ) dx = unthunk(Δ) - db = reshape(sum(dx; dims = b_dims), size_b) - return (NoTangent(), NoTangent(), dx, db) + return (NoTangent(), NoTangent(), dx, biasgrad(dx)) end return bias_act!(identity, x, b), bias_act!_idback end - function rrule(cfg::RCR, ::typeof(bias_act!), ::typeof(identity), x::AbstractArray{T,N}, b::Bool) where {T,N} bias_act!_trivial(Δ) = (NoTangent(), NoTangent(), Δ, NoTangent()) return x, bias_act!_trivial end + +# """ +# add_act(σ, x, y...) +# add_act!(σ, x, y, z...) + +# Equivalent to `σ.(x .+ y .+ z)`. The mutating method `add_act!` +# """ +# add_act(σ::Function, x::AbstractArray, yz::AbstractArray...) = σ.(.+(x, yz...)) # fused + + +# function ChainRulesCore.rrule(::typeof(add_act), σ::F, x::AbstractArray, yz::AbstractArray...) where {F,T,N} +# if isconcretetype(Core.Compiler._return_type( +# derivatives_given_output, Tuple{T, F, NotaNumber})) + +# end + + +# bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::Bool) = +# # b ? 
(x .= fast_act(σ, x).(x .+ b)) : (x .= fast_act(σ, x).(x)) +# (@assert !b "bias=true is not accepted"; (x .= fast_act(σ, x).(x))) + + +# using NNlib, BenchmarkTools + +#= + +## M1 mac, 1.10 + +julia> w, b = rand(Float32, 100, 10000), rand(Float32, 100); + +julia> @btime bias_act!(relu, $w, $b); + min 19.500 μs, mean 21.375 μs (0 allocations) + +julia> @btime relu.($w .+ $b); + min 17.208 μs, mean 62.826 μs (2 allocations, 390.67 KiB) + +julia> @btime bias_act!(tanh, $w, $b); + min 63.792 μs, mean 65.052 μs (0 allocations) + +julia> @btime tanh_fast.($w .+ $b); + min 63.583 μs, mean 102.004 μs (2 allocations, 390.67 KiB) + +julia> using Zygote + +julia> @btime gradient((w,b) -> sum(bias_act!(relu, w, b)), $w, $b); + min 145.166 μs, mean 150.785 μs (51 allocations, 2.18 KiB) + +julia> @btime gradient((w,b) -> sum(relu.(w .+ b)), $w, $b); + min 165.583 μs, mean 314.267 μs (32 allocations, 1.15 MiB) + +julia> @btime gradient((w,b) -> sum(bias_act!(tanh, w, b)), $w, $b); + min 191.917 μs, mean 195.956 μs (51 allocations, 2.18 KiB) + +julia> @btime gradient((w,b) -> sum(tanh_fast.(w .+ b)), $w, $b); + min 209.458 μs, mean 338.652 μs (32 allocations, 1.15 MiB) + + + +## Cyclops + +julia> using CUDA # 10x bigger + +julia> cw, cb = CUDA.rand(Float32, 100, 100_00), CUDA.rand(Float32, 100); + +julia> @btime CUDA.@sync bias_act!(relu, $cw, $cb); + 22.546 μs (27 allocations: 1.45 KiB) + +julia> @btime CUDA.@sync relu.($cw .+ $cb); # faster, that's odd? + 31.282 μs (38 allocations: 1.81 KiB) + +julia> @btime CUDA.@sync bias_act!(tanh, $cw, $cb); + 27.030 μs (27 allocations: 1.45 KiB) + +julia> @btime CUDA.@sync tanh_fast.($cw .+ $cb); + 36.421 μs (38 allocations: 1.81 KiB) + +julia> using Zygote + +julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(relu, w, b)), $cw, $cb); + 204.507 μs (382 allocations: 18.15 KiB) + +julia> @btime CUDA.@sync gradient((w,b) -> sum(relu.(w .+ b)), $cw, $cb); + 204.458 μs (409 allocations: 19.19 KiB) + +julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(tanh, w, b)), $cw, $cb); + 224.545 μs (382 allocations: 18.15 KiB) + +julia> @btime CUDA.@sync gradient((w,b) -> sum(tanh_fast.(w .+ b)), $cw, $cb); + 204.793 μs (411 allocations: 19.30 KiB) + + +=# + +#= + +(jl_fuwIi8) pkg> add https://github.com/mcabbott/NNlib.jl/tree/bias_act_23 + +julia> using NNlib, Zygote, BenchmarkTools + +julia> w, b, x = rand(Float32, 50, 50), rand(Float32, 50), randn(Float32, 50, 100); + +julia> @btime bias_act!(relu, $w * $x, $b); + min 5.243 μs, mean 8.600 μs (2 allocations, 19.61 KiB) + +julia> @btime relu.($w * $x .+ $b); + min 5.160 μs, mean 10.863 μs (4 allocations, 39.22 KiB) + +julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b); + min 21.042 μs, mean 40.476 μs (43 allocations, 89.83 KiB) + +julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b); + min 21.542 μs, mean 43.947 μs (41 allocations, 128.91 KiB) + +julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x); + min 14.708 μs, mean 26.450 μs (28 allocations, 69.41 KiB) + +julia> @btime gradient(x -> sum(abs2, x), $x); + min 1.938 μs, mean 4.160 μs (2 allocations, 19.61 KiB) + + +# Cyclops + +julia> @btime bias_act!(relu, $w * $x, $b); + 24.786 μs (2 allocations: 19.61 KiB) + +julia> @btime relu.($w * $x .+ $b); + 25.501 μs (4 allocations: 39.22 KiB) + +julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b); + 91.847 μs (43 allocations: 89.83 KiB) + +julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b); + 98.054 μs (41 
allocations: 128.91 KiB) + +julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x); + 80.464 μs (28 allocations: 69.41 KiB) + +julia> @btime gradient(x -> sum(abs2, x), $x); + 4.604 μs (2 allocations: 19.61 KiB) + +julia> @time using CUDA; @time cu(ones(3)) .+ 1; + +julia> w, b, x = CUDA.rand(Float32, 1000, 1000), CUDA.rand(Float32, 1000), CUDA.rand(Float32, 1000, 1000); + + + +=# + + + diff --git a/src/utils.jl b/src/utils.jl index 41264ada5..542b520be 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -62,8 +62,8 @@ safe_div(x, y) = ifelse(iszero(y), x, x/y) maximum_dims(dims) Given an array of `CartesianIndex{N}` or `NTuple{N,Int}`, -returns a tuple containing the maximum of the 1st entries, -the 2nd, and so on up to `N`. +returns a tuple containing the maximum of all the 1st entries, +all the 2nd entries, and so on up to `N`. Given an array of integers, returns `(maximum(dims),)`. From 419725ce9a84653ccc95ac647611bb43350548ea Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 09:26:47 -0500 Subject: [PATCH 06/11] comment out 2nd path again --- src/bias_act.jl | 171 +++--------------------------------------------- src/utils.jl | 9 ++- 2 files changed, 16 insertions(+), 164 deletions(-) diff --git a/src/bias_act.jl b/src/bias_act.jl index 715e5fcd2..43eac3a00 100644 --- a/src/bias_act.jl +++ b/src/bias_act.jl @@ -59,16 +59,16 @@ function ChainRulesCore.rrule(cfg::RCR, ::typeof(bias_act!), σ::F, x::AbstractA end return Ω, bias_act!_fastback - # Slower path: can't overwrite x, but can use derivatives_given_output - # This case is WRONG and tests fail, but not sure why - elseif isconcretetype(Core.Compiler._return_type(only_derivative, Tuple{T, F, T})) - Ω2 = fast_act(σ, x).(x) .+ b - @show σ b - function bias_act!_back2(Δ) - dx = only_derivative.(Ω2, σ, x .+ b) .* unthunk(Δ) - return (NoTangent(), NoTangent(), dx, biasgrad(dx)) - end - return Ω2, bias_act!_back2 + # # Slower path: can't overwrite x, but can use derivatives_given_output + # # This case is WRONG and tests fail, but not sure why + # elseif isconcretetype(Core.Compiler._return_type(only_derivative, Tuple{T, F, T})) + # Ω2 = fast_act(σ, x).(x) .+ b + # @show σ b + # function bias_act!_back2(Δ) + # dx = only_derivative.(Ω2, σ, x .+ b) .* unthunk(Δ) + # return (NoTangent(), NoTangent(), dx, biasgrad(dx)) + # end + # return Ω2, bias_act!_back2 # Fallback path: let AD handle the broadcast else @@ -96,154 +96,3 @@ function rrule(cfg::RCR, ::typeof(bias_act!), ::typeof(identity), x::AbstractArr return x, bias_act!_trivial end - - -# """ -# add_act(σ, x, y...) -# add_act!(σ, x, y, z...) - -# Equivalent to `σ.(x .+ y .+ z)`. The mutating method `add_act!` -# """ -# add_act(σ::Function, x::AbstractArray, yz::AbstractArray...) = σ.(.+(x, yz...)) # fused - - -# function ChainRulesCore.rrule(::typeof(add_act), σ::F, x::AbstractArray, yz::AbstractArray...) where {F,T,N} -# if isconcretetype(Core.Compiler._return_type( -# derivatives_given_output, Tuple{T, F, NotaNumber})) - -# end - - -# bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::Bool) = -# # b ? 
(x .= fast_act(σ, x).(x .+ b)) : (x .= fast_act(σ, x).(x)) -# (@assert !b "bias=true is not accepted"; (x .= fast_act(σ, x).(x))) - - -# using NNlib, BenchmarkTools - -#= - -## M1 mac, 1.10 - -julia> w, b = rand(Float32, 100, 10000), rand(Float32, 100); - -julia> @btime bias_act!(relu, $w, $b); - min 19.500 μs, mean 21.375 μs (0 allocations) - -julia> @btime relu.($w .+ $b); - min 17.208 μs, mean 62.826 μs (2 allocations, 390.67 KiB) - -julia> @btime bias_act!(tanh, $w, $b); - min 63.792 μs, mean 65.052 μs (0 allocations) - -julia> @btime tanh_fast.($w .+ $b); - min 63.583 μs, mean 102.004 μs (2 allocations, 390.67 KiB) - -julia> using Zygote - -julia> @btime gradient((w,b) -> sum(bias_act!(relu, w, b)), $w, $b); - min 145.166 μs, mean 150.785 μs (51 allocations, 2.18 KiB) - -julia> @btime gradient((w,b) -> sum(relu.(w .+ b)), $w, $b); - min 165.583 μs, mean 314.267 μs (32 allocations, 1.15 MiB) - -julia> @btime gradient((w,b) -> sum(bias_act!(tanh, w, b)), $w, $b); - min 191.917 μs, mean 195.956 μs (51 allocations, 2.18 KiB) - -julia> @btime gradient((w,b) -> sum(tanh_fast.(w .+ b)), $w, $b); - min 209.458 μs, mean 338.652 μs (32 allocations, 1.15 MiB) - - - -## Cyclops - -julia> using CUDA # 10x bigger - -julia> cw, cb = CUDA.rand(Float32, 100, 100_00), CUDA.rand(Float32, 100); - -julia> @btime CUDA.@sync bias_act!(relu, $cw, $cb); - 22.546 μs (27 allocations: 1.45 KiB) - -julia> @btime CUDA.@sync relu.($cw .+ $cb); # faster, that's odd? - 31.282 μs (38 allocations: 1.81 KiB) - -julia> @btime CUDA.@sync bias_act!(tanh, $cw, $cb); - 27.030 μs (27 allocations: 1.45 KiB) - -julia> @btime CUDA.@sync tanh_fast.($cw .+ $cb); - 36.421 μs (38 allocations: 1.81 KiB) - -julia> using Zygote - -julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(relu, w, b)), $cw, $cb); - 204.507 μs (382 allocations: 18.15 KiB) - -julia> @btime CUDA.@sync gradient((w,b) -> sum(relu.(w .+ b)), $cw, $cb); - 204.458 μs (409 allocations: 19.19 KiB) - -julia> @btime CUDA.@sync gradient((w,b) -> sum(bias_act!(tanh, w, b)), $cw, $cb); - 224.545 μs (382 allocations: 18.15 KiB) - -julia> @btime CUDA.@sync gradient((w,b) -> sum(tanh_fast.(w .+ b)), $cw, $cb); - 204.793 μs (411 allocations: 19.30 KiB) - - -=# - -#= - -(jl_fuwIi8) pkg> add https://github.com/mcabbott/NNlib.jl/tree/bias_act_23 - -julia> using NNlib, Zygote, BenchmarkTools - -julia> w, b, x = rand(Float32, 50, 50), rand(Float32, 50), randn(Float32, 50, 100); - -julia> @btime bias_act!(relu, $w * $x, $b); - min 5.243 μs, mean 8.600 μs (2 allocations, 19.61 KiB) - -julia> @btime relu.($w * $x .+ $b); - min 5.160 μs, mean 10.863 μs (4 allocations, 39.22 KiB) - -julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b); - min 21.042 μs, mean 40.476 μs (43 allocations, 89.83 KiB) - -julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b); - min 21.542 μs, mean 43.947 μs (41 allocations, 128.91 KiB) - -julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x); - min 14.708 μs, mean 26.450 μs (28 allocations, 69.41 KiB) - -julia> @btime gradient(x -> sum(abs2, x), $x); - min 1.938 μs, mean 4.160 μs (2 allocations, 19.61 KiB) - - -# Cyclops - -julia> @btime bias_act!(relu, $w * $x, $b); - 24.786 μs (2 allocations: 19.61 KiB) - -julia> @btime relu.($w * $x .+ $b); - 25.501 μs (4 allocations: 39.22 KiB) - -julia> @btime gradient((w,x,b) -> sum(abs2, bias_act!(relu, w*x, b)), $w, $x, $b); - 91.847 μs (43 allocations: 89.83 KiB) - -julia> @btime gradient((w,x,b) -> sum(abs2, relu.(w*x .+ b)), $w, $x, $b); - 98.054 μs (41 
allocations: 128.91 KiB) - -julia> @btime gradient((w,x) -> sum(abs2, w*x), $w, $x); - 80.464 μs (28 allocations: 69.41 KiB) - -julia> @btime gradient(x -> sum(abs2, x), $x); - 4.604 μs (2 allocations: 19.61 KiB) - -julia> @time using CUDA; @time cu(ones(3)) .+ 1; - -julia> w, b, x = CUDA.rand(Float32, 1000, 1000), CUDA.rand(Float32, 1000), CUDA.rand(Float32, 1000, 1000); - - - -=# - - - diff --git a/src/utils.jl b/src/utils.jl index 542b520be..49f08d6d6 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -119,11 +119,14 @@ unsqueeze(x) = reshape(x, 1, size(x)...) This does `x .= f.(x, y, z...)`, but works around an issue with broadcasting that prevents SIMD in such cases. -Can be removed once https://github.com/JuliaLang/julia/issues/43153 is fixed. +Can perhaps be removed once https://github.com/JuliaLang/julia/issues/43153 is fixed. -Not intended for general use. Uses `@inbounds` but does not check sizes! +Has an `rrule` to avoid mutation within derivatives. -Has an `rrule` to avoid mutation within derivatives. This assumes that `f` has no derivative! +!!! warning + Not intended for general use. + Uses `@inbounds` but does not check sizes! + Assumes that `f` has no derivative! """ function _fast_broadcast!(f::F, x::Array, yz...) where {F<:Function} bc = Broadcast.instantiate(Broadcast.broadcasted(f, x, yz...)) From dbf39d46651e533001baf65b04158519e4d52844 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 09:27:33 -0500 Subject: [PATCH 07/11] add Returns for 1.6 --- src/utils.jl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/utils.jl b/src/utils.jl index 49f08d6d6..3d23e7383 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -143,3 +143,22 @@ end function rrule(cfg::RuleConfig{>:HasReverseMode}, ::typeof(_fast_broadcast!), f::F, x::AbstractArray, ys...) where {F<:Function} rrule_via_ad(cfg, broadcast, f, x, ys...) end + +# Could get this from Compat.jl instead +# https://github.com/JuliaLang/julia/pull/39794 +if VERSION < v"1.7.0-DEV.793" + struct Returns{V} <: Function + value::V + Returns{V}(value) where {V} = new{V}(value) + Returns(value) = new{Core.Typeof(value)}(value) + end + + (obj::Returns)(args...; kw...) = obj.value + function Base.show(io::IO, obj::Returns) + show(io, typeof(obj)) + print(io, "(") + show(io, obj.value) + print(io, ")") + end +end + From 791531a644be217db4c6d447ba213c176078bb73 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 11:58:27 -0500 Subject: [PATCH 08/11] upgrade tests --- src/bias_act.jl | 17 +++++++++++++---- test/bias_act.jl | 42 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/src/bias_act.jl b/src/bias_act.jl index 43eac3a00..ef7fb29d9 100644 --- a/src/bias_act.jl +++ b/src/bias_act.jl @@ -29,14 +29,23 @@ contains only `Ω` (the output) not `x`. It is intended mainly for Flux layers, in which the previous operation is known to be safe, e.g. `bias_act!(σ, weight * input, bias)` for a `Dense` layer. 
""" -bias_act!(σ::Function, x::AbstractArray, b) = fast_act(σ, x).(x .+ b) # fallback - bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::AbstractArray{<:Union{Bool, AbstractFloat}}) = _fast_broadcast!(fast_act(σ, x)∘(+), x, b) # works around a SIMD bug -bias_act!(::typeof(identity), x::StridedArray{<:AbstractFloat}, b::Bool) = - (@assert !b "bias=true is not accepted; layer constructors shoud guarantee this"; x) +function bias_act!(σ::Function, x::StridedArray{<:AbstractFloat}, b::Bool) + b === true && error("bias=true is not accepted; layer constructors shoud guarantee this") + _fast_broadcast!(fast_act(σ, x), x) +end +function bias_act!(::typeof(identity), x::StridedArray{<:AbstractFloat}, b::Bool) + b === true && error("bias=true is not accepted; layer constructors shoud guarantee this") + x # pass-through +end + +function bias_act!(σ::Function, x::AbstractArray, b) + b === true && error("bias=true is not accepted; layer constructors shoud guarantee this") + fast_act(σ, x).(x .+ b) # fallback +end function ChainRulesCore.rrule(cfg::RCR, ::typeof(bias_act!), σ::F, x::AbstractArray{T,N}, b::B) where {F,T,N,B} biasgrad = if eltype(B) !== Bool diff --git a/test/bias_act.jl b/test/bias_act.jl index c4d682559..31dcb0487 100644 --- a/test/bias_act.jl +++ b/test/bias_act.jl @@ -7,9 +7,33 @@ ACTIVATION_FUNCTIONS = @testset "bias_act!" begin x = randn(3,4) b = randn(3) - @test bias_act!(identity, copy(x), b) ≈ (x .+ b) - @test bias_act!(relu, copy(x), b) ≈ relu.(x .+ b) - @test bias_act!(tanh, copy(x), b) ≈ tanh.(x .+ b) + @test @inferred(bias_act!(identity, x, false)) === x # pass-through + @test @inferred(bias_act!(identity, copy(x), b)) ≈ (x .+ b) + @test @inferred(bias_act!(relu, copy(x), b)) ≈ relu.(x .+ b) + @test @inferred(bias_act!(tanh, copy(x), b)) ≈ tanh.(x .+ b) + @test @inferred(bias_act!(tanh, copy(x), false)) ≈ tanh.(x) + + # Check that it does overwrite: + x32 = rand(Float32, 3, 4) + x32copy = copy(x32) + @test @inferred(bias_act!(cbrt, x32, b)) ≈ cbrt.(x32copy .+ b) + @test x32 ≈ cbrt.(x32copy .+ b) + x32 = rand(Float32, 3, 4) + x32copy = copy(x32) + @test @inferred(bias_act!(tanh, x32, false)) ≈ tanh.(x32copy) + @test x32 ≈ tanh.(x32copy) + + # Check that it doesn't try to overwrite non-float arrays: + xint = rand(-3:3, 3, 4) + bint = rand(-2:2, 3) + @test bias_act!(identity, copy(xint), bint) ≈ xint .+ bint + @test bias_act!(tanh, copy(xint), bint) ≈ tanh.(xint .+ bint) + @test bias_act!(tanh, copy(xint), false) ≈ tanh.(xint) + + # Reject bias===true so that Bool means one thing: + @test_throws Exception bias_act!(identity, rand(3), true) + @test_throws Exception bias_act!(cbrt, rand(3), true) + @test_throws Exception bias_act!(cbrt, rand(1:3, 3), true) @testset "gradient with $fun" for fun in vcat([identity, tanh, cbrt], ACTIVATION_FUNCTIONS, @@ -21,9 +45,21 @@ ACTIVATION_FUNCTIONS = @test bias_act!(fun, copy(x), false) ≈ fun.(x) gx = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), b)), x) + gxplus = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), b)), x .+ eps()) + gxminus = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), b)), x .- eps()) + if !(gx ≈ gxplus ≈ gxminus) + @warn "skipping gradient tests due to discontinuity" fun x b + continue + end @test gx ≈ Zygote.gradient(x -> sum(bias_act!(fun, copy(x), b)), x)[1] gx2 = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), false)), x) + gx2plus = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), false)), x .- eps()) + gx2minus = ForwardDiff.gradient(x -> sum(bias_act!(fun, copy(x), false)), 
x .- eps()) + if !(gx2 ≈ gx2plus ≈ gx2minus) + @warn "skipping gradient tests due to discontinuity" fun x + continue + end @test gx2 ≈ Zygote.gradient(x -> sum(bias_act!(fun, copy(x), false)), x)[1] gb = ForwardDiff.gradient(b -> sum(bias_act!(fun, copy(x), b)), b) From 7b04b15f15793d8534428adbff7b141bea4db42d Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 12:48:46 -0500 Subject: [PATCH 09/11] more tests --- test/bias_act.jl | 23 +++++++++++++++-------- test/runtests.jl | 1 - 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/test/bias_act.jl b/test/bias_act.jl index 31dcb0487..110f1a24e 100644 --- a/test/bias_act.jl +++ b/test/bias_act.jl @@ -1,4 +1,4 @@ -using NNlib, Zygote, Test +using NNlib, Zygote, ChainRulesCore, Test using Zygote: ForwardDiff ACTIVATION_FUNCTIONS = @@ -14,14 +14,21 @@ ACTIVATION_FUNCTIONS = @test @inferred(bias_act!(tanh, copy(x), false)) ≈ tanh.(x) # Check that it does overwrite: - x32 = rand(Float32, 3, 4) - x32copy = copy(x32) + x32 = rand(Float32, 3, 4); x32copy = copy(x32) @test @inferred(bias_act!(cbrt, x32, b)) ≈ cbrt.(x32copy .+ b) - @test x32 ≈ cbrt.(x32copy .+ b) - x32 = rand(Float32, 3, 4) - x32copy = copy(x32) + @test x32 ≈ cbrt.(x32copy .+ b) + + x32 = rand(Float32, 3, 4); x32copy = copy(x32) # without bias @test @inferred(bias_act!(tanh, x32, false)) ≈ tanh.(x32copy) - @test x32 ≈ tanh.(x32copy) + @test x32 ≈ tanh.(x32copy) + + x32 = rand(Float32, 3, 4); x32copy = copy(x32) # now check gradient rule + y, back = rrule(Zygote.ZygoteRuleConfig(), bias_act!, relu, x32, b) + @test y ≈ x32 ≈ relu.(x32copy .+ b) + + x32 = rand(Float32, 3, 4); x32copy = copy(x32) # without bias + y, back = rrule(Zygote.ZygoteRuleConfig(), bias_act!, relu, x32, false) + @test y ≈ x32 ≈ relu.(x32copy) # Check that it doesn't try to overwrite non-float arrays: xint = rand(-3:3, 3, 4) @@ -78,7 +85,7 @@ ACTIVATION_FUNCTIONS = g2 = ForwardDiff.gradient(x) do x sum(abs2, Zygote.gradient(x -> sum(abs2, bias_act!(relu, copy(x), b)), x)[1]) end - @test_broken gx ≈ Zygote.gradient(x) do x + @test_skip gx ≈ Zygote.gradient(x) do x # Here global variable b causes an error sum(abs2,Zygote. gradient(x -> sum(abs2, bias_act!(relu, copy(x), b)), x)[1]) end # Can't differentiate foreigncall expression $(Expr(:foreigncall, :(:jl_eqtable_get), Any, svec(Any, Any, Any), 0, :(:ccall), %5, %3, %4)). diff --git a/test/runtests.jl b/test/runtests.jl index 0f52934c9..ece02b0ed 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -101,7 +101,6 @@ end @info "Skipping CUDA tests, set NNLIB_TEST_CUDA=true to run them" end -<<<<<<< HEAD if get(ENV, "NNLIB_TEST_AMDGPU", "false") == "true" import Pkg test_info = Pkg.project() From c9a57227ca97c81a7fa3cf21798b691139f320e7 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 4 Sep 2023 13:30:05 -0400 Subject: [PATCH 10/11] =?UTF-8?q?skip=20hard=CF=83=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/bias_act.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/bias_act.jl b/test/bias_act.jl index 110f1a24e..918e2cca3 100644 --- a/test/bias_act.jl +++ b/test/bias_act.jl @@ -47,6 +47,8 @@ ACTIVATION_FUNCTIONS = [x->x, x -> 1/(x^2+2), x -> leakyrelu(x, 0.33)]) # Only some of these go the fast path, `cbrt` is an example of a function NNlib knows nothing about. fun == rrelu && continue # this one is randomised! 
+ fun == hardσ && continue # this one has heisenbugs, not solved by +discontinuity-avoidance code below @test bias_act!(fun, copy(x), b) ≈ fun.(x .+ b) @test bias_act!(fun, copy(x), false) ≈ fun.(x) From cd99b77b2feb39d24327ad65d3b6a386bd9d45bf Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 4 Sep 2023 14:39:21 -0400 Subject: [PATCH 11/11] Update test/bias_act.jl --- test/bias_act.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/bias_act.jl b/test/bias_act.jl index 918e2cca3..5d1b316d9 100644 --- a/test/bias_act.jl +++ b/test/bias_act.jl @@ -47,8 +47,7 @@ ACTIVATION_FUNCTIONS = [x->x, x -> 1/(x^2+2), x -> leakyrelu(x, 0.33)]) # Only some of these go the fast path, `cbrt` is an example of a function NNlib knows nothing about. fun == rrelu && continue # this one is randomised! - fun == hardσ && continue # this one has heisenbugs, not solved by -discontinuity-avoidance code below + fun == hardσ && continue # this one has heisenbugs, not solved by discontinuity-avoidance code below @test bias_act!(fun, copy(x), b) ≈ fun.(x .+ b) @test bias_act!(fun, copy(x), false) ≈ fun.(x)
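
A minimal usage sketch of the API these patches add, assuming NNlib from this branch plus Zygote; the array sizes and variable names below are illustrative only and do not appear in the patches. It mirrors what a Flux `Dense`-style layer would do: the matrix product `w * x` is freshly allocated, so it is safe for `bias_act!` to overwrite it.

using NNlib, Zygote

# Hypothetical weights, bias, and a batch of 32 inputs (illustrative sizes).
w, b = randn(Float32, 16, 8), randn(Float32, 16)
x = randn(Float32, 8, 32)

# Same values as relu.(w * x .+ b), but re-uses the buffer from w * x,
# and substitutes the fast activation variants where they exist.
y = bias_act!(relu, w * x, b)

# Gradients flow through the rrule defined in src/bias_act.jl; `relu` takes the
# fast path because its derivative needs only the output, not the input.
grad_w, grad_b = gradient((w, b) -> sum(abs2, bias_act!(relu, w * x, b)), w, b)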