Sometimes faster sum(f,x) rule (#529)

mcabbott · oxinabox · web-flow · commit ffefa079deae · 2022-05-10T12:00:21.000-04:00
* save less stuff in sum(f, xs) rule

* version using derivatives_given_input

* rules

* rm derivatives_given_input

* add and fix some tests

* rm benchmarks

* rebase fixup

* fix tests

* fix a test

* tighter check

* tidy, more unicode

* rm one unthunk

* comment

* simplify AbstractZero methods

* Apply 4 suggestions

Co-authored-by: Frames Catherine White <oxinabox@ucc.asn.au>

Co-authored-by: Frames Catherine White <oxinabox@ucc.asn.au>
diff --git a/src/ChainRules.jl b/src/ChainRules.jl
@@ -15,6 +15,9 @@ using Statistics
 # to the normal rule of only overload via `ChainRulesCore.rrule`.
 import ChainRulesCore: rrule, frule
 
+# Experimental:
+using ChainRulesCore: derivatives_given_output
+
 # numbers that we know commute under multiplication
 const CommutativeMulNumber = Union{Real,Complex}
 
diff --git a/src/rulesets/Base/base.jl b/src/rulesets/Base/base.jl
@@ -171,6 +171,8 @@ function rrule(::typeof(identity), x)
     return (x, identity_pullback)
 end
 
+ChainRulesCore.derivatives_given_output(Ω, ::typeof(identity), x) = tuple(tuple(true))
+
 # rounding related,
 # we use `zero` rather than `ZeroTangent()` for scalars, which avoids issues with `map` etc.
 @scalar_rule round(x) zero(x)
diff --git a/src/rulesets/Base/fastmath_able.jl b/src/rulesets/Base/fastmath_able.jl
@@ -80,6 +80,11 @@ let
             return Ω, abs_pullback
         end
 
+        function ChainRulesCore.derivatives_given_output(Ω, ::typeof(abs), x::Union{Real, Complex})
+            signx = x isa Real ? sign(x) : x / ifelse(iszero(x), one(Ω), Ω)
+            return tuple(tuple(signx))
+        end
+
         ## abs2
         function frule((_, Δz), ::typeof(abs2), z::Union{Real, Complex})
             return abs2(z), 2 * realdot(z, Δz)
diff --git a/src/rulesets/Base/mapreduce.jl b/src/rulesets/Base/mapreduce.jl
@@ -72,30 +72,74 @@ function rrule(
 end
 
 function rrule(
-    config::RuleConfig{>:HasReverseMode}, ::typeof(sum), f, xs::AbstractArray; dims=:
-)
-    fx_and_pullbacks = map(x->rrule_via_ad(config, f, x), xs)
-    y = sum(first, fx_and_pullbacks; dims=dims)
+    config::RuleConfig{>:HasReverseMode},
+    ::typeof(sum),
+    f::F,
+    xs::AbstractArray{T};
+    dims = :,
+) where {F,T}
+    project = ProjectTo(xs)
 
-    pullbacks = last.(fx_and_pullbacks)
+    if _uses_input_only(f, T)
+        # Then we can compute the forward pass as usual, save nothing but `xs`:
+        function sum_pullback_f1(dy)
+            dxs = broadcast(unthunk(dy), xs) do dyₖ, xᵢ
+                ∂yₖ∂xᵢ = only(only(derivatives_given_output(nothing, f, xᵢ)))
+                dyₖ * conj(∂yₖ∂xᵢ)
+            end
+            return (NoTangent(), NoTangent(), project(dxs))
+        end
+        return sum(f, xs; dims), sum_pullback_f1
+    end
 
-    project = ProjectTo(xs)
+    # (There is an intermediate case, where `derivatives_given_output` needs to
+    # see `f.(xs)` but we don't need the pullbacks. Not implemented at present.)
+
+    # In the general case, we need to save all the pullbacks:
+    fx_and_pullbacks = map(xᵢ -> rrule_via_ad(config, f, xᵢ), xs)
+    y = sum(first, fx_and_pullbacks; dims)
+
+    function sum_pullback_f2(dy)
+        # For arrays of arrays, we ought to protect the element against broadcasting:
+        broadcast_dy = dims isa Colon ? Ref(unthunk(dy)) : unthunk(dy)
+        if Base.issingletontype(F)
+            # `F` is a singleton type, so at least `f` itself has no gradient.
+            # Broadcasting here gets the shape right with or without `dims` keyword.
+            dxs = broadcast(fx_and_pullbacks, broadcast_dy) do (_, pbᵢ), dyₖ
+                unthunk(last(pbᵢ(dyₖ)))
+            end
+            return (NoTangent(), NoTangent(), project(dxs))
 
-    function sum_pullback(ȳ)
-        call(f, x) = f(x)
-        # if dims is :, then need only left-handed only broadcast
-        broadcast_ȳ = dims isa Colon  ? (ȳ,) : ȳ
-        f̄_and_x̄s = call.(pullbacks, broadcast_ȳ)
-        # no point thunking as most of work is in f̄_and_x̄s which we need to compute for both
-        f̄ = if fieldcount(typeof(f)) === 0 # Then don't need to worry about derivative wrt f
-            NoTangent()
         else
-            sum(first, f̄_and_x̄s)
+            # Most general case. If `f` were stateful, we would need to reverse the order
+            # of iteration here, but since this function makes no guarantee, even the primal
+            # result is then ill-defined.
+            df_and_dxs = broadcast(fx_and_pullbacks, broadcast_dy) do (_, pbᵢ), dyₖ
+                pbᵢ(dyₖ)
+            end
+            df = sum(first, df_and_dxs)
+            dxs = map(unthunk ∘ last, df_and_dxs)
+            return (NoTangent(), df, project(dxs))
         end
-        x̄s = map(unthunk ∘ last, f̄_and_x̄s) # project does not support receiving InplaceableThunks
-        return NoTangent(), f̄, project(x̄s)
     end
-    return y, sum_pullback
+    return y, sum_pullback_f2
+end
+
+"""
+    _uses_input_only(f, xT::Type)
+
+Returns `true` if it can prove that `derivatives_given_output` will work using only the input
+of the given type. Thus there is no need to store the output `y = f(x::xT)`, allowing us to take
+a fast path in the `rrule` for `sum(f, xs)`.
+
+Works by seeing if the result of `derivatives_given_output(nothing, f, x)` can be inferred.
+The method of `derivatives_given_output` usually comes from `@scalar_rule`.
+"""
+function _uses_input_only(f::F, ::Type{xT}) where {F,xT}
+    gT = Core.Compiler._return_type(derivatives_given_output, Tuple{Nothing, F, xT})
+    # We must also check `<: Number`, to exclude the one rule which can return the `nothing` it was given:
+    # ChainRules.derivatives_given_output("anything", exp, 1) == (("anything",),)
+    return isconcretetype(gT) && gT <: Tuple{Tuple{Number}}
 end
 
 # https://github.com/JuliaDiff/ChainRules.jl/issues/522
@@ -228,6 +272,7 @@ function ∇prod_dims(vald::Val{dims}, x, dy, y=prod(x; dims=dims)) where {dims}
     ∇prod_dims!(dx, vald, x, dy, y)
     return dx
 end
+∇prod_dims(::Val, x, dy::AbstractZero, y=0) = dy
 
 function ∇prod_dims!(dx, ::Val{dims}, x, dy, y) where {dims}
     iters = ntuple(d -> d in dims ? tuple(:) : axes(x,d), ndims(x))  # Without Val(dims) this is a serious type instability
@@ -244,6 +289,7 @@ function ∇prod(x, dy::Number=1, y::Number=prod(x))
     ∇prod!(dx, x, dy, y)
     return dx
 end
+∇prod(x, dy::AbstractZero, y::Number=0) = dy
 
 function ∇prod!(dx, x, dy::Number=1, y::Number=prod(x))
     numzero = iszero(y) ? count(iszero, x) : 0
@@ -326,7 +372,8 @@ function ∇cumprod_dim(vald::Val{dim}, x::AbstractArray, dy=fill!(zero(x),1), y
      dx = fill!(similar(x, T, axes(x)), zero(T))
      ∇cumprod_dim!(dx, vald, x, dy, y)
      return dx
- end
+end
+∇cumprod_dim(vald::Val, x::AbstractArray, dy::AbstractZero, y=0) = dy
 
 @inline function ∇cumprod_dim!(dx::AbstractArray, ::Val{dim}, x::AbstractArray, dy, y) where {dim}
     iters = ntuple(k -> k==dim ? Ref(:) : axes(x,k), ndims(x))
@@ -342,6 +389,7 @@ function ∇cumprod(x::AbstractVector, dy=one(x), y=cumprod(x))
     ∇cumprod!(dx, x, dy, y)
     return dx
 end
+∇cumprod(x::AbstractVector, dy::AbstractZero, y=0) = dy
 
 @inline function ∇cumprod!(dx::AbstractVector, x::AbstractVector, dy, y)
     lo, hi = firstindex(x), lastindex(x)
diff --git a/test/rulesets/Base/mapreduce.jl b/test/rulesets/Base/mapreduce.jl
@@ -65,10 +65,12 @@ const CFG = ChainRulesTestUtils.ADviaRuleConfig()
     @testset "sum(f, xs)" begin
         # This calls back into AD
         test_rrule(sum, abs, [-4.0, 2.0, 2.0])
+        test_rrule(sum, log, rand(3, 4) .+ 1)
         test_rrule(sum, cbrt, randn(5))
         test_rrule(sum, Multiplier(2.0), [2.0, 4.0, 8.0])
 
         # Complex numbers
+        test_rrule(sum, log, rand(ComplexF64, 5))
         test_rrule(sum, sqrt, rand(ComplexF64, 5))
         test_rrule(sum, abs, rand(ComplexF64, 3, 4))  # complex -> real
 
@@ -82,6 +84,12 @@ const CFG = ChainRulesTestUtils.ADviaRuleConfig()
 
         test_rrule(sum, abs, @SVector[1.0, -3.0])
 
+        # Make sure the tests above exercise both the `derivatives_given_output` path and the general case:
+        @test ChainRules._uses_input_only(abs, Float32)
+        @test !ChainRules._uses_input_only(cbrt, Float64)
+        @test ChainRules._uses_input_only(log, ComplexF64)
+        @test !ChainRules._uses_input_only(abs, ComplexF64)
+
         # covectors
         x = [-4.0 2.0; 2.0 -1.0]
         test_rrule(sum, inv, x[1, :]')
@@ -102,14 +110,22 @@ const CFG = ChainRulesTestUtils.ADviaRuleConfig()
         # ... and Bool produced by function
         @test_skip test_rrule(sum, iszero, randn(5))  # DimensionMismatch("second dimension of A, 1, does not match length of x, 0")
 
-
         # Functions that return a Vector
         # see https://github.com/FluxML/Zygote.jl/issues/1074
         test_rrule(sum, make_two_vec, [1.0, 3.0, 5.0, 7.0])
         test_rrule(sum, make_two_vec, [1.0 2.0; 3.0 4.0])
         test_rrule(sum, make_two_vec, [1.0 2.0; 3.0 4.0]; fkwargs=(;dims=2))
         test_rrule(sum, make_two_vec, [1.0 2.0; 3.0 4.0]; fkwargs=(;dims=1))
         test_rrule(sum, make_two_vec, [1.0 2.0; 3.0 4.0]; fkwargs=(;dims=(3, 4)))
+
+        # arrays of arrays, functions which return a scalar:
+        test_rrule(sum, sum, [[1,2], [3,4], [5,6]]; check_inferred=false)
+        x2345 = [rand(2,3) for _ in 1:4, _ in 1:5]
+        test_rrule(sum, prod, x2345; check_inferred=false)
+        test_rrule(sum, sum, x2345; fkwargs=(;dims=1), check_inferred=false)
+        test_rrule(sum, sum, x2345; fkwargs=(;dims=(1,2)), check_inferred=false)
+
+        test_rrule(sum, cumprod, [[1,2], [3,4], [5,6]]; check_inferred=false)
     end
 
     # https://github.com/JuliaDiff/ChainRules.jl/issues/522