lazier +,-,* rules

mcabbott · mcabbott · commit b484a14b518e · 2022-06-29T21:09:51.000-06:00
diff --git a/src/stage1/broadcast.jl b/src/stage1/broadcast.jl
@@ -44,7 +44,8 @@ function split_bc_rule(f::F, args...) where {F}
         # Trivial case
         back_1(_) = ntuple(Returns(ZeroTangent()), length(args)+2)
         return f.(args...), back_1
-    elseif all(a -> a isa Numeric, args) && isconcretetype(Core.Compiler._return_type(
+    # elseif all(a -> a isa Numeric, args) && isconcretetype(Core.Compiler._return_type(
+    elseif isconcretetype(Core.Compiler._return_type(
             derivatives_given_output, Tuple{T, F, map(eltype, args)...}))
         # Fast path: just broadcast, and use x & y to find derivative.
         ys = f.(args...)
@@ -79,6 +80,7 @@ splitcast(f, args...) = StructArrays.components(StructArray(Broadcast.instantiat
 
 unbroadcast(f::Function, x̄) = accum_sum(x̄)
 unbroadcast(::Val, _) = NoTangent()
+unbroadcast(x::AbstractArray, x̄::NoTangent) = NoTangent()
 accum_sum(xs::AbstractArray{<:NoTangent}; dims = :) = NoTangent()
 
 #=
@@ -99,6 +101,7 @@ julia> @btime gradient(x -> sum(abs2, exp.(x)), $xs);
   44.042 μs (32 allocations: 313.48 KiB)  # slow path -- 3 copies, extra is closure? 
   61.167 μs (12 allocations: 703.41 KiB)  # with `map` rule as before -- worse
 
+
 # Composed function, Zygote struggles
 
 julia> @btime Zygote.gradient(x -> sum(abs2, (identity∘cbrt).(x)), $xs);
@@ -116,6 +119,27 @@ julia> @btime gradient(x -> sum(abs2, identity.(cbrt.(x))), $xs);
   75.041 μs (46 allocations: 470.11 KiB)  # slow path -- 5 copies
   135.541 μs (27 allocations: 1.30 MiB)   # with `map` rule as before -- worse
 
+
+# Lazy +,-,* for partial fusing
+
+julia> @btime Zygote.gradient(x -> sum(abs2, exp.(2 .* x .- 100)), $xs);
+  81.250 μs (21 allocations: 625.47 KiB)  # special rules + dual numbers, 4 more copies than best
+
+julia> @btime gradient(x -> sum(abs2, exp.(2 .* x .- 100)), $xs);
+  57.166 μs (49 allocations: 470.22 KiB)  # broadcast in *, -
+  54.583 μs (46 allocations: 314.06 KiB)  # broadcasted -- two less copies
+  72.958 μs (26 allocations: 1016.38 KiB) # with `map` rule as before
+
+julia> gradient((x,y) -> sum(abs2, exp.(2 .* x .+ y)), xs, (rand(10)'))
+ERROR: MethodError: no method matching size(::Base.Broadcast.Broadcasted # hmm
+
+julia> @btime gradient((x,y) -> sum(abs2, exp.(x .+ y)), $xs, $(rand(100)'));
+  7.127 ms (75 allocations: 22.97 MiB)   # after
+  12.956 ms (57 allocations: 76.37 MiB)  # before
+
+ulia> @btime Zygote.gradient((x,y) -> sum(abs2, exp.(x .+ y)), $xs, $(rand(100)'));
+  9.937 ms (48 allocations: 45.86 MiB)
+
 =#
 
 # The below is from Zygote: TODO: DO we want to do something better here?
@@ -133,7 +157,7 @@ end
 
 trim(x, Δ) = reshape(Δ, ntuple(i -> size(Δ, i), Val(ndims(x))))
 
-unbroadcast(x::AbstractArray, x̄) =
+unbroadcast(x::Union{AbstractArray, Base.Broadcast.Broadcasted}, x̄) =
   size(x) == size(x̄) ? x̄ :
   length(x) == length(x̄) ? trim(x, x̄) :
     trim(x, accum_sum(x̄, dims = ntuple(i -> size(x, i) == 1 ? i : ndims(x̄)+1, Val(ndims(x̄)))))
@@ -146,12 +170,46 @@ unbroadcast(x::AbstractArray, x̄::Nothing) = NoTangent()
 
 const Numeric = Union{Number, AbstractArray{<:Number, N} where N}
 
-function ChainRulesCore.rrule(::typeof(broadcasted), ::typeof(+), xs::Numeric...)
-    broadcast(+, xs...), ȳ -> (NoTangent(), NoTangent(), map(x -> unbroadcast(x, unthunk(ȳ)), xs)...)
+# function ChainRulesCore.rrule(::typeof(broadcasted), ::typeof(+), xs::Numeric...)
+#     broadcast(+, xs...), ȳ -> (NoTangent(), NoTangent(), map(x -> unbroadcast(x, unthunk(ȳ)), xs)...)
+# end
+
+# Replace Zygote-like fully split broadcasting with one fused over easy operations
+(::∂⃖{1})(::typeof(broadcasted), ::typeof(+), args...) = split_bc_plus(args...)
+(::∂⃖{1})(::typeof(broadcasted), ::typeof(+), arg::Array) = split_bc_plus(arg) # ambiguity
+function split_bc_plus(xs...) where {F}
+    broadcasted(+, xs...), Δ -> let Δun = unthunk(Δ)
+        # println("+")
+        (NoTangent(), NoTangent(), map(x -> unbroadcast(x, Δun), xs)...)
+    end
 end
+Base.eltype(bc::Broadcast.Broadcasted{<:Any, <:Any, typeof(+), <:Tuple}) = 
+    mapreduce(eltype, promote_type, bc.args)  # needed to hit fast path
 
-ChainRulesCore.rrule(::typeof(broadcasted), ::typeof(-), x::Numeric, y::Numeric) = x .- y,
-  Δ -> let Δ=unthunk(Δ); (NoTangent(), NoTangent(), unbroadcast(x, Δ), -unbroadcast(y, Δ)); end
+(::∂⃖{1})(::typeof(copy), bc::Broadcast.Broadcasted) = copy(bc), Δ -> (NoTangent(), Δ)
 
-ChainRulesCore.rrule(::typeof(broadcasted), ::typeof(*), x::Numeric, y::Numeric) = x.*y,
-  z̄ -> let z̄=unthunk(z̄); (NoTangent(), NoTangent(), unbroadcast(x, z̄ .* conj.(y)), unbroadcast(y, z̄ .* conj.(x))); end
+# ChainRulesCore.rrule(::typeof(broadcasted), ::typeof(-), x::Numeric, y::Numeric) = x .- y,
+#   Δ -> let Δ=unthunk(Δ); (NoTangent(), NoTangent(), unbroadcast(x, Δ), -unbroadcast(y, Δ)); end
+
+function (::∂⃖{1})(::typeof(broadcasted), ::typeof(-), x, y)
+    broadcasted(-, x, y), Δ -> let Δun = unthunk(Δ)
+        # println("-")
+        (NoTangent(), NoTangent(), unbroadcast(x, Δun), -unbroadcast(y, Δun))
+        # Ideally you could fuse the - into unbroadcast, mapreduce() not sum, when y is a smaller array
+    end
+end
+
+# ChainRulesCore.rrule(::typeof(broadcasted), ::typeof(*), x::Numeric, y::Numeric) = x.*y,
+#   z̄ -> let z̄=unthunk(z̄); (NoTangent(), NoTangent(), unbroadcast(x, z̄ .* conj.(y)), unbroadcast(y, z̄ .* conj.(x))); end
+
+using LinearAlgebra: dot
+
+function (::∂⃖{1})(::typeof(broadcasted), ::typeof(*), x, y)
+    broadcasted(*, x, y), Δ -> let Δun = unthunk(Δ)
+        # println("*")
+        dx = x isa Number ? dot(y, Δun) : unbroadcast(x, Δun .* conj.(y))
+        dy = y isa Number ? dot(x, Δun) : unbroadcast(y, Δun .* conj.(x))
+        # When x is an array but a smaller one, instead of dot you may be able to use mapreduce()
+        (NoTangent(), NoTangent(), dx, dy)
+    end
+end