more on broadcasting

mcabbott · mcabbott · commit 27328c2f9bbc · 2022-06-29T21:09:51.000-06:00
diff --git a/src/extra_rules.jl b/src/extra_rules.jl
@@ -147,16 +147,16 @@ function ChainRulesCore.rrule(::typeof(Core.tuple), args...)
 end
 
 # TODO: What to do about these integer rules
-@ChainRulesCore.non_differentiable Base.rem(a::Integer, b::Type)
+# @ChainRulesCore.non_differentiable Base.rem(a::Integer, b::Type)  # now in CR 1.18
 
 ChainRulesCore.canonicalize(::ChainRulesCore.ZeroTangent) = ChainRulesCore.ZeroTangent()
 
-# Skip AD'ing through the axis computation
-function ChainRules.rrule(::typeof(Base.Broadcast.instantiate), bc::Base.Broadcast.Broadcasted)
-    return Base.Broadcast.instantiate(bc), Δ->begin
-        Core.tuple(NoTangent(), Δ)
-    end
-end
+# # Skip AD'ing through the axis computation
+# function ChainRules.rrule(::typeof(Base.Broadcast.instantiate), bc::Base.Broadcast.Broadcasted)
+#     return Base.Broadcast.instantiate(bc), Δ->begin
+#         Core.tuple(NoTangent(), Δ)
+#     end
+# end
 
 
 using StaticArrays
@@ -268,3 +268,7 @@ end
 function ChainRulesCore.rrule(::Type{InplaceableThunk}, add!!, val)
     val, Δ->(NoTangent(), NoTangent(), Δ)
 end
+
+# ERROR: ArgumentError: Tangent for the primal Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}} should be backed by a AbstractDict type, not by NamedTuple{(:data,), Tuple{ChainRulesCore.ZeroTangent}}.
+ChainRulesCore._backing_error(::Type{<:Base.Pairs{Symbol}}, ::Type{<:NamedTuple}, _) = nothing  # solves that!
+
diff --git a/src/stage1/broadcast.jl b/src/stage1/broadcast.jl
@@ -44,7 +44,7 @@ end
 
 (::∂⃖{1})(::typeof(broadcasted), f, args...) = split_bc_rule(f, args...)
 (::∂⃖{1})(::typeof(broadcasted), f, arg::Array) = split_bc_rule(f, arg) # ambiguity
-function split_bc_rule(f::F, args...) where {F}
+function split_bc_rule(f::F, args::Vararg{Any,N}) where {F,N}
     T = Broadcast.combine_eltypes(f, args)
     TΔ = Core.Compiler._return_type(derivatives_given_output, Tuple{T, F, map(eltype, args)...})
     if eltype(T) == Bool
@@ -71,10 +71,11 @@ function split_bc_rule(f::F, args...) where {F}
             dargs = map(unbroadcast, args, deltas)  # ideally sum in unbroadcast could be part of splitcast?
             (NoTangent(), NoTangent(), dargs...)
         end
-        return ys, length(args)==1 ? back_2_one : back_2_many
+        return ys, N==1 ? back_2_one : back_2_many
     else
         # Slow path: collect all the pullbacks & apply them later.
-        # Since broadcast makes no guarantee about order, this does not bother to try to reverse it.
+        # (Since broadcast makes no guarantee about the order of calls, and un-fusing
+        # can change the number of calls, this does not bother trying to reverse them.)
         _print("path 3")
         ys, backs = splitcast(∂⃖{1}(), f, args...)
         function back_3(dys)
@@ -84,15 +85,21 @@ function split_bc_rule(f::F, args...) where {F}
             dargs = map(unbroadcast, args, Base.tail(deltas))  # no real need to close over args here
             (NoTangent(), sum(first(deltas)), dargs...)
         end
+        back_3(::AbstractZero) = (NoTangent(), NoTangent(), map(Returns(ZeroTangent()), args)...)  # same arity as the general method: broadcasted, f, args...
         return ys, back_3
     end
 end
 
-# This uses "mulltimap"-like constructs:
+# Skip AD'ing through the axis computation: the pullback returns NoTangent() for `instantiate` itself and hands Δ back unchanged for `bc`.
+function (::∂⃖{1})(::typeof(Base.Broadcast.instantiate), bc::Base.Broadcast.Broadcasted)
+    uninstantiate(Δ) = Core.tuple(NoTangent(), Δ)
+    return Base.Broadcast.instantiate(bc), uninstantiate
+end
+
+# This uses "multimap"-like constructs:
 
 using StructArrays
 splitmap(f, args...) = StructArrays.components(StructArray(Iterators.map(f, args...)))
-# warning: splitmap(identity, [1,2,3,4]) === NamedTuple()
 splitcast(f, args...) = StructArrays.components(StructArray(Broadcast.instantiate(Broadcast.broadcasted(f, args...))))
 
 #=
@@ -156,9 +163,9 @@ end
 (::∂⃖{1})(::typeof(broadcasted), ::typeof(+), args...) = split_bc_plus(args...)
 (::∂⃖{1})(::typeof(broadcasted), ::typeof(+), arg::Array) = split_bc_plus(arg) # ambiguity
 function split_bc_plus(xs...) where {F}
-    broadcasted(+, xs...), Δ -> let Δun = unthunk(Δ)
+    broadcasted(+, xs...), Δraw -> let Δ = unthunk(Δraw)
         _print("broadcast +")
-        (NoTangent(), NoTangent(), map(x -> unbroadcast(x, Δun), xs)...)
+        (NoTangent(), NoTangent(), map(x -> unbroadcast(x, Δ), xs)...)
     end
 end
 Base.eltype(bc::Broadcast.Broadcasted{<:Any, <:Any, typeof(+), <:Tuple}) = 
@@ -167,20 +174,20 @@ Base.eltype(bc::Broadcast.Broadcasted{<:Any, <:Any, typeof(+), <:Tuple}) =
 (::∂⃖{1})(::typeof(copy), bc::Broadcast.Broadcasted) = copy(bc), Δ -> (NoTangent(), Δ)
 
 function (::∂⃖{1})(::typeof(broadcasted), ::typeof(-), x, y)
-    broadcasted(-, x, y), Δ -> let Δun = unthunk(Δ)
+    broadcasted(-, x, y), Δraw -> let Δ = unthunk(Δraw)
         _print("broadcast -")
-        (NoTangent(), NoTangent(), unbroadcast(x, Δun), -unbroadcast(y, Δun))
+        (NoTangent(), NoTangent(), unbroadcast(x, Δ), -unbroadcast(y, Δ))
         # Ideally you could fuse the - into unbroadcast, mapreduce() not sum, when y is a smaller array
     end
 end
 
 using LinearAlgebra: dot
 
 function (::∂⃖{1})(::typeof(broadcasted), ::typeof(*), x, y)  # should this be vararg, or will laziness handle it?
-    broadcasted(*, x, y), Δ -> let Δun = unthunk(Δ)
+    broadcasted(*, x, y), Δraw -> let Δ = unthunk(Δraw)
         _print("broadcast *")
-        dx = eltype(x)==Bool ? NoTangent() : x isa Number ? dot(y, Δun) : unbroadcast(x, Δun .* conj.(y))
-        dy = eltype(y)==Bool ? NoTangent() : y isa Number ? dot(x, Δun) : unbroadcast(y, Δun .* conj.(x))
+        dx = eltype(x)==Bool ? NoTangent() : x isa Number ? dot(y, Δ) : unbroadcast(x, Δ .* conj.(y))
+        dy = eltype(y)==Bool ? NoTangent() : y isa Number ? dot(x, Δ) : unbroadcast(y, Δ .* conj.(x))
         # When x is an array but a smaller one, instead of dot you may be able to use mapreduce()
         # Will things like this work? Ref([1,2]) .* [1,2,3]
         (NoTangent(), NoTangent(), dx, dy)
diff --git a/src/stage1/generated.jl b/src/stage1/generated.jl
@@ -49,6 +49,8 @@ end
 Base.getindex(o::OpticBundle, i::Int) = i == 1 ? o.x :
                                         i == 2 ? o.clos :
                                         throw(BoundsError(o, i))
+Base.lastindex(o::OpticBundle) = 2  # getindex above accepts only i == 1 (o.x) and i == 2 (o.clos), so `o[end]` is o.clos
+
 Base.iterate(o::OpticBundle) = (o.x, nothing)
 Base.iterate(o::OpticBundle, ::Nothing) = (o.clos, missing)
 Base.iterate(o::OpticBundle, ::Missing) = nothing
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -231,7 +231,9 @@ exp_log(x) = exp(log(x))
 @test gradient(x -> sum(sum, (x,) ./ x), [1,2,3])[1] ≈ [-4.1666, 0.3333, 1.1666] atol=1e-3
 
 @test unthunk.(gradient(x -> sum(x ./ 4), [1,2,3])) == ([0.25, 0.25, 0.25],)
-@test gradient(x -> sum([1,2,3] ./ x), 4) == (-0.375,)
+@test gradient(x -> sum([1,2,3] ./ x), 4) == (-0.375,)  # x/y rule
+@test gradient(x -> sum(x.^2), [1,2,3]) == ([2.0, 4.0, 6.0],)  # x.^2 rule
+@test gradient(x -> sum([1,2,3] ./ x.^2), 4) == (-0.1875,)  # scalar^2 rule
 
 @test gradient(x -> sum(x .> 2), [1,2,3]) == (ZeroTangent(),)  # Bool output
 @test gradient(x -> sum(1 .+ iseven.(x)), [1,2,3]) == (ZeroTangent(),)