
Commit 2ac91f0

more
1 parent 27328c2 commit 2ac91f0

3 files changed: 76 additions & 77 deletions

src/extra_rules.jl

Lines changed: 9 additions & 8 deletions
@@ -150,6 +150,7 @@ end
 # @ChainRulesCore.non_differentiable Base.rem(a::Integer, b::Type) # now in CR 1.18
 
 ChainRulesCore.canonicalize(::ChainRulesCore.ZeroTangent) = ChainRulesCore.ZeroTangent()
+ChainRulesCore.canonicalize(::NoTangent) = NoTangent()
 
 # # Skip AD'ing through the axis computation
 # function ChainRules.rrule(::typeof(Base.Broadcast.instantiate), bc::Base.Broadcast.Broadcasted)
@@ -200,12 +201,13 @@ function ChainRules.rrule(::typeof(map), ::typeof(+), A::AbstractVector, B::Abst
     map(+, A, B), Δ->(NoTangent(), NoTangent(), Δ, Δ)
 end
 
-function ChainRules.rrule(AT::Type{<:Array{T,N}}, x::AbstractArray{S,N}) where {T,S,N}
-    # We're leaving these in the eltype that the cotangent vector already has.
-    # There isn't really a good reason to believe we should convert to the
-    # original array type, so don't unless explicitly requested.
-    AT(x), Δ->(NoTangent(), Δ)
-end
+# https://github.com/JuliaDiff/ChainRules.jl/blob/main/src/rulesets/Base/array.jl#L7
+# function ChainRules.rrule(AT::Type{<:Array{T,N}}, x::AbstractArray{S,N}) where {T,S,N}
+#     # We're leaving these in the eltype that the cotangent vector already has.
+#     # There isn't really a good reason to believe we should convert to the
+#     # original array type, so don't unless explicitly requested.
+#     AT(x), Δ->(NoTangent(), Δ)
+# end
 
 # WARNING: Method definition rrule(Type{var"#s260"} where var"#s260"<:(Array{T, N} where N where T), UndefInitializer, Any...) in module ChainRules at /Users/me/.julia/packages/ChainRules/kkDLd/src/rulesets/Base/array.jl:5 overwritten in module Diffractor at /Users/me/.julia/dev/Diffractor/src/extra_rules.jl:209.
 # function ChainRules.rrule(AT::Type{<:Array}, undef::UndefInitializer, args...)
@@ -254,10 +256,9 @@ function ChainRules.frule(_, ::Type{Vector{T}}, undef::UndefInitializer, dims::I
     Vector{T}(undef, dims...), zeros(T, dims...)
 end
 
-@ChainRules.non_differentiable Base.:(|)(a::Integer, b::Integer)
+# @ChainRules.non_differentiable Base.:(|)(a::Integer, b::Integer) CR#558
 @ChainRules.non_differentiable Base.throw(err)
 @ChainRules.non_differentiable Core.Compiler.return_type(args...)
-ChainRulesCore.canonicalize(::NoTangent) = NoTangent()
 
 # Disable thunking at higher order (TODO: These should go into ChainRulesCore)
 function ChainRulesCore.rrule(::Type{Thunk}, thnk)
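
Side note, not part of the diff: the relocated canonicalize method is an identity on NoTangent, mirroring the ZeroTangent method it now sits beside, so both degenerate tangents pass through canonicalization unchanged. A minimal check, assuming only ChainRulesCore is loaded:

    using ChainRulesCore
    ChainRulesCore.canonicalize(ZeroTangent())  # ZeroTangent() (pre-existing method)
    ChainRulesCore.canonicalize(NoTangent())    # NoTangent()  (the method moved up in this commit)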

src/stage1/broadcast.jl

Lines changed: 24 additions & 33 deletions
@@ -33,12 +33,12 @@ end
 
 using ChainRulesCore: derivatives_given_output
 
-# _print(s) = nothing
-_print(s) = printstyled(s, "\n"; color=:magenta)
+_print(s) = nothing
+# _print(s) = printstyled(s, "\n"; color=:magenta)
 
 # Broadcast over one element is just map
 function (∂⃖ₙ::∂⃖{N})(::typeof(broadcasted), f, a::Array) where {N}
-    _print("path 0")
+    _print("path 0, order $N")
     ∂⃖ₙ(map, f, a)
 end
 
@@ -47,8 +47,8 @@ end
 function split_bc_rule(f::F, args::Vararg{Any,N}) where {F,N}
     T = Broadcast.combine_eltypes(f, args)
     TΔ = Core.Compiler._return_type(derivatives_given_output, Tuple{T, F, map(eltype, args)...})
-    if eltype(T) == Bool
-        # Trivial case: non-differentiable output
+    if T === Bool
+        # Trivial case: non-differentiable output, e.g. `x .> 0`
         _print("path 1")
         back_1(_) = ntuple(Returns(ZeroTangent()), length(args)+2)
         return f.(args...), back_1
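
To see what the path-1 branch short-circuits (a sketch, not code from this commit): a broadcast whose combined eltype is Bool is treated as non-differentiable, so the pullback hands back a ZeroTangent for every argument. Using gradient as the test file below does:

    using Diffractor: gradient   # assumed import; the tests call it unqualified
    gradient(x -> sum(x .> 2), [1.0, 2.0, 3.0])  # (ZeroTangent(),)
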
@@ -160,16 +160,14 @@ end
 
 # For certain cheap operations we can easily allow fused broadcast:
 
-(::∂⃖{1})(::typeof(broadcasted), ::typeof(+), args...) = split_bc_plus(args...)
-(::∂⃖{1})(::typeof(broadcasted), ::typeof(+), arg::Array) = split_bc_plus(arg) # ambiguity
-function split_bc_plus(xs...) where {F}
+(::∂⃖{1})(::typeof(broadcasted), ::typeof(+), args...) = lazy_bc_plus(args...)
+(::∂⃖{1})(::typeof(broadcasted), ::typeof(+), arg::Array) = lazy_bc_plus(arg) # ambiguity
+function lazy_bc_plus(xs...) where {F}
     broadcasted(+, xs...), Δraw -> let Δ = unthunk(Δraw)
         _print("broadcast +")
         (NoTangent(), NoTangent(), map(x -> unbroadcast(x, Δ), xs)...)
     end
 end
-Base.eltype(bc::Broadcast.Broadcasted{<:Any, <:Any, typeof(+), <:Tuple}) =
-    mapreduce(eltype, promote_type, bc.args) # needed to hit fast path
 
 (::∂⃖{1})(::typeof(copy), bc::Broadcast.Broadcasted) = copy(bc), Δ -> (NoTangent(), Δ)
 
@@ -182,24 +180,22 @@ function (::∂⃖{1})(::typeof(broadcasted), ::typeof(-), x, y)
 end
 
 using LinearAlgebra: dot
+const Numeric{T<:Number} = Union{T, AbstractArray{T}}
 
-function (::∂⃖{1})(::typeof(broadcasted), ::typeof(*), x, y) # should this be vararg, or will laziness handle it?
+function (::∂⃖{1})(::typeof(broadcasted), ::typeof(*), x::Numeric, y::Numeric)
     broadcasted(*, x, y), Δraw -> let Δ = unthunk(Δraw)
         _print("broadcast *")
         dx = eltype(x)==Bool ? NoTangent() : x isa Number ? dot(y, Δ) : unbroadcast(x, Δ .* conj.(y))
         dy = eltype(y)==Bool ? NoTangent() : y isa Number ? dot(x, Δ) : unbroadcast(y, Δ .* conj.(x))
         # When x is an array but a smaller one, instead of dot you may be able to use mapreduce()
-        # Will things like this work? Ref([1,2]) .* [1,2,3]
         (NoTangent(), NoTangent(), dx, dy)
     end
 end
-# Alternative to `x isa Number` etc above... but not quite right!
-# (::∂⃖{1})(::typeof(broadcasted), ::typeof(*), x, y::Number) = rrule_via_ad(DiffractorRuleConfig(), *, x, y)
 
 function (::∂⃖{1})(::typeof(broadcasted), ::typeof(Base.literal_pow), ::typeof(^), x, ::Val{2})
     _print("broadcast ^2")
     broadcasted(*, x, x), Δ -> begin
-        dx = unbroadcast(x, 2 .* Δ .* conj.(x))
+        dx = unbroadcast(x, 2 .* unthunk(Δ) .* conj.(x))
         (NoTangent(), NoTangent(), NoTangent(), dx, NoTangent())
     end
 end
@@ -208,30 +204,25 @@ function (::∂⃖{1})(::typeof(broadcasted), ::typeof(Base.literal_pow), ::type
     x^2, Δ -> (NoTangent(), NoTangent(), NoTangent(), 2 * Δ * conj(x), NoTangent())
 end
 
-# function (::∂⃖{1})(::typeof(broadcasted), ::typeof(/), x, y) # not obvious whether this is better than automatic
-#     broadcasted(/, x, y), Δ -> let Δun = unthunk(Δ)
-#         _print("broadcast /")
-#         dx = unbroadcast(x, Δ ./ conj.(y))
-#         dy = unbroadcast(y, .-Δ .* conj.(res ./ y))
-#         (NoTangent(), NoTangent(), dx, dy)
-#     end
-# end
-function (::∂⃖{1})(::typeof(broadcasted), ::typeof(/), x, y::Number)
+function (::∂⃖{1})(::typeof(broadcasted), ::typeof(/), x::Numeric, y::Number)
     _print("simple /")
     z, back = ∂⃖{1}()(/, x, y)
-    z, Δ -> begin
-        _, dx, dy = back(Δ)
-        (NoTangent(), NoTangent(), dx, dy) # maybe there should be a funciton for this? Use for conj, identity too
+    z, dz -> begin
+        _, dx, dy = back(dz)
+        (NoTangent(), NoTangent(), dx, dy)
     end
 end
 
+(::∂⃖{1})(::typeof(broadcasted), ::typeof(identity), x) = x, identity_pullback
+(::∂⃖{1})(::typeof(broadcasted), ::typeof(identity), x::Array) = x, identity_pullback # ambiguity
+identity_pullback(Δ) = (NoTangent(), NoTangent(), Δ)
+
+(::∂⃖{1})(::typeof(broadcasted), ::typeof(conj), x::AbstractArray{Real}) = x, identity_pullback
+(::∂⃖{1})(::typeof(broadcasted), ::typeof(conj), x::Array{Real}) = x, identity_pullback
 (::∂⃖{1})(::typeof(broadcasted), ::typeof(conj), x) =
     broadcasted(conj, x), Δ -> (NoTangent(), conj(unthunk(Δ)))
-(::∂⃖{1})(::typeof(broadcasted), ::typeof(conj), x::AbstractArray{Real}) =
-    x, Δ -> (NoTangent(), Δ)
-
-(::∂⃖{1})(::typeof(broadcasted), ::typeof(identity), x) =
-    x, Δ -> (NoTangent(), Δ)
+(::∂⃖{1})(::typeof(broadcasted), ::typeof(conj), x::Array) =
+    broadcasted(conj, x), Δ -> (NoTangent(), conj(unthunk(Δ)))
 
 # All broadcasts use `unbroadcast` to reduce to correct shape:
 
@@ -244,7 +235,7 @@ function unbroadcast(x::Base.AbstractArrayOrBroadcasted, dx)
         ProjectTo(x)(sum(dx; dims))
     end
 end
-unbroadcast(x::Base.AbstractArrayOrBroadcasted, dx::NoTangent) = NoTangent()
+unbroadcast(x::Base.AbstractArrayOrBroadcasted, dx::AbstractZero) = dx
 
 unbroadcast(x::T, dx) where {T<:Tuple{Any}} = ProjectTo(x)(Tangent{T}(sum(dx)))
 function unbroadcast(x::T, dx) where {T<:Tuple{Vararg{Any,N}}} where {N}
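
For orientation (a sketch under assumptions, not code from this commit): unbroadcast reduces a cotangent back to the shape of one broadcast input by summing over the dimensions that broadcasting expanded, then reprojecting with ProjectTo, as the array method above shows; the new dx::AbstractZero method simply passes ZeroTangent()/NoTangent() through unchanged instead of collapsing both to NoTangent(). A self-contained, hypothetical rendering of the array case:

    using ChainRulesCore: ProjectTo

    # unbroadcast_sketch is illustrative only; the real method lives in this file.
    function unbroadcast_sketch(x::AbstractArray, dx::AbstractArray)
        if length(x) == length(dx)
            ProjectTo(x)(dx)  # same element count: only re-project (eltype, structure)
        else
            # sum out each dim where x has length 1 (or is absent) but dx does not
            dims = ntuple(d -> size(x, d) == 1 ? d : ndims(dx) + 1, ndims(dx))
            ProjectTo(x)(sum(dx; dims))
        end
    end

    unbroadcast_sketch([1.0, 2.0], ones(2, 3))  # sums over the 2nd dim: [3.0, 3.0]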

test/runtests.jl

Lines changed: 43 additions & 36 deletions
@@ -215,42 +215,49 @@ z45, delta45 = frule_via_ad(DiffractorRuleConfig(), (0,1), x -> log(exp(x)), 2)
 @test delta45 ≈ 1.0
 
 # Broadcasting
-@test gradient(x -> sum(x ./ x), [1,2,3]) == ([0,0,0],) # derivatives_given_output
-@test gradient(x -> sum(sqrt.(atan.(x, x'))), [1,2,3])[1] ≈ [0.2338, -0.0177, -0.0661] atol=1e-3
-@test gradient(x -> sum(exp.(log.(x))), [1,2,3]) == ([1,1,1],)
-
-@test_broken gradient(x -> sum((exp∘log).(x)), [1,2,3]) == ([1,1,1],) # stores pullback
-exp_log(x) = exp(log(x))
-@test gradient(x -> sum(exp_log.(x)), [1,2,3]) == ([1,1,1],)
-@test gradient((x,y) -> sum(x ./ y), [1 2; 3 4], [1,2]) == ([1 1; 0.5 0.5], [-3, -1.75])
-@test gradient((x,y) -> sum(x ./ y), [1 2; 3 4], 5) == ([0.2 0.2; 0.2 0.2], -0.4)
-@test gradient(x -> sum((y -> y/x).([1,2,3])), 4) == (-0.375,) # closure
-
-@test gradient(x -> sum(sum, (x,) ./ x), [1,2,3])[1] ≈ [-4.1666, 0.3333, 1.1666] atol=1e-3 # array of arrays
-@test gradient(x -> sum(sum, Ref(x) ./ x), [1,2,3])[1] ≈ [-4.1666, 0.3333, 1.1666] atol=1e-3
-@test gradient(x -> sum(sum, (x,) ./ x), [1,2,3])[1] ≈ [-4.1666, 0.3333, 1.1666] atol=1e-3
-
-@test unthunk.(gradient(x -> sum(x ./ 4), [1,2,3])) == ([0.25, 0.25, 0.25],)
-@test gradient(x -> sum([1,2,3] ./ x), 4) == (-0.375,) # x/y rule
-@test gradient(x -> sum(x.^2), [1,2,3]) == ([2.0, 4.0, 6.0],) # x.^2 rule
-@test gradient(x -> sum([1,2,3] ./ x.^2), 4) == (-0.1875,) # scalar^2 rule
-
-@test gradient(x -> sum(x .> 2), [1,2,3]) == (ZeroTangent(),) # Bool output
-@test gradient(x -> sum(1 .+ iseven.(x)), [1,2,3]) == (ZeroTangent(),)
-@test gradient((x,y) -> sum(x .== y), [1,2,3], [1 2 3]) == (ZeroTangent(), ZeroTangent())
-@test gradient(x -> sum(x .+ [1,2,3]), true) == (NoTangent(),) # Bool input
-@test gradient(x -> sum(x ./ [1,2,3]), [true false]) == (NoTangent(),)
-@test gradient(x -> sum(x .* [1,2,3]'), (true, false)) == (NoTangent(),)
-
-tup_adj = gradient((x,y) -> sum(2 .* x .+ log.(y)), (1,2), [3,4,5]')
-@test tup_adj[1] == Tangent{Tuple{Int64, Int64}}(6.0, 6.0)
-@test tup_adj[2] ≈ [0.6666666666666666 0.5 0.4]
-@test tup_adj[2] isa Adjoint
-@test gradient(x -> sum(atan.(x, (1,2,3))), Diagonal([4,5,6]))[1] isa Diagonal
-
-@test_broken gradient(x -> sum(gradient(x -> sum(exp.(x)), x)[1]), [1,2,3]) # path 0, MethodError: no method matching Diffractor.Jet(::Int64, ::Float64, ::Tuple{Float64, Float64})
-@test_broken gradient(x -> sum(gradient(x -> sum(x' .* x), x)[1]), [1,2,3]) == ([6,6,6],) # Control flow support not fully implemented yet for higher-order reverse mode
-@test_broken gradient(x -> sum(gradient(x -> sum(x' ./ x.^2), x)[1]), [1,2,3])[1] ≈ [27.675925925925927, -0.824074074074074, -2.1018518518518516]
+@testset "broadcast" begin
+    @test gradient(x -> sum(x ./ x), [1,2,3]) == ([0,0,0],) # derivatives_given_output
+    @test gradient(x -> sum(sqrt.(atan.(x, x'))), [1,2,3])[1] ≈ [0.2338, -0.0177, -0.0661] atol=1e-3
+    @test gradient(x -> sum(exp.(log.(x))), [1,2,3]) == ([1,1,1],)
+
+    @test gradient(x -> sum((exp∘log).(x)), [1,2,3]) == ([1,1,1],) # stores pullback
+    exp_log(x) = exp(log(x))
+    @test gradient(x -> sum(exp_log.(x)), [1,2,3]) == ([1,1,1],)
+    @test gradient((x,y) -> sum(x ./ y), [1 2; 3 4], [1,2]) == ([1 1; 0.5 0.5], [-3, -1.75])
+    @test gradient((x,y) -> sum(x ./ y), [1 2; 3 4], 5) == ([0.2 0.2; 0.2 0.2], -0.4)
+    @test gradient(x -> sum((y -> y/x).([1,2,3])), 4) == (-0.375,) # closure
+
+    @test gradient(x -> sum(sum, (x,) ./ x), [1,2,3])[1] ≈ [-4.1666, 0.3333, 1.1666] atol=1e-3 # array of arrays
+    @test gradient(x -> sum(sum, Ref(x) ./ x), [1,2,3])[1] ≈ [-4.1666, 0.3333, 1.1666] atol=1e-3
+    @test gradient(x -> sum(sum, (x,) ./ x), [1,2,3])[1] ≈ [-4.1666, 0.3333, 1.1666] atol=1e-3
+    @test gradient(x -> sum(sum, (x,) .* x'), [1,2,3])[1] ≈ [12, 12, 12] # must not take the * fast path
+
+    @test unthunk.(gradient(x -> sum(x ./ 4), [1,2,3])) == ([0.25, 0.25, 0.25],)
+    @test gradient(x -> sum([1,2,3] ./ x), 4) == (-0.375,) # x/y rule
+    @test gradient(x -> sum(x.^2), [1,2,3]) == ([2.0, 4.0, 6.0],) # x.^2 rule
+    @test gradient(x -> sum([1,2,3] ./ x.^2), 4) == (-0.1875,) # scalar^2 rule
+
+    @test gradient(x -> sum((1,2,3) .- x), (1,2,3)) == (Tangent{Tuple{Int,Int,Int}}(-1.0, -1.0, -1.0),)
+    @test gradient(x -> sum([1,2,3]' .- x), (1,2,3)) == (Tangent{Tuple{Int,Int,Int}}(-3.0, -3.0, -3.0),)
+    @test gradient(x -> sum([1 2 3] .+ x .^ 2), (1,2,3)) == (Tangent{Tuple{Int,Int,Int}}(6.0, 12.0, 18.0),)
+
+    @test gradient(x -> sum(x .> 2), [1,2,3]) == (ZeroTangent(),) # Bool output
+    @test gradient(x -> sum(1 .+ iseven.(x)), [1,2,3]) == (ZeroTangent(),)
+    @test gradient((x,y) -> sum(x .== y), [1,2,3], [1 2 3]) == (ZeroTangent(), ZeroTangent())
+    @test gradient(x -> sum(x .+ [1,2,3]), true) == (NoTangent(),) # Bool input
+    @test gradient(x -> sum(x ./ [1,2,3]), [true false]) == (NoTangent(),)
+    @test_broken gradient(x -> sum(x .* [1,2,3]'), (true, false)) == (NoTangent(),) # Cannot `convert` an object of type NoTangent to an object of type ZeroTangent
+
+    tup_adj = gradient((x,y) -> sum(2 .* x .+ log.(y)), (1,2), [3,4,5]')
+    @test tup_adj[1] == Tangent{Tuple{Int64, Int64}}(6.0, 6.0)
+    @test tup_adj[2] ≈ [0.6666666666666666 0.5 0.4]
+    @test tup_adj[2] isa Adjoint
+    @test gradient(x -> sum(atan.(x, (1,2,3))), Diagonal([4,5,6]))[1] isa Diagonal
+
+    @test_broken gradient(x -> sum(gradient(x -> sum(exp.(x)), x)[1]), [1,2,3]) # path 0, MethodError: no method matching Diffractor.Jet(::Int64, ::Float64, ::Tuple{Float64, Float64})
+    @test_broken gradient(x -> sum(gradient(x -> sum(x' .* x), x)[1]), [1,2,3]) == ([6,6,6],) # Control flow support not fully implemented yet for higher-order reverse mode
+    @test_broken gradient(x -> sum(gradient(x -> sum(x' ./ x.^2), x)[1]), [1,2,3])[1] ≈ [27.675925925925927, -0.824074074074074, -2.1018518518518516]
+end
 
 # Higher order control flow not yet supported (https://github.com/JuliaDiff/Diffractor.jl/issues/24)
 #include("pinn.jl")
