Commit df4019d

replace at-adjoint with rrule
1 parent 9b21e2c · commit df4019d

13 files changed: +43 −33 lines

Project.toml

Lines changed: 3 additions & 1 deletion
@@ -7,6 +7,7 @@ AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
@@ -33,6 +34,7 @@ AbstractTrees = "0.3"
 Adapt = "3.0"
 ArrayInterface = "3.1, 4"
 CUDA = "3"
+ChainRulesCore = "1.12"
 CodecZlib = "0.7"
 Colors = "0.12"
 Functors = "0.2.1"
@@ -43,7 +45,7 @@ ProgressLogging = "0.1"
 Reexport = "0.2, 1.0"
 StatsBase = "0.33"
 ZipFile = "0.9"
-Zygote = "0.6"
+Zygote = "0.6.34"
 julia = "1.6"
 
 [extras]

src/Flux.jl

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ using MacroTools: @forward
 @reexport using NNlib
 using Zygote: Params, @adjoint, gradient, pullback, @nograd
 export gradient
+using ChainRulesCore
 
 export Chain, Dense, Maxout, SkipConnection, Parallel, flatten,
   RNN, LSTM, GRU, GRUv3,

src/cuda/cuda.jl

Lines changed: 3 additions & 2 deletions
@@ -3,8 +3,9 @@ module CUDAint
 using ..CUDA
 
 import ..Flux: Flux
-import Zygote
-using Zygote: @adjoint
+# import Zygote
+# using Zygote: @adjoint
+using ChainRulesCore
 import NNlib, NNlibCUDA
 
 include("cudnn.jl")

src/cuda/cudnn.jl

Lines changed: 3 additions & 2 deletions
@@ -11,10 +11,11 @@ function (BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}},
             training=Flux._isactive(BN)))
 end
 
-@adjoint function batchnorm(g, b, x, running_mean, running_var, momentum; kw...)
+function ChainRulesCore.rrule(::typeof(batchnorm), g, b, x, running_mean, running_var, momentum; kw...)
   y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...)
   function batchnorm_pullback(Δ)
-    ∇batchnorm(g, b, x, Δ, running_mean, running_var, momentum; kw...)..., nothing, nothing, nothing
+    grad = ∇batchnorm(g, b, x, Δ, running_mean, running_var, momentum; kw...)
+    (NoTangent(), grad..., NoTangent(), NoTangent(), NoTangent())
   end
   y, batchnorm_pullback
 end
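
A note on the pattern used here and in the files below: a `ChainRulesCore.rrule` method receives the function itself as a `::typeof(f)` argument, and its pullback must return one tangent per positional argument plus a leading `NoTangent()` for the function (keyword arguments get none); slots that `@adjoint` filled with `nothing` become `NoTangent()`. A minimal sketch of the translation, using a hypothetical `scale` function that is not part of this commit:

```julia
using ChainRulesCore

scale(w, x) = w .* x

# rrule counterpart of `@adjoint scale(w, x) = w .* x, Δ -> (Δ .* x, Δ .* w)`
function ChainRulesCore.rrule(::typeof(scale), w, x)
    y = scale(w, x)
    function scale_pullback(Δ)
        # leading NoTangent() is the tangent for `scale` itself
        return NoTangent(), Δ .* x, Δ .* w
    end
    return y, scale_pullback
end
```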

src/functor.jl

Lines changed: 5 additions & 5 deletions
@@ -120,12 +120,12 @@ adapt_storage(to::FluxCPUAdaptor, x::AbstractSparseArray) = x
 adapt_storage(to::FluxCPUAdaptor, x::CUDA.RNG) = Random.default_rng()
 adapt_storage(to::FluxCPUAdaptor, x::AbstractRNG) = x
 
-Zygote.@adjoint function Array(x::CUDA.CuArray)
-  Array(x), d -> (CUDA.cu(d),)
+function ChainRulesCore.rrule(::typeof(Array), x::CUDA.CuArray)
+  Array(x), d -> (NoTangent(), CUDA.cu(d),)
 end
 
-Zygote.@adjoint function Adapt.adapt_storage(to::FluxCPUAdaptor, x::CUDA.AbstractGPUArray)
-  adapt_storage(to, x), d -> (nothing, adapt_storage(FluxCUDAAdaptor(), d),)
+function ChainRulesCore.rrule(::typeof(Adapt.adapt_storage), to::FluxCPUAdaptor, x::CUDA.AbstractGPUArray)
+  adapt_storage(to, x), d -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), d),)
 end
 
 # CPU/GPU movement conveniences
@@ -202,7 +202,7 @@ function check_use_cuda()
     end
   end
 end
-Zygote.@nograd check_use_cuda
+ChainRulesCore.@non_differentiable check_use_cuda()
 
 # Precision
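
These rules cover GPU→CPU array movement, so cotangents flowing back through `Array` (or `adapt_storage`) are returned on the GPU. A rough way to sanity-check the `Array(::CuArray)` rule with Zygote, which consumes ChainRulesCore rrules; this sketch is not part of the commit and needs a functional CUDA device:

```julia
using CUDA, Zygote

x = CUDA.cu(Float32[1, 2, 3])
y, back = Zygote.pullback(Array, x)   # hits the rrule above
dx, = back(ones(Float32, 3))
dx isa CUDA.CuArray                   # expected: true, cotangent moved back onto the GPU
```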

src/layers/conv.jl

Lines changed: 1 addition & 2 deletions
@@ -275,8 +275,7 @@ function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
   )
 end
 
-# TODO: Find proper fix for https://github.com/FluxML/Flux.jl/issues/900
-@nograd conv_transpose_dims
+ChainRulesCore.@non_differentiable conv_transpose_dims(::Any, ::Any)
 
 function (c::ConvTranspose)(x::AbstractArray)
   b = reshape(c.bias, map(_->1, c.stride)..., :, 1)
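
Unlike `Zygote.@nograd`, which takes bare function names, `ChainRulesCore.@non_differentiable` takes a call signature, which is why the argument types are spelled out above. A small sketch with a hypothetical helper, not from this commit:

```julia
using ChainRulesCore

# an integer-valued helper there is no point differentiating through
output_length(len::Int, stride::Int) = div(len - 1, stride) + 1

# old style:  Zygote.@nograd output_length
# new style:  a call signature covering however the function is invoked
ChainRulesCore.@non_differentiable output_length(::Any, ::Any)
```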

src/layers/normalise.jl

Lines changed: 5 additions & 8 deletions
@@ -1,6 +1,6 @@
 istraining() = false
 
-@adjoint istraining() = true, _ -> nothing
+ChainRulesCore.rrule(::typeof(istraining)) = true, _ -> (NoTangent(),)
 
 _isactive(m) = isnothing(m.active) ? istraining() : m.active
 
@@ -38,12 +38,6 @@ function dropout(rng, x, p; dims=:, active::Bool=true)
 end
 dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...)
 
-@adjoint function dropout(rng, x, p; dims=:, active::Bool=true)
-  active || return x, Δ -> (Δ, nothing)
-  y = dropout_mask(rng, x, p, dims=dims)
-  return x .* y, Δ -> (nothing, Δ .* y, nothing)
-end
-
 dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
 dropout_mask(rng, x::CuArray, p; kwargs...) =
   throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays."))
@@ -54,6 +48,8 @@ function _dropout_mask(rng, x, p; dims=:)
   return y
 end
 
+ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any)
+
 """
     Dropout(p; dims=:, rng = rng_from_array())
 
@@ -230,7 +226,8 @@ function _track_stats!(
   bn.σ² = res_mtm .* bn.σ² .+ mtm .* (m / (m - one(V))) .* σ²new
   return nothing
 end
-Zygote.@nograd _track_stats!
+
+ChainRulesCore.@non_differentiable _track_stats!(::Any...)
 
 """
     BatchNorm(channels::Integer, λ=identity;
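
The hand-written dropout `@adjoint` is dropped rather than translated: once `dropout_mask` is marked `@non_differentiable`, AD treats the mask as a constant and differentiating the remaining `x .* y` broadcast in `dropout` reproduces the old `Δ .* y` rule automatically. A standalone sketch of that idea, using a stand-in mask function rather than Flux's internals:

```julia
using ChainRulesCore, Random, Zygote

make_mask(rng, x, p) = rand!(rng, similar(x)) .> p   # stand-in for dropout_mask
ChainRulesCore.@non_differentiable make_mask(::Any, ::Any, ::Any)

apply_dropout(rng, x, p) = x .* make_mask(rng, x, p) ./ (1 - p)

x = rand(Float32, 4)
g = Zygote.gradient(x -> sum(apply_dropout(MersenneTwister(0), x, 0.5f0)), x)[1]
# g == mask ./ (1 - p); no custom adjoint for apply_dropout is required
```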

src/layers/recurrent.jl

Lines changed: 7 additions & 6 deletions
@@ -6,13 +6,14 @@ gate(x::AbstractMatrix, h, n) = view(x, gate(h,n), :)
 # AD-friendly helper for dividing monolithic RNN params into equally sized gates
 multigate(x::AbstractArray, h, ::Val{N}) where N = ntuple(n -> gate(x,h,n), N)
 
-@adjoint function multigate(x::AbstractArray, h, c)
+function ChainRulesCore.rrule(::typeof(multigate), x::AbstractArray, h, c)
   function multigate_pullback(dy)
-    dx = Zygote._zero(x, eltype(x))
-    map(multigate(dx, h, c), dy) do dxᵢ, dyᵢ
-      dyᵢ !== nothing && (dxᵢ .= Zygote.accum.(dxᵢ, dyᵢ));
+    dx = map!(zero, similar(x, float(eltype(x)), axes(x)), x)
+    foreach(multigate(dx, h, c), dy) do dxᵢ, dyᵢ
+      dyᵢ isa AbstractZero && return
+      @. dxᵢ += dyᵢ
     end
-    return (dx, nothing, nothing)
+    return (NoTangent(), dx, NoTangent(), NoTangent())
   end
   return multigate(x, h, c), multigate_pullback
 end
@@ -435,7 +436,7 @@ julia> g(rand(Float32, 3, 10)) |> size # batch size of 10
 GRUv3(a...; ka...) = Recur(GRUv3Cell(a...; ka...))
 Recur(m::GRUv3Cell) = Recur(m, m.state0)
 
-
+# TODO move to ChainRulesCore?
 @adjoint function Broadcast.broadcasted(f::Recur, args...)
   Zygote.∇map(__context__, f, args...)
 end

src/losses/Losses.jl

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ module Losses
 using Statistics
 using Zygote
 using Zygote: @adjoint
+using ChainRulesCore
 using ..Flux: ofeltype, epseltype
 using CUDA
 using NNlib: logsoftmax, logσ

src/losses/ctc.jl

Lines changed: 3 additions & 4 deletions
@@ -133,10 +133,9 @@ for mathematical details.
 """
 ctc_loss(ŷ::AbstractArray, y) = ctc_alpha(ŷ, y).loss
 
-@adjoint function ctc_loss(ŷ, y)
-  out = ctc_alpha(ŷ, y)
-  ctc_loss_pullback(Δ) = (Δ .* ∇ctc_loss(ŷ, y, out), nothing)
-  return out.loss, ctc_loss_pullback
+function ChainRulesCore.rrule(::typeof(ctc_loss), ŷ, y)
+  ctc_loss_pullback(Δ) = (NoTangent(), Δ .* ∇ctc_loss(ŷ, y, out), NoTangent())
+  return ctc_loss(ŷ, y), ctc_loss_pullback
 end
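
Note that the replacement body above still refers to `out`, which was only defined on one of the deleted lines. A self-contained version of the same rule would keep the forward intermediates from `ctc_alpha` and reuse its loss, roughly as sketched below (written as if inside `Flux.Losses`; this is not the commit's code):

```julia
function ChainRulesCore.rrule(::typeof(ctc_loss), ŷ, y)
  out = ctc_alpha(ŷ, y)                 # forward pass, keeps the alpha intermediates
  ctc_loss_pullback(Δ) = (NoTangent(), Δ .* ∇ctc_loss(ŷ, y, out), NoTangent())
  return out.loss, ctc_loss_pullback
end
```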
