Commit 640765a

mcabbott and ToucheSir committed
allow shared parameters, take III
Co-authored-by: Brian Chen <ToucheSir@users.noreply.github.com>
1 parent c94abf6 commit 640765a

4 files changed, +152 -45 lines changed


src/Optimisers.jl

Lines changed: 3 additions & 3 deletions

@@ -1,6 +1,6 @@
 module Optimisers
 
-using Functors: functor, fmap, isleaf
+using Functors: functor, fmap, isleaf, @functor, fmapstructure, children
 using LinearAlgebra
 
 include("interface.jl")
@@ -157,8 +157,8 @@ true
 julia> m # original should be discarded, may be mutated but no guarantee
 (x = Float32[0.6666666, 1.5333333], y = Float32[4.0, 5.0])
 
-julia> t # original state should likewise be discarded
-(x = Leaf(Momentum{Float64}(0.0333333, 0.9), Float32[0.333333, 0.466667]), y = Leaf(Momentum{Float64}(0.0333333, 0.9), Float32[0.0, 0.0]))
+julia> t == t2 # original state is in fact guaranteed to be mutated
+true
 ```
 """
 update!
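
The docstring change above tightens the contract of `update!`: the state tree is now guaranteed to be mutated, so the returned tree compares equal to the one passed in. A minimal REPL-style sketch of that guarantee, with illustrative values rather than the docstring's exact numbers:

    using Optimisers

    m = (x = Float32[1, 2], y = Float32[4, 5]);
    t = Optimisers.setup(Momentum(1/30, 0.9), m);

    # y has no gradient here; `nothing` is treated as zero and that Leaf is skipped
    t2, m2 = Optimisers.update!(t, m, (x = Float32[1, 1], y = nothing));

    t == t2   # true: the input state tree was updated in place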

src/adjust.jl

Lines changed: 2 additions & 2 deletions

@@ -47,8 +47,8 @@ adjust(tree; kw...) = map(st -> adjust(st; kw...), tree)
 adjust(::Nothing, ::Real) = nothing
 adjust(::Nothing; kw...) = nothing
 
-adjust(ℓ::Leaf, eta::Real) = Leaf(adjust(ℓ.rule, eta), ℓ.state)
-adjust(ℓ::Leaf; kw...) = Leaf(adjust(ℓ.rule; kw...), ℓ.state)
+adjust(ℓ::Leaf, eta::Real) = ℓ.frozen ? ℓ : Leaf(adjust(ℓ.rule, eta), ℓ.state, ℓ.frozen)
+adjust(ℓ::Leaf; kw...) = ℓ.frozen ? ℓ : Leaf(adjust(ℓ.rule; kw...), ℓ.state, ℓ.frozen)
 
 
 """

src/interface.jl

Lines changed: 67 additions & 34 deletions

@@ -1,57 +1,94 @@
 
-using ChainRulesCore: canonicalize, backing, Tangent, AbstractZero
+using ChainRulesCore: canonicalize, backing, Tangent, AbstractZero, ZeroTangent
 base(dx::Tangent) = backing(canonicalize(dx))
 base(dx) = dx
 const Zero = Union{Nothing, AbstractZero} # Union{Zygote, Diffractor}
 
 abstract type AbstractRule end
 
-struct Leaf{R,S}
+###
+### setup
+###
+
+mutable struct Leaf{R,S}
   rule::R
   state::S
+  frozen::Bool
 end
 
-function setup(rule, x; seen = Base.IdSet())
-  rule isa AbstractRule || Base.depwarn("In future, all optimisation rules should be <: AbstractRule", :setup)
-  if isnumeric(x)
-    x in seen && throw(ArgumentError("Optimisers.jl does not at present handle tied weights, sorry."))
-    isbits(x) || push!(seen, x)
-    return Leaf(rule, init(rule, x))
-  elseif isleaf(x)
-    return nothing
-  else
-    return map(xᵢ -> setup(rule, xᵢ; seen), _trainable(x))
+@functor Leaf
+
+Base.:(==)(a::Leaf, b::Leaf) = children(a) == children(b)
+
+function setup(rule::AbstractRule, model)
+  cnt = Ref(0)
+  # Rely on Functors to identify shared arrays, they will share a Leaf in this tree:
+  tree = fmapstructure(model, exclude = isnumeric) do x
+    cnt[] += 1
+    Leaf(rule, init(rule, x), false)
   end
+  cnt[] == 0 && @warn "setup found no parameters in the given model"
+  tree
 end
 
-subtract!(x, x̄) = maywrite(x) ? (x .= x .- x̄) : eltype(x).(x .- x̄)
-
-update!(::Nothing, x, ::Zero, ::Zero...) = nothing, x
-update!(::Nothing, x, x̄s...) = nothing, x
+function Base.show(io::IO, ℓ::Leaf) # show method is mostly to hide its long type!
+  ioc = IOContext(io, :compact => true)
+  print(ioc, "Leaf(", ℓ.rule, ", ")
+  show(ioc, ℓ.state)
+  print(ioc, ", ", ℓ.frozen, ")")
+end
 
-update!(ℓ::Leaf, x, ::Zero, ::Zero...) = ℓ, x
-function update!(ℓ::Leaf, x, x̄s...)
-  s′, x̄′ = apply!(ℓ.rule, ℓ.state, x, base.(x̄s)...)
-  Leaf(ℓ.rule, s′), subtract!(x, x̄′)
+###
+### update
+###
+
+function update!(tree, model, grad)
+  # First walk is to accumulate the gradient. This recursion visits every copy of
+  # shared leaves, but stops when branches are absent from the gradient:
+  dict = IdDict{Leaf, Any}()
+  grads!(dict, tree, model, grad)
+  # Second walk is to update the model, using same fmap walk as setup, thus each Leaf exactly once:
+  newmodel = fmap(model, tree; exclude = isnumeric) do x, ℓ
+    ℓ isa Leaf || error("this state does not match the model, expected a Leaf here")
+    ℓ.frozen && return x
+    haskey(dict, ℓ) || return x
+    s′, x̄′ = apply!(ℓ.rule, ℓ.state, x, dict[ℓ])
+    ℓ.state = s′ # to get state out of here, rely on mutability of Leaf
+    subtract!(x, x̄′)
+  end
+  tree, newmodel # note that tree is guaranteed to be updated
 end
 
-update!(tree, x, ::Zero, ::Zero...) = tree, x
-function update!(tree, x, x̄s...)
+subtract!(x, x̄) = maywrite(x) ? (x .= x .- x̄) : eltype(x).(x .- x̄)
+
+grads!(dict::IdDict, ℓ::Leaf, x, ::Zero) = nothing
+function grads!(dict::IdDict, ℓ::Leaf, x, x̄)
+  x̄₀ = get(dict, ℓ, false)
+  dict[ℓ] = Broadcast.broadcasted(+, x̄, x̄₀)
+  nothing
+end
+grads!(dict::IdDict, t, x, ::Zero) = nothing
+function grads!(dict::IdDict, tree, x, x̄s...)
+  # The only reason grads! takes model is that functor(typeof(x), base(x̄)) may differ from
+  # functor(typeof(tree), base(x̄)), for things like Transpose
   x̄s′ = map(x̄ -> functor(typeof(x), base(x̄))[1], x̄s)
-  x′, re = functor(typeof(x), x)
-  xtree = map((stᵢ, xᵢ, x̄sᵢ...) -> update!(stᵢ, xᵢ, x̄sᵢ...), tree, x′, x̄s′...)
-  map(first, xtree), re(map(last, xtree))
+  x′, _ = functor(typeof(x), x)
+  foreach((tᵢ, xᵢ, x̄sᵢ...) -> grads!(dict, tᵢ, xᵢ, x̄sᵢ...), tree, x′, x̄s′...)
 end
 
 function update(tree, x, x̄s...)
-  t′ = fmap(copy, tree; exclude = maywrite)
+  t′ = fmap(copy, tree; exclude = maywrite) # goes inside Leaf
   x′ = fmap(copy, x; exclude = maywrite)
   update!(t′, x′, x̄s...)
 end
 
 # default all rules to first order calls
 apply!(o, state, x, dx, dxs...) = apply!(o, state, x, dx)
 
+###
+### sources of truth
+###
+
 """
     isnumeric(x) -> Bool
 
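
This hunk is the heart of the change: `setup` walks the model with `fmapstructure`, so arrays that are `===` end up sharing a single `Leaf`, and `update!` first collects gradients per `Leaf` in an `IdDict` before applying the rule exactly once per parameter. A hedged usage sketch of the resulting tied-weight behaviour, mirroring the new tests in test/runtests.jl below (values are illustrative):

    using Optimisers

    twice = [1.0, 2.0]
    model = (twice, (copy(twice), twice))   # first and last entries are the same array
    tree  = Optimisers.setup(Momentum(0.1), model)
    tree[1] === tree[2][2]                  # true: tied arrays share one Leaf

    grads = ([3.0, 3.0], ([10.0, 10.0], [7.0, 7.0]))
    tree2, model2 = Optimisers.update(tree, model, grads)
    model2[1] === model2[2][2]              # true: the tie survives the update
    model2[1] ≈ model2[2][1]                # true: [3,3] and [7,7] were accumulated first
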
@@ -98,6 +135,10 @@ function _trainable(ch::NamedTuple, tr::Tuple) # for old Flux-style no-names tu
   map(c -> c in tr ? c : nothing, ch)
 end
 
+###
+### rule definition helpers
+###
+
 """
     @.. x = x + y
@@ -135,11 +176,3 @@ Broadcast.materialize(x::Lazy) = Broadcast.instantiate(x.bc)
 
 onevalue(λ::T, x::AbstractArray{T}) where T = map(_ -> λ, x)
 onevalue(λ, x::AbstractArray{T}) where T = onevalue(convert(float(T), λ), x)
-
-function Base.show(io::IO, ℓ::Leaf) # show method is mostly to hide its long type!
-  ioc = IOContext(io, :compact => true)
-  print(ioc, "Leaf(", ℓ.rule, ", ")
-  show(ioc, ℓ.state)
-  print(io, ")")
-end
-
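Note how `grads!` above accumulates lazily: each tied copy wraps its gradient with `Broadcast.broadcasted(+, ...)`, and the pending sum is only evaluated when it is finally consumed, fused into that broadcast. A standalone sketch of the idiom in plain Julia (not Optimisers code):

    acc = false                                       # broadcasts like zero, matching get(dict, ℓ, false)
    acc = Broadcast.broadcasted(+, [3.0, 3.0], acc)   # gradient from the first tied copy
    acc = Broadcast.broadcasted(+, [7.0, 7.0], acc)   # gradient from the second tied copy
    Broadcast.materialize(acc) == [10.0, 10.0]        # true: one fused sum at the end
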
test/runtests.jl

Lines changed: 80 additions & 6 deletions

@@ -35,6 +35,17 @@ Optimisers.trainable(x::TwoThirds) = (a = x.a,)
   g4 = Tangent{typeof(m)}(g...)
   s4, m4 = Optimisers.update!(s, ([1.0, 2.0],), g4)
   @test m4[1] ≈ [1,2] .- 0.1 .* [25, 33]
+
+  o5 = Momentum(0.1)
+  s5 = Optimisers.setup(o5, m)
+
+  s6, m6 = Optimisers.update(s5, m, g)
+  @test s6[1].state ≈ [2.5, 3.3]
+  @test s5[1].state == [0, 0] # not mutated -- wrong on v0.2.9
+
+  s7, m7 = Optimisers.update!(s5, m, g)
+  @test s7[1].state === s5[1].state # same array
+  @test s7[1] === s5[1] # same Leaf
 end
 
 @testset "gradient clipping" begin
@@ -212,12 +223,75 @@
 end
 
 @testset "tied weights" begin
-  ok = (1.0:3.0, sin, "abc", :abc)
-  m = (α = ok, β = rand(3), γ = ok)
-  m1 = (rand(3), m, rand(3))
-  @test Optimisers.setup(AdamW(), m1) isa Tuple
-  m2 = (rand(3), m, rand(3), m, rand(3)) # illegal
-  @test_throws ArgumentError Optimisers.setup(AdamW(), m2)
+  @testset "tuples" begin
+    twice = [1,2.0]
+    mtup = (twice, (copy(twice), twice)) # (tied (not tied, tied))
+
+    # simplest rule for which opt(g1) + opt(g2) != opt(g1 + g2)
+    stup = Optimisers.setup(Momentum(0.1), mtup)
+    gtup = ([3,3], ([10,10], [7,7])) # (g1, (g1 + g2, g2))
+
+    snew, mnew = Optimisers.update(stup, mtup, gtup)
+    @test mnew[1] ≈ mnew[2][1] # gradient was accumulated
+    @test mnew[2][2] === mnew[1] # and tie is not broken
+
+    st3, mt3 = Optimisers.update(stup, mtup, ([3,3], nothing))
+    @test mt3[1] ≈ [1,2] - 0.1 * [3,3]
+    @test mt3[2][2] === mt3[1]
+
+    st4, mt4 = Optimisers.update(stup, mtup, (nothing, ([5,5], [7,7])))
+    @test mt4[1] ≈ [1,2] - 0.1 * [7,7]
+  end
+
+  @testset "named" begin
+    thrice = [3f0]
+    model = (a = (x = thrice, y = Float32[4,5,6], z = true), b = ((m = (0, 1, thrice),),), c = (x = Float32[7,8], y = thrice))
+    tree = Optimisers.setup(Momentum(0.1, 0.9), model)
+    @test model.a.x === model.b[1].m[3] == model.c.y
+
+    loss(x::Array) = sum(abs2, x)
+    loss(x::Number) = x^3
+    loss(m) = sum(2 * loss(x) for x in m)
+    gradient(loss, model)
+    _, m2 = Optimisers.update(tree, model, gradient(loss, model)...)
+    @test m2.a.x === m2.b[1].m[3] == m2.c.y
+
+    loss3(m) = sum(x isa Tuple ? 0 : 2 * loss(x) for x in m)
+    gradient(loss3, model) # truncates the b limb
+    _, m3 = Optimisers.update(tree, model, gradient(loss3, model)...)
+    @test m3.a.x === m3.b[1].m[3] == m3.c.y
+  end
+
+  @testset "transpose" begin
+    mat = [1 2 3; 4 5 6.0]
+    bidir = (m = mat, f = log, t = transpose(mat), v = [7, 8, 9.0])
+    bigrad, _ = gradient((m, x) -> sum(abs2, m.m * (m.f).(m.t*x .+ m.v)), bidir, [1, 0.1])
+    @test bigrad.t isa Matrix # not a Transpose, that's the point here
+
+    state = Optimisers.setup(Descent(0.1), bidir)
+    @test state.t.parent === state.m # successfully tied
+
+    s2, b2 = Optimisers.update(state, bidir, bigrad)
+    @test b2.t.parent === b2.m # tie restored
+    @test b2.m ≈ bidir.m - 0.1 * (bigrad.m + transpose(bigrad.t)) # grad accumulated
+
+    state = Optimisers.setup(OptimiserChain(ClipGrad(10), Descent(0.1), ClipGrad(10)), bidir)
+    s2, b2 = Optimisers.update(state, bidir, bigrad)
+    @test b2.t.parent === b2.m
+    @test b2.m ≈ bidir.m - 0.1 * clamp.((bigrad.m + transpose(bigrad.t)), -10, 10)
+
+    # Similar, but now "primary" field is the transposed one:
+    tri = (a = transpose(mat), b = mat, c = transpose(mat), d = 4.0)
+    trigrad = gradient(m -> sum(abs2, m.a * (m.b * (m.c * [0.1, 1] .+ m.d) .- m.d)), tri)[1]
+    stri = Optimisers.setup(Descent(0.1), tri)
+    s3, t3 = Optimisers.update(stri, tri, trigrad)
+    @test t3.a.parent === t3.b === t3.c.parent
+    @test t3.a ≈ tri.a - 0.1 * (trigrad.a + trigrad.b' + trigrad.c)
+
+    g4 = (a = Broadcast.broadcasted(+, mat', 1), b = nothing, c = @thunk(mat' .+ 1), d = nothing)
+    # Error: no constructors for type Any
+    @test_broken s4, t4 = Optimisers.update(stri, tri, g4)
+  end
 end
 
 end
