Commit 5443900

test with Yota too, and document this
1 parent 7d0c939 commit 5443900

File tree: 4 files changed, +56 −14 lines

Project.toml

Lines changed: 3 additions & 1 deletion

@@ -13,13 +13,15 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 [compat]
 ChainRulesCore = "1"
 Functors = "0.3"
+Yota = "0.7.3"
 Zygote = "0.6.40"
 julia = "1.6"

 [extras]
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Yota = "cd998857-8626-517d-b929-70ad188a48f0"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

 [targets]
-test = ["Test", "StaticArrays", "Zygote"]
+test = ["Test", "StaticArrays", "Yota", "Zygote"]
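Although not part of this commit, the effect of the `[extras]`/`[targets]` entries above is that Yota is resolved only for the test environment, never for ordinary users of the package. A quick sketch of how that test environment gets exercised, assuming Optimisers.jl is available in the active environment:

```julia
# Sketch, not from the commit: the test-only deps listed in [extras]/[targets]
# (Test, StaticArrays, Yota, Zygote) are installed just for this call.
using Pkg
Pkg.test("Optimisers")   # runs test/runtests.jl in a temporary test environment
```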

docs/src/index.md

Lines changed: 37 additions & 12 deletions

@@ -38,7 +38,7 @@ to adjust the model:

 ```julia

-using Flux, Metalhead, Optimisers
+using Flux, Metalhead, Zygote, Optimisers

 model = Metalhead.ResNet(18) |> gpu  # define a model to train
 image = rand(Float32, 224, 224, 3, 1) |> gpu;  # dummy data
@@ -72,14 +72,29 @@ This `∇model` is another tree structure, rather than the dictionary-like objec
 Zygote's "implicit" mode `gradient(() -> loss(...), Flux.params(model))` -- see
 [Zygote's documentation](https://fluxml.ai/Zygote.jl/dev/#Explicit-and-Implicit-Parameters-1) for more about this difference.

+## Usage with [Yota.jl](https://github.com/dfdx/Yota.jl)
+
+Yota is another modern automatic differentiation package, an alternative to Zygote.
+
+Its main function is `Yota.grad`, which returns the loss as well as the gradient (like `Zygote.withgradient`)
+but also returns a gradient component for the loss function.
+To extract what Optimisers.jl needs, you can write `_, (_, ∇model) = Yota.grad(f, model, data)`
+or, for the Flux model above:
+
+```julia
+loss, (∇function, ∇model, ∇image) = Yota.grad(model, image) do m, x
+    sum(m(x))
+end;
+```
+
 ## Usage with [Lux.jl](https://github.com/avik-pal/Lux.jl)

 The main design difference of Lux is that the tree of parameters is separate from
 the layer structure. It is these parameters which `setup` and `update` need to know about.

 Lux describes this separation of parameter storage from model description as "explicit" parameters.
 Beware that it has nothing to do with Zygote's notion of "explicit" gradients.
-(If the same model is written in Flux and Lux, `∇model` above and `∇params` below will often be
+(If the same model is written in Flux and Lux, `∇model` above and `∇params` below will be nearly
 identical trees of nested `NamedTuple`s.)

 ```julia
@@ -88,27 +103,37 @@ using Lux, Boltz, Zygote, Optimisers

 lux_model, params, lux_state = Boltz.resnet(:resnet18) |> gpu;  # define and initialise model
 images = rand(Float32, 224, 224, 3, 4) |> gpu;  # batch of dummy data
-y, _ = Lux.apply(lux_model, images, params, lux_state);  # run the model
+y, lux_state = Lux.apply(lux_model, images, params, lux_state);  # run the model
 @show sum(y)  # initial dummy loss

 rule = Optimisers.Adam()
 opt_state = Optimisers.setup(rule, params);  # optimiser state based on model parameters

-∇params, _ = gradient(params, images) do p, x  # gradient with respect to parameter tree
-    y, _ = Lux.apply(lux_model, x, p, lux_state)
-    sum(y)
-end;
+(loss, lux_state), back = Zygote.pullback(params, images) do p, x
+    y, st = Lux.apply(lux_model, x, p, lux_state)
+    sum(y), st  # return both the loss, and the updated lux_state
+end
+∇params, _ = back((one.(loss), nothing))  # gradient of only the loss, with respect to parameter tree

-opt_state, params = Optimisers.update!(opt_state, params, ∇params);
+@show sum(loss)

-y, _ = Lux.apply(lux_model, images, params, lux_state);
-@show sum(y)
+opt_state, params = Optimisers.update!(opt_state, params, ∇params);

 ```

 Besides the parameters stored in `params` and gradually optimised, any other model state
-is stored in `lux_state`. For simplicity this example does not show how to propagate the
-updated `lux_state` to the next iteration, see Lux's documentation.
+is stored in `lux_state`, and returned by `Lux.apply`.
+This is completely unrelated to Optimisers.jl's state, although designed in a similar spirit.
+If you are certain there is no model state, then the gradient calculation can
+be simplified to use `Zygote.gradient` instead of `Zygote.pullback`:
+
+```julia
+∇params, _ = gradient(params, images) do p, x
+    y, _ = Lux.apply(lux_model, x, p, lux_state)  # discards new lux_state
+    sum(y)
+end;
+```
+

 ## Non-`trainable` Parameters
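The new documentation section stops at the gradient call. As a sketch only (not text from the diff), reusing `model` and `image` from the Flux example above, the `∇model` returned by `Yota.grad` then feeds into Optimisers.jl exactly as a Zygote gradient would:

```julia
# Sketch, not part of the committed docs: one complete update step using Yota's gradient.
using Optimisers, Yota

state = Optimisers.setup(Optimisers.Adam(), model)      # optimiser state mirroring the model tree
loss, (_, ∇model, _) = Yota.grad(model, image) do m, x  # discard gradients w.r.t. the closure and the data
    sum(m(x))
end
state, model = Optimisers.update!(state, model, ∇model) # apply one Adam step to the model tree
```

The seed `(one.(loss), nothing)` passed to `back` in the Lux hunk above plays a related role: the pullback's output is the tuple `(loss, state)`, so seeding with `one.(loss)` and `nothing` requests the gradient of the scalar loss only, without differentiating through the returned `lux_state`.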

test/rules.jl

Lines changed: 15 additions & 0 deletions

@@ -229,3 +229,18 @@ end
     @test static_loss(static_model) < 1.9
   end
 end
+
+@testset "using Yota" begin
+  @testset "$(name(o))" for o in RULES
+    w′ = (abc = (α = rand(3, 3), β = rand(3, 3), γ = rand(3)), d = (δ = rand(3), ε = eps))
+    w = (abc = (α = 5rand(3, 3), β = rand(3, 3), γ = rand(3)), d = (δ = rand(3), ε = eps))
+    st = Optimisers.setup(o, w)
+    loss(x, y) = mean((x.abc.α .* x.abc.β .- y.abc.α .* y.abc.β) .^ 2)  # does not use γ, δ, ε
+    @test loss(w, w′) > 0.5
+    for i = 1:10^4
+      _, (_, g, _) = Yota.grad(loss, w, w′)
+      st, w = Optimisers.update(st, w, g)
+    end
+    @test loss(w, w′) < 0.001
+  end
+end
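For reference, the destructuring `_, (_, g, _) = Yota.grad(loss, w, w′)` in the new testset relies on `Yota.grad` returning the value plus one gradient per argument, with the first slot belonging to the differentiated function itself. A minimal standalone sketch of the same pattern, with hypothetical values not taken from the diff:

```julia
# Sketch, not from the commit: Yota.grad's return convention with nested NamedTuples.
using Optimisers, Statistics, Yota

w  = (abc = (α = 5rand(3, 3), β = rand(3, 3)),)
w′ = (abc = (α = rand(3, 3), β = rand(3, 3)),)
loss(x, y) = mean((x.abc.α .* x.abc.β .- y.abc.α .* y.abc.β) .^ 2)

val, grads = Yota.grad(loss, w, w′)         # grads == (∇function, ∇w, ∇w′), one per argument of loss
st = Optimisers.setup(Optimisers.Descent(0.1), w)
st, w = Optimisers.update(st, w, grads[2])  # grads[2] is the gradient for w, the trainable tree
```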

test/runtests.jl

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 using Optimisers
-using ChainRulesCore, Functors, StaticArrays, Zygote
+using ChainRulesCore, Functors, StaticArrays, Zygote, Yota
 using LinearAlgebra, Statistics, Test, Random
 using Optimisers: @.., @lazy

