
Commit ed78e8a

RFC: Restrict train! to AbstractOptimiser (#1902)
1 parent b6dbefb commit ed78e8a

6 files changed, +49 −35 lines

.github/workflows/Downstream.yml

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,4 @@
 name: Downstream
-
 on:
   push:
     branches: [master]
@@ -10,6 +9,8 @@ jobs:
   test:
     name: ${{ matrix.package.repo }}/${{ matrix.package.group }}
     runs-on: ${{ matrix.os }}
+    env:
+      GROUP: ${{ matrix.package.group }}
     strategy:
       fail-fast: false
       matrix:
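
The new `GROUP` variable only exports an environment value; how it is consumed is up to each downstream package's own test script. A minimal sketch of the kind of test selection it typically drives (the file and group names below are hypothetical, not part of this commit):

```
# Hypothetical downstream test/runtests.jl: pick a test group based on the
# GROUP environment variable exported by the workflow above.
const GROUP = get(ENV, "GROUP", "All")

if GROUP == "All" || GROUP == "Core"
    include("core.jl")      # hypothetical test file
end
if GROUP == "All" || GROUP == "Layers"
    include("layers.jl")    # hypothetical test file
end
```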

src/deprecations.jl

Lines changed: 5 additions & 0 deletions
@@ -34,6 +34,11 @@ struct Zeros
 end
 Zeros(args...) = Zeros() # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros())
 
+function Optimise.update!(x::AbstractArray, x̄)
+  depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", :update!)
+  x .-= x̄
+end
+
 # Channel notation: Changed to match Conv, but very softly deprecated!
 # Perhaps change to @deprecate for v0.14, but there is no plan to remove these.
 Dense(in::Integer, out::Integer, σ = identity; kw...) =
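
For code that still calls the removed two-argument method, the deprecation message above already names the replacement. A small sketch of the migration (the array names here are illustrative):

```
using Flux

W  = rand(3, 3)
ΔW = 0.01 .* rand(3, 3)

# Before (now a deprecation warning): Flux.Optimise.update!(W, ΔW)
# After: do the plain in-place update directly.
W .-= ΔW
```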

src/optimise/optimisers.jl

Lines changed: 2 additions & 2 deletions
@@ -521,7 +521,7 @@ opt = AdaBelief()
 opt = AdaBelief(0.001, (0.9, 0.8))
 ```
 """
-mutable struct AdaBelief
+mutable struct AdaBelief <: AbstractOptimiser
   eta::Float64
   beta::Tuple{Float64,Float64}
   epsilon::Float64
@@ -553,7 +553,7 @@ mutable struct Optimiser <: AbstractOptimiser
   os::Vector{Any}
 end
 
-Optimiser(o...) = Optimiser(Any[o...])
+Optimiser(opts::AbstractOptimiser...) = Optimiser(Any[opts...])
 
 @forward Optimiser.os Base.getindex, Base.first, Base.last, Base.lastindex, Base.push!, Base.setindex!
 @forward Optimiser.os Base.iterate
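
With the constructor restricted to `AbstractOptimiser`, composing rules still works as before, but passing something that is not an optimiser now fails at construction time rather than later inside `apply!`. A quick sketch (the particular rules chosen here are just for illustration):

```
using Flux.Optimise: Optimiser, WeightDecay, Descent

# Fine: both arguments are AbstractOptimisers.
opt = Optimiser(WeightDecay(1e-4), Descent(0.01))

# Optimiser(0.01, Descent(0.01))   # now a MethodError at construction
```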

src/optimise/train.jl

Lines changed: 35 additions & 27 deletions
@@ -1,14 +1,6 @@
 using ProgressLogging: @progress, @withprogress, @logprogress
 import Zygote: Params, gradient
 
-"""
-    update!(x, x̄)
-
-Update the array `x` according to `x .-= x̄`.
-"""
-function update!(x::AbstractArray, x̄)
-  x .-= x̄
-end
 
 """
     update!(opt, p, g)
@@ -20,13 +12,13 @@ according to optimizer `opt` and the gradients `gs` (the gradient `g`).
 As a result, the parameters are mutated and the optimizer's internal state may change.
 The gradient could be mutated as well.
 """
-function update!(opt, x, x̄)
+function update!(opt::AbstractOptimiser, x, x̄)
   x̄r = ArrayInterface.restructure(x, x̄) # address some cases where Zygote's
                                         # output are not mutable, see #1510
   x .-= apply!(opt, x, x̄r)
 end
 
-function update!(opt, xs::Params, gs)
+function update!(opt::AbstractOptimiser, xs::Params, gs)
   for x in xs
     isnothing(gs[x]) && continue
     update!(opt, x, gs[x])
@@ -81,28 +73,44 @@ batchmemaybe(x) = tuple(x)
 batchmemaybe(x::Tuple) = x
 
 """
-    train!(loss, params, data, opt; cb)
-
-`train!` uses a `loss` function and training `data` to improve the
-[Model parameters](@ref) (`params`) based on a pluggable [Optimisers](@ref) (`opt`).
+    train!(loss, pars::Params, data, opt::AbstractOptimiser; [cb])
 
-For each datapoint `d` in `data`, compute the gradient of `loss` with
-respect to `params` through backpropagation and call the optimizer `opt`.
-If `d` is a tuple of arguments to `loss` call `loss(d...)`, else call `loss(d)`.
-
-To pass trainable parameters, call [`Flux.params`](@ref) with your model or just the
-layers you want to train, like `train!(loss, params(model), ...)` or `train!(loss, params(model[1:end-2]), ...)` respectively.
+Uses a `loss` function and training `data` to improve the
+model's parameters according to a particular optimisation rule `opt`.
 
-[Callbacks](@ref) are given with the keyword argument `cb`. For example, this will print "training"
-every 10 seconds (using [`Flux.throttle`](@ref)):
-`train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10))`
-
+For each `d in data`, first the gradient of the `loss` is computed like this:
+```
+    gradient(() -> loss(d...), pars)  # if d isa Tuple
+    gradient(() -> loss(d), pars)     # otherwise
+```
+Here `pars` is produced by calling [`Flux.params`](@ref) on your model.
+(Or just on the layers you want to train, like `train!(loss, params(model[1:end-2]), data, opt)`.)
+This is the "implicit" style of parameter handling.
+
+Then, this gradient is used by optimizer `opt` to update the parameters:
+```
+    update!(opt, pars, grads)
+```
+The optimiser should be from the [Flux.Optimise](@ref) module.
+Different optimisers can be combined using [Flux.Optimise.Optimiser](@ref).
+
+This training loop iterates through `data` once.
+You can use [`@epochs`](@ref) to do this several times, or
+use for instance `Iterators.repeat` to make a longer `data` iterator.
+
+## Callbacks
+
+[Callbacks](@ref) are given with the keyword argument `cb`.
+For example, this will print "training" every 10 seconds (using [`Flux.throttle`](@ref)):
+```
+train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10))
+```
+
 The callback can call [`Flux.stop`](@ref) to interrupt the training loop.
 
-Multiple optimisers and callbacks can be passed to `opt` and `cb` as arrays.
+Multiple callbacks can be passed to `cb` as an array.
 """
-function train!(loss, ps, data, opt; cb = () -> ())
-  ps = Params(ps)
+function train!(loss, ps::Params, data, opt::AbstractOptimiser; cb = () -> ())
   cb = runall(cb)
   n = (Base.IteratorSize(typeof(data)) == Base.HasLength()) ? length(data) : 0
   @withprogress for (i, d) in enumerate(data)
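
Under the tightened signature, `train!` requires an explicit `Params` collection and a Flux optimiser. A minimal end-to-end sketch of the implicit-style loop the docstring describes (the toy model and data below are made up for illustration):

```
using Flux
using Flux.Optimise: Descent

model = Dense(4, 1)                                   # toy model
data  = [(rand(Float32, 4, 8), rand(Float32, 1, 8)) for _ in 1:10]

loss(x, y) = Flux.Losses.mse(model(x), y)

ps  = Flux.params(model)   # a Params object, as the new method signature requires
opt = Descent(0.1)         # any AbstractOptimiser from Flux.Optimise

Flux.train!(loss, ps, data, opt;
            cb = Flux.throttle(() -> println("training..."), 10))
```

Passing a plain vector of arrays is no longer accepted; wrap it as `Params([θ])`, as the test changes below do.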

test/data.jl

Lines changed: 2 additions & 2 deletions
@@ -76,7 +76,7 @@ using Random
     X = zeros(2, 10)
     loss(x) = sum((x .- θ).^2)
     d = DataLoader(X)
-    Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
+    Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1))
     @test norm(θ) < 1e-4
 
     # test interaction with `train!`
@@ -85,7 +85,7 @@ using Random
     Y = fill(2, 10)
     loss(x, y) = sum((y - x'*θ).^2)
     d = DataLoader((X, Y))
-    Flux.train!(loss, [θ], ncycle(d, 10), Descent(0.1))
+    Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1))
     @test norm(θ .- 1) < 1e-10
 
     # specify the rng

test/optimise.jl

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@ end
   l = 1
   Flux.train!(
     () -> (sleep(0.1); Flux.skip(); i+=1),
-    (),
+    Params([]),
    Iterators.repeated((), 10),
     Descent()
   )
@@ -59,7 +59,7 @@ end
 
   Flux.train!(
     () -> (sleep(0.1); i==8 && Flux.skip(); i+=1),
-    (),
+    Params([]),
     Iterators.repeated((), 10),
     Descent()
   )
@@ -68,7 +68,7 @@ end
 
   i = 0
   Flux.train!(() -> (sleep(0.1); i += 1; l),
-    (),
+    Params([]),
     Iterators.repeated((), 100),
     Descent(),
     cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1))
