Commit 0cc6190

Fixup
1 parent c01ab6f commit 0cc6190

File tree

2 files changed: +27 additions, -47 deletions


src/losses/utils.jl

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,5 @@
+import Enzyme
+
 """
     xlogx(x)
 
@@ -36,5 +38,4 @@ end
 _check_sizes(ŷ, y) = nothing # pass-through, for constant label e.g. y = 1
 
 ChainRulesCore.@non_differentiable _check_sizes(ŷ::Any, y::Any)
-import Enzyme
 Enzyme.EnzymeRules.inactive(::typeof(_check_sizes), args...) = true
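This hunk moves `import Enzyme` to the top of the file and keeps `_check_sizes` marked inactive, so Enzyme treats the size check as constant and does not differentiate through it. A minimal standalone sketch of how `EnzymeRules.inactive` behaves; the helper `_validate` and the function `f` are illustrative assumptions, not part of Flux:

```julia
import Enzyme

# Hypothetical validation helper; it returns nothing, so there is nothing to differentiate.
_validate(x) = (isempty(x) && error("empty input"); nothing)

# Marking it inactive tells Enzyme to treat calls to it as constant,
# mirroring the `inactive(::typeof(_check_sizes), ...)` rule above.
Enzyme.EnzymeRules.inactive(::typeof(_validate), args...) = true

f(x) = (_validate(x); sum(abs2, x))

x  = [1.0, 2.0, 3.0]
dx = zero(x)
Enzyme.autodiff(Enzyme.Reverse, f, Enzyme.Active, Enzyme.Duplicated(x, dx))
# dx now holds 2 .* x; the gradient skips `_validate` entirely
```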

src/train.jl

Lines changed: 25 additions & 46 deletions
@@ -7,7 +7,7 @@ using ..Flux: Flux # used only in docstring
 import ..Flux.Optimise: train!, update! # during 0.13, we add methods to the old functions
 import Enzyme
 
-export setup, train!, train_enzyme!
+export setup, train!
 
 using ProgressLogging: @progress, @withprogress, @logprogress
 using Zygote: Zygote, Params
@@ -53,6 +53,12 @@ function setup(rule::Optimisers.AbstractRule, model)
   state
 end
 
+_make_zero_internal!(x::AbstractArray) = fill!(x, 0)
+_make_zero_internal!(x) = x
+_make_zero!(model) = fmap(_make_zero_internal!, model)
+
+_applyloss(loss, model, d...) = loss(model, d...)
+
 """
     train!(loss, model, data, opt_state)
 
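The new `_make_zero!` helper walks a model tree with `fmap` and zeroes every array leaf in place, which is how the shadow (gradient) copy is reset before each Enzyme call. A rough sketch of the idea on a toy nested structure; the names `_zero_leaf!`, `zero_grads!`, and the sample `shadow` are illustrative only:

```julia
using Functors: fmap  # fmap comes from Functors (also available via Flux)

_zero_leaf!(x::AbstractArray) = fill!(x, 0)
_zero_leaf!(x) = x  # leave non-array leaves (functions, numbers) untouched

zero_grads!(shadow) = fmap(_zero_leaf!, shadow)

# Toy "shadow" holding stale gradients:
shadow = (W = [1.0 2.0; 3.0 4.0], b = [5.0, 6.0], σ = tanh)
zero_grads!(shadow)
shadow.W  # now all zeros; σ is untouched
```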
@@ -61,6 +67,9 @@ according to a particular optimisation rule encoded in `opt_state`.
 Iterates through `data` once, evaluating for each `d in data` either
 `loss(model, d...)` if `d isa Tuple`, or else `loss(model, d)` for other `d`.
 
+If `model` is an Enzyme.Duplicated, gradients will be computed with Enzyme,
+otherwise they will be computed with Zygote.
+
 For example, with these definitions...
 ```
 data = [(x1, y1), (x2, y2), (x3, y3)]
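Per the new docstring note, a single `train!` entry point now picks the AD backend from the model's type. A hedged usage sketch: the model, data, loss, and the `Enzyme.make_zero` shadow construction below are assumptions for illustration, not taken from this commit:

```julia
using Flux
import Enzyme

model = Chain(Dense(2 => 3, relu), Dense(3 => 1))
data  = [(randn(Float32, 2, 8), randn(Float32, 1, 8))]
loss(m, x, y) = Flux.mse(m(x), y)
opt_state = Flux.setup(Adam(1f-3), model)

# Plain model: gradients via Zygote (existing behaviour).
Flux.train!(loss, model, data, opt_state)

# Duplicated model: gradients via Enzyme, accumulated into the shadow copy.
dup = Enzyme.Duplicated(model, Enzyme.make_zero(model))
Flux.train!(loss, dup, data, opt_state)
```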
@@ -101,60 +110,30 @@ function train!(loss, model, data, opt; cb = nothing)
     For more control use a loop with `gradient` and `update!`.""")
   @withprogress for (i,d) in enumerate(data)
     d_splat = d isa Tuple ? d : (d,)
-    l, gs = Zygote.withgradient(m -> loss(m, d_splat...), model)
-    if !isfinite(l)
-      throw(DomainError(lazy"Loss is $l on data item $i, stopping training"))
-    end
-    opt, model = Optimisers.update!(opt, model, gs[1])
-    @logprogress Base.haslength(data) ? i/length(data) : nothing
-  end
-end
+
+    if model isa Enzyme.Duplicated
+      _make_zero!(model.dval)
+      _, l = Enzyme.autodiff(Enzyme.ReverseWithPrimal, _applyloss, Enzyme.Active, Enzyme.Const(loss), model, map(Enzyme.Const, d_splat)...)
 
-_make_zero_internal!(x::AbstractArray) = fill!(x, 0)
-_make_zero_internal!(x) = x
-_make_zero!(model) = fmap(_make_zero_internal!, model)
+      if !isfinite(l)
+        throw(DomainError(lazy"Loss is $l on data item $i, stopping training"))
+      end
+      opt, model2 = Optimisers.update!(opt, model.val, model.dval)
+      model = Enzyme.Duplicated(model2, model.dval)
+    else
+      l, gs = Zygote.withgradient(m -> loss(m, d_splat...), model)
 
-_applyloss(loss, model, d...) = loss(model, d...)
+      if !isfinite(l)
+        throw(DomainError(lazy"Loss is $l on data item $i, stopping training"))
+      end
 
-"""
-    train_enzyme!(loss, model_and_shadow, data, opt_state)
-
-Like [`train!`](@ref), but gradient computed in place using [Enzyme](github.com/EnzymeAD/Enzyme.jl)
-"""
-function train!(loss, model_and_shadow::Enzyme.Duplicated, data, opt_state::T) where T<:Optimisers.AbstractRule
-  @withprogress for (i,d) in enumerate(data)
-    d_splat = d isa Tuple ? d : (d,)
-    _make_zero!(model_and_shadow.dval)
-    _, l = Enzyme.autodiff(Enzyme.ReverseWithPrimal, _applyloss, Enzyme.Active, Enzyme.Const(loss), model_and_shadow, map(Enzyme.Const, d_splat)...)
-
-    if !isfinite(l)
-      throw(DomainError(lazy"Loss is $l on data item $i, stopping training"))
-    end
-    opt_state, model = Optimisers.update!(opt_state, model_and_shadow.val, model_and_shadow.dval)
-    model_and_shadow = Enzyme.Duplicated(model, model_and_shadow.dval)
-    @logprogress Base.haslength(data) ? i/length(data) : nothing
-  end
-end
+      opt, model = Optimisers.update!(opt, model, gs[1])
 
-# Required per method ambiguity with
-#   train!(loss, model, data, opt::Flux.Optimise.AbstractOptimiser; cb)
-#   @ Flux ~/work/Flux.jl/Flux.jl/src/deprecations.jl:110
-function train!(loss, model_and_shadow::Enzyme.Duplicated, data, opt_state::Flux.Optimise.AbstractOptimiser)
-  @withprogress for (i,d) in enumerate(data)
-    d_splat = d isa Tuple ? d : (d,)
-    _make_zero!(model_and_shadow.dval)
-    _, l = Enzyme.autodiff(Enzyme.ReverseWithPrimal, _applyloss, Enzyme.Active, Enzyme.Const(loss), model_and_shadow, map(Enzyme.Const, d_splat)...)
-
-    if !isfinite(l)
-      throw(DomainError(lazy"Loss is $l on data item $i, stopping training"))
-    end
     end
-    opt_state, model = Optimisers.update!(opt_state, model_and_shadow.val, model_and_shadow.dval)
-    model_and_shadow = Enzyme.Duplicated(model, model_and_shadow.dval)
     @logprogress Base.haslength(data) ? i/length(data) : nothing
   end
 end
 
-
 # This method let you use Optimisers.Descent() without setup, when there is no state
 function train!(loss, model, data, rule::Optimisers.AbstractRule; cb = nothing)
   train!(loss, model, data, _rule_to_state(model, rule); cb)
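For reference, a hedged standalone sketch of the `Enzyme.autodiff(Enzyme.ReverseWithPrimal, ...)` pattern the new loop relies on; the toy `sqloss`, `w`, `x`, and `y` are illustrative, not from Flux:

```julia
import Enzyme

# `ReverseWithPrimal` returns (derivatives, primal), so the loss value comes back
# without a separate forward pass; gradients accumulate into the Duplicated shadow.
sqloss(w, x, y) = sum(abs2, w .* x .- y)

w, dw = [1.0, 2.0], [0.0, 0.0]
x, y  = [3.0, 4.0], [5.0, 6.0]

_, l = Enzyme.autodiff(Enzyme.ReverseWithPrimal, sqloss,
                       Enzyme.Active,
                       Enzyme.Duplicated(w, dw),
                       Enzyme.Const(x), Enzyme.Const(y))
# l is the loss at w; dw now holds ∂l/∂w, while x and y stay constant
```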
