@@ -25,7 +25,7 @@ init(o::Descent, x::AbstractArray) = nothing
 
 function apply!(o::Descent, state, x, dx)
   η = convert(float(eltype(x)), o.eta)
-
+
   return state, @lazy dx * η  # @lazy creates a Broadcasted, will later fuse with x .= x .- dx
 end
 
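For context, a minimal sketch of how this rule is exercised end to end through the package's exported `setup`/`update!` API; the array values are illustrative:

```julia
using Optimisers

x  = Float32[1.0, 2.0, 3.0]             # parameters (illustrative)
dx = Float32[0.1, 0.1, 0.1]             # gradient (illustrative)
st = Optimisers.setup(Descent(0.1), x)  # Descent keeps no state (init returns nothing)
st, x = Optimisers.update!(st, x, dx)   # fuses the lazy dx * η into x .= x .- dx * η
```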
@@ -51,7 +51,7 @@ init(o::Momentum, x::AbstractArray) = zero(x)
 function apply!(o::Momentum, state, x, dx)
   η, ρ, mvel = o.eta, o.rho, state
   @.. mvel = ρ * mvel + η * dx  # Macro @.. broadcasts into mvel if it can, else @. of rhs.
-
+
   return mvel, mvel
 end
 
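A standalone sketch of the recurrence the `@..` line implements; `momentum_step!` is a hypothetical helper, not the package code (which returns `mvel` and lets `update!` do the subtraction):

```julia
# Hypothetical helper illustrating classical momentum; the η, ρ defaults are made up.
function momentum_step!(x, mvel, dx; η = 0.01, ρ = 0.9)
    @. mvel = ρ * mvel + η * dx  # same in-place recurrence as the @.. line above
    @. x = x - mvel              # in the package, update! performs this subtraction
    return x, mvel
end
```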
@@ -79,7 +79,7 @@ function apply!(o::Nesterov, state, x, dx)
 
   newdx = @. -ρ^2 * vel + (1 + ρ) * η * dx  # Cannot be lazy as this needs the old velocity
   @.. vel = ρ * vel - η * dx
-
+
   return vel, newdx
 end
 
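The reasoning the comment compresses: `newdx` reads `vel`, and the next line overwrites `vel` in place; a lazy `Broadcasted` would be evaluated only after `vel` had changed, so `newdx` must be materialised eagerly with `@.`. A scalar illustration (values made up):

```julia
η, ρ = 0.001, 0.9
vel, dx = 0.5, 2.0
newdx = -ρ^2 * vel + (1 + ρ) * η * dx  # reads the OLD velocity: -0.405 + 0.0038
vel   = ρ * vel - η * dx               # only now is the velocity advanced
```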
@@ -125,7 +125,7 @@ function apply!(o::RMSProp, state, x, dx)
     @.. lin = ρ * lin + (1 - ρ) * dx
   end
   dx′ = @lazy dx * η / (sqrt(quad - abs2(lin)) + ϵ)
-
+
   return (quad, lin), dx′
 end
 
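In the centred variant shown here, `quad` tracks a running mean of `dx^2` and `lin` a running mean of `dx`, so `quad - abs2(lin)` estimates the gradient's variance, and the step is normalised by its square root. A scalar sketch (illustrative numbers):

```julia
ρ, η, ϵ = 0.9, 0.001, 1e-8
quad, lin, dx = 0.04, 0.1, 0.2
quad = ρ * quad + (1 - ρ) * abs2(dx)          # running mean of dx^2
lin  = ρ * lin + (1 - ρ) * dx                 # running mean of dx (centred only)
dx′  = dx * η / (sqrt(quad - abs2(lin)) + ϵ)  # variance-normalised step
```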
@@ -152,7 +152,7 @@ learning algorithm that depends only on the sign of the gradient.
 # Parameters
 - Learning rate (`η`): Amount by which gradients are discounted before updating
   the weights.
-
+
 - Scaling factors (`ℓ::Tuple`): Multiplicative increase and decrease factors.
 
 - Step sizes (`Γ::Tuple`): Minimal and maximal allowed step sizes.
@@ -168,14 +168,16 @@ Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0)) = Rprop{typeof(η)}(η,
 init(o::Rprop, x::AbstractArray) = (zero(x), onevalue(o.eta, x))
 
 function apply!(o::Rprop, state, x, dx)
-  ℓ, Γ = o.ell, o.gamma
+  T = eltype(x)
+  ℓ = map(T, o.ell)
+  Γ = map(T, o.gamma)
   g, η = state
 
   η = broadcast(g, η, dx) do g, η, dx
     g * dx > 0 ? min(η * ℓ[2], Γ[2]) : g * dx < 0 ? max(η * ℓ[1], Γ[1]) : η
   end
   g = broadcast(g, dx) do g, dx
-    g * dx < 0 ? zero(dx) : dx
+    g * dx < 0 ? zero(T) : T(dx)
   end
   dx′ = @lazy η * sign(g)
 
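The substantive change in this hunk: `ℓ` and `Γ` are converted to `T = eltype(x)` up front, presumably so the per-weight step sizes `η` keep the parameters' element type instead of being promoted by the `Float32` defaults, and `zero(T)`/`T(dx)` pins the sign memory `g` likewise. A minimal illustration of the promotion being avoided (types only, values made up):

```julia
ℓ2  = 1.2f0                 # Float32 default scaling factor from the constructor
η16 = Float16(1e-3)         # a step size matching eltype(x) == Float16
typeof(η16 * ℓ2)            # Float32 -- mixed arithmetic widens the state
typeof(Float16(ℓ2) * η16)   # Float16 -- converting first preserves eltype(x)
```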
@@ -384,7 +386,7 @@ function apply!(o::AdaDelta, state, x, dx)
   # DON'T remove epsilon from numerator or even out of the square roots!
   dx′ = @. dx * sqrt(Δacc + ϵ) / sqrt(acc + ϵ)  # Cannot be lazy as this needs the old Δacc
   @.. Δacc = ρ * Δacc + (1 - ρ) * abs2(dx′)
-
+
   return (acc, Δacc), dx′
 end
 
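Why the shouting comment matters: on the very first step `Δacc == 0`, so without ϵ inside the numerator's square root `dx′` would be exactly zero, `Δacc` would then stay zero forever, and the optimiser would never move; ϵ under both roots bootstraps the recursion. Scalar check (illustrative numbers):

```julia
ρ, ϵ = 0.9, 1e-8
acc, Δacc, dx = 0.0, 0.0, 0.5
acc  = ρ * acc + (1 - ρ) * abs2(dx)         # acc is updated before dx′ in the rule
dx′  = dx * sqrt(Δacc + ϵ) / sqrt(acc + ϵ)  # small but nonzero, thanks to ϵ
Δacc = ρ * Δacc + (1 - ρ) * abs2(dx′)       # now nonzero: the recursion can start
```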
@@ -454,7 +456,7 @@ function apply!(o::NAdam, state, x, dx)
 
   @.. mt = β[1] * mt + (1 - β[1]) * dx
   @.. vt = β[2] * vt + (1 - β[2]) * abs2(dx)
-  dx′ = @lazy (β[1] * mt / (1 - β[1] * βt[1]) + (1 - β[1]) * dx / (1 - βt[1])) /
+  dx′ = @lazy (β[1] * mt / (1 - β[1] * βt[1]) + (1 - β[1]) * dx / (1 - βt[1])) /
               (sqrt(vt * β[2] / (1 - βt[2])) + ϵ) * η
 
   return (mt, vt, βt .* β), dx′
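On the returned state: `βt` carries the running powers `(β[1]^t, β[2]^t)` used in the bias corrections above, and `βt .* β` advances both by one step. Illustrative:

```julia
β  = (0.9, 0.999)
βt = β          # state after step 1
βt = βt .* β    # state after step 2: (0.81, 0.998001)
```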
@@ -508,7 +510,7 @@ function apply!(o::AdaBelief, state, x, dx)
   @.. mt = β[1] * mt + (1 - β[1]) * dx
   @.. st = β[2] * st + (1 - β[2]) * abs2(dx - mt) + ϵ
   dx′ = @lazy η * mt / (1 - βt[1]) / (sqrt(st / (1 - βt[2])) + ϵ)
-
+
   return (mt, st, βt .* β), dx′
 end
 
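AdaBelief's one departure from Adam is the `st` line: it accumulates `abs2(dx - mt)`, the squared deviation of the gradient from its own moving average, rather than `abs2(dx)`, so steps shrink when the gradient disagrees with the recent trend rather than merely when it is large. Scalar sketch (illustrative numbers):

```julia
β, ϵ = (0.9, 0.999), 1e-16
mt, st, dx = 0.1, 0.01, 0.12
mt = β[1] * mt + (1 - β[1]) * dx                 # first moment, as in Adam
st = β[2] * st + (1 - β[2]) * abs2(dx - mt) + ϵ  # variance around mt, not raw dx^2
```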