
Commit 2c702c6

Merge pull request #93 from MilkshakeForReal/RProp
Implement Rprop
2 parents: 746317b + b6a675a

File tree: 3 files changed, +46 -4 lines


src/Optimisers.jl

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ include("destructure.jl")
 export destructure

 include("rules.jl")
-export Descent, Adam, Momentum, Nesterov, RMSProp,
+export Descent, Adam, Momentum, Nesterov, Rprop, RMSProp,
   AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW, RAdam, OAdam, AdaBelief,
   WeightDecay, ClipGrad, ClipNorm, OptimiserChain


src/rules.jl

Lines changed: 43 additions & 1 deletion
@@ -110,6 +110,7 @@ struct RMSProp{T} <: AbstractRule
   epsilon::T
   centred::Bool
 end
+
 RMSProp(η = 1f-3, ρ = 9f-1, ϵ = eps(typeof(η)); centred::Bool = false, centered::Bool = false) =
   RMSProp{typeof(η)}(η, ρ, ϵ, centred | centered)

@@ -135,6 +136,47 @@ function Base.show(io::IO, o::RMSProp)
   print(io, "; centred = ", o.centred, ")")
 end

+"""
+    Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0))
+
+Optimizer using the
+[Rprop](https://ieeexplore.ieee.org/document/298623) algorithm, a full-batch
+learning algorithm that depends only on the sign of the gradient.
+
+# Parameters
+- Learning rate (`η`): Amount by which gradients are discounted before updating
+  the weights.
+
+- Scaling factors (`ℓ::Tuple`): Multiplicative increase and decrease factors.
+
+- Step sizes (`Γ::Tuple`): Minimal and maximal allowed step sizes.
+"""
+struct Rprop{T} <: AbstractRule
+  eta::T
+  ell::Tuple{T,T}
+  gamma::Tuple{T,T}
+end
+
+Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0)) = Rprop{typeof(η)}(η, ℓ, Γ)
+
+init(o::Rprop, x::AbstractArray) = (zero(x), onevalue(o.eta, x))
+
+function apply!(o::Rprop, state, x, dx)
+  ℓ, Γ = o.ell, o.gamma
+  g, η = state
+
+  η = broadcast(g, η, dx) do g, η, dx
+    g * dx > 0 ? min(η * ℓ[2], Γ[2]) : g * dx < 0 ? max(η * ℓ[1], Γ[1]) : η
+  end
+  g = broadcast(g, dx) do g, dx
+    g * dx < 0 ? zero(dx) : dx
+  end
+  dx′ = @lazy η * sign(g)
+
+  return (g, η), dx′
+end
+
 """
     Adam(η = 1f-3, β = (9f-1, 9.99f-1), ϵ = eps(typeof(η)))


@@ -584,4 +626,4 @@ function Base.show(io::IO, c::OptimiserChain)
   print(io, "OptimiserChain(")
   join(io, c.opts, ", ")
   print(io, ")")
-end
+end
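
For context, a minimal sketch of how the newly exported rule plugs into the existing Optimisers.setup / Optimisers.update! API. The model, loss, and data below are made-up placeholders, and Zygote is assumed only as one possible gradient backend:

using Optimisers, Zygote   # Zygote used here purely to supply gradients

# hypothetical toy problem: fit a weight matrix to fixed random targets
w  = randn(Float32, 10, 10)
w′ = randn(Float32, 10, 10)
x  = randn(Float32, 10, 20)
loss(w) = sum(abs2, w * x .- w′ * x)

st = Optimisers.setup(Rprop(), w)   # state holds the previous gradient and per-weight step sizes
for _ in 1:100
    g, = Zygote.gradient(loss, w)
    st, w = Optimisers.update!(st, w, g)
end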

test/rules.jl

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@ Random.seed!(1)

 RULES = [
   # All the rules at default settings:
-  Descent(), Adam(), Momentum(), Nesterov(), RMSProp(),
+  Descent(), Adam(), Momentum(), Nesterov(), Rprop(), RMSProp(),
   AdaGrad(), AdaMax(), AdaDelta(), AMSGrad(), NAdam(),
   AdamW(), RAdam(), OAdam(), AdaBelief(),
   # A few chained combinations:

@@ -39,7 +39,7 @@ end
   @test iloss(rand(10, 10), w, w′) > 1
   st = Optimisers.setup(o, w)
   for t = 1:10^5
-    x = rand(10)
+    x = rand(10, 20)
     gs = loggradient(o)(w -> iloss(x, w, w′), w)
     st, w = Optimisers.update!(st, w, gs...)
   end
