
Commit 05a608b

Merge pull request #1840 from cossio/eps2
move eps to the end
2 parents 6cd2bf7 + fee94d1 commit 05a608b

File tree

1 file changed: +31 −31 lines changed

src/optimise/optimisers.jl

Lines changed: 31 additions & 31 deletions
@@ -3,7 +3,7 @@ using MacroTools: @forward

abstract type AbstractOptimiser end

-const ϵ = 1e-8
+const EPS = 1e-8

# TODO: should use weak refs

@@ -110,7 +110,7 @@ function apply!(o::Nesterov, x, Δ)
end

"""
-    RMSProp(η = 0.001, ρ = 0.9)
+    RMSProp(η = 0.001, ρ = 0.9, ϵ = $EPS)

Optimizer using the
[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)

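For context on what the ϵ surfaced in these signatures (and the EPS constant above) is for: each of these optimisers adds a small ϵ to a denominator so the update stays finite when the running average of squared gradients is near zero. A minimal, hedged sketch of an RMSProp-style step for a single parameter; the values are illustrative and not taken from this commit:

# Hedged sketch: why the optimisers keep a tiny ϵ around.
η, ρ, ϵ = 0.001, 0.9, 1e-8       # ϵ matches the EPS default above
acc = 0.0                        # running average of squared gradients
g   = 0.1                        # one gradient sample
acc  = ρ * acc + (1 - ρ) * g^2
step = η * g / (sqrt(acc) + ϵ)   # without ϵ this division would blow up as acc → 0
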
@@ -136,8 +136,8 @@ mutable struct RMSProp <: AbstractOptimiser
  epsilon::Float64
  acc::IdDict
end
-
-RMSProp(η = 0.001, ρ = 0.9, ϵ = ϵ) = RMSProp(η, ρ, ϵ, IdDict())
+RMSProp(η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = EPS) = RMSProp(η, ρ, ϵ, IdDict())
+RMSProp(η::Real, ρ::Real, acc::IdDict) = RMSProp(η, ρ, EPS, acc)

function apply!(o::RMSProp, x, Δ)
  η, ρ = o.eta, o.rho

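A hedged usage sketch of the two RMSProp constructor methods in this hunk (assuming using Flux; the numeric values are illustrative only):

using Flux

opt = RMSProp()                       # η = 0.001, ρ = 0.9, ϵ = EPS
opt = RMSProp(0.002, 0.95)            # ϵ is now last, so it can simply be omitted
opt = RMSProp(0.002, 0.95, 1e-7)      # or passed explicitly as the final argument
opt = RMSProp(0.002, 0.95, IdDict())  # the IdDict method fills in ϵ = EPS
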
@@ -147,7 +147,7 @@ function apply!(o::RMSProp, x, Δ)
end

"""
-    ADAM(η = 0.001, β::Tuple = (0.9, 0.999))
+    ADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

[ADAM](https://arxiv.org/abs/1412.6980) optimiser.

@@ -170,8 +170,8 @@ mutable struct ADAM <: AbstractOptimiser
  epsilon::Float64
  state::IdDict
end
-
-ADAM(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = ADAM(η, β, ϵ, IdDict())
+ADAM(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = ADAM(η, β, ϵ, IdDict())
+ADAM(η::Real, β::Tuple, state::IdDict) = ADAM(η, β, EPS, state)

function apply!(o::ADAM, x, Δ)
  η, β = o.eta, o.beta

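Similarly for ADAM, a hedged sketch of constructing the optimiser and driving it through Flux.Optimise.update! (the parameter array and gradient here are stand-ins, not part of this commit):

using Flux

opt = ADAM(0.001, (0.9, 0.999))     # ϵ defaults to EPS
x   = rand(Float32, 4)              # a parameter array
Δ   = 2 .* x                        # gradient of sum(abs2, x), as a stand-in
Flux.Optimise.update!(opt, x, Δ)    # subtracts apply!(opt, x, Δ) from x in place
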
@@ -189,7 +189,7 @@ function apply!(o::ADAM, x, Δ)
end

"""
-    RADAM(η = 0.001, β::Tuple = (0.9, 0.999))
+    RADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

[Rectified ADAM](https://arxiv.org/abs/1908.03265) optimizer.

@@ -212,8 +212,8 @@ mutable struct RADAM <: AbstractOptimiser
  epsilon::Float64
  state::IdDict
end
-
-RADAM(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = RADAM(η, β, ϵ, IdDict())
+RADAM(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RADAM(η, β, ϵ, IdDict())
+RADAM(η::Real, β::Tuple, state::IdDict) = RADAM(η, β, EPS, state)

function apply!(o::RADAM, x, Δ)
  η, β = o.eta, o.beta

@@ -239,7 +239,7 @@ function apply!(o::RADAM, x, Δ)
end

"""
-    AdaMax(η = 0.001, β::Tuple = (0.9, 0.999))
+    AdaMax(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

[AdaMax](https://arxiv.org/abs/1412.6980) is a variant of ADAM based on the ∞-norm.

@@ -262,8 +262,8 @@ mutable struct AdaMax <: AbstractOptimiser
  epsilon::Float64
  state::IdDict
end
-
-AdaMax(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = AdaMax(η, β, ϵ, IdDict())
+AdaMax(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = AdaMax(η, β, ϵ, IdDict())
+AdaMax(η::Real, β::Tuple, state::IdDict) = AdaMax(η, β, EPS, state)

function apply!(o::AdaMax, x, Δ)
  η, β = o.eta, o.beta

@@ -281,7 +281,7 @@ function apply!(o::AdaMax, x, Δ)
end

"""
-    OADAM(η = 0.0001, β::Tuple = (0.5, 0.9))
+    OADAM(η = 0.0001, β::Tuple = (0.5, 0.9), ϵ = $EPS)

[OADAM](https://arxiv.org/abs/1711.00141) (Optimistic ADAM)
is a variant of ADAM adding an "optimistic" term suitable for adversarial training.

@@ -305,8 +305,8 @@ mutable struct OADAM <: AbstractOptimiser
  epsilon::Float64
  state::IdDict
end
-
-OADAM(η = 0.001, β = (0.5, 0.9), ϵ = ϵ) = OADAM(η, β, ϵ, IdDict())
+OADAM(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OADAM(η, β, ϵ, IdDict())
+OADAM(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state)

function apply!(o::OADAM, x, Δ)
  η, β = o.eta, o.beta

@@ -326,7 +326,7 @@ function apply!(o::OADAM, x, Δ)
end

"""
-    ADAGrad(η = 0.1)
+    ADAGrad(η = 0.1, ϵ = $EPS)

[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
parameter specific learning rates based on how frequently it is updated.

@@ -348,8 +348,8 @@ mutable struct ADAGrad <: AbstractOptimiser
  epsilon::Float64
  acc::IdDict
end
-
-ADAGrad(η = 0.1, ϵ = ϵ) = ADAGrad(η, ϵ, IdDict())
+ADAGrad(η::Real = 0.1, ϵ::Real = EPS) = ADAGrad(η, ϵ, IdDict())
+ADAGrad(η::Real, state::IdDict) = ADAGrad(η, EPS, state)

function apply!(o::ADAGrad, x, Δ)
  η = o.eta

@@ -359,7 +359,7 @@ function apply!(o::ADAGrad, x, Δ)
end

"""
-    ADADelta(ρ = 0.9)
+    ADADelta(ρ = 0.9, ϵ = $EPS)

[ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning
rate based on a window of past gradient updates.

@@ -380,8 +380,8 @@ mutable struct ADADelta <: AbstractOptimiser
  epsilon::Float64
  state::IdDict
end
-
-ADADelta(ρ = 0.9, ϵ = ϵ) = ADADelta(ρ, ϵ, IdDict())
+ADADelta(ρ::Real = 0.9, ϵ::Real = EPS) = ADADelta(ρ, ϵ, IdDict())
+ADADelta(ρ::Real, state::IdDict) = ADADelta(ρ, EPS, state)

function apply!(o::ADADelta, x, Δ)
  ρ = o.rho

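For the less common ADADelta rule, a hedged scalar sketch of where its two accumulators and ϵ enter (the shape of the update only, not the exact Flux implementation):

ρ, ϵ = 0.9, 1e-8
acc, Δacc = 0.0, 0.0                        # running averages of g² and of step²
g = 0.5                                     # one gradient sample
acc  = ρ * acc + (1 - ρ) * g^2
step = g * sqrt(Δacc + ϵ) / sqrt(acc + ϵ)   # ϵ keeps both roots well-defined
Δacc = ρ * Δacc + (1 - ρ) * step^2
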
@@ -395,7 +395,7 @@ function apply!(o::ADADelta, x, Δ)
end

"""
-    AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999))
+    AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM
optimiser. Parameters don't need tuning.

@@ -419,8 +419,8 @@ mutable struct AMSGrad <: AbstractOptimiser
  epsilon::Float64
  state::IdDict
end
-
-AMSGrad(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = AMSGrad(η, β, ϵ, IdDict())
+AMSGrad(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AMSGrad(η, β, ϵ, IdDict())
+AMSGrad(η::Real, β::Tuple, state::IdDict) = AMSGrad(η, β, EPS, state)

function apply!(o::AMSGrad, x, Δ)
  η, β = o.eta, o.beta

@@ -436,7 +436,7 @@ function apply!(o::AMSGrad, x, Δ)
end

"""
-    NADAM(η = 0.001, β::Tuple = (0.9, 0.999))
+    NADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

[NADAM](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of ADAM.
Parameters don't need tuning.

@@ -460,8 +460,8 @@ mutable struct NADAM <: AbstractOptimiser
  epsilon::Float64
  state::IdDict
end
-
-NADAM(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = NADAM(η, β, ϵ, IdDict())
+NADAM(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NADAM(η, β, ϵ, IdDict())
+NADAM(η::Real, β::Tuple, state::IdDict) = NADAM(η, β, EPS, state)

function apply!(o::NADAM, x, Δ)
  η, β = o.eta, o.beta

@@ -503,7 +503,7 @@ ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
  Optimiser(ADAM(1, β), WeightDecay(decay), Descent(η))

"""
-    AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999))
+    AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known
ADAM optimiser.

@@ -527,8 +527,8 @@ mutable struct AdaBelief
  epsilon::Float64
  state::IdDict
end
-
-AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = AdaBelief(η, β, ϵ, IdDict())
+AdaBelief(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AdaBelief(η, β, ϵ, IdDict())
+AdaBelief(η::Real, β::Tuple, state::IdDict) = AdaBelief(η, β, EPS, state)

function apply!(o::AdaBelief, x, Δ)
  η, β = o.eta, o.beta

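Finally, a hedged sketch of how any of the optimisers touched in this diff is driven through the apply! method whose signatures appear above; the quadratic objective and loop are purely illustrative:

using Flux

opt = AdaBelief()                          # η = 0.001, β = (0.9, 0.999), ϵ = EPS
w   = rand(Float32, 3)                     # a parameter array
for _ in 1:100
    g = 2 .* w                             # gradient of sum(abs2, w)
    w .-= Flux.Optimise.apply!(opt, w, g)  # apply! returns the step to subtract
end
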
