@@ -3,7 +3,7 @@ using MacroTools: @forward
 
 abstract type AbstractOptimiser end
 
-const ϵ = 1e-8
+const EPS = 1e-8
 
 # TODO: should use weak refs
 
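Since `EPS` is an ordinary module-level constant, the `$EPS` that appears in the docstrings below is plain string interpolation inside the `"""..."""` docstring, so the documented default always tracks the constant. A minimal sketch of the pattern, using a hypothetical `scale` function that is not part of this file:

```julia
const EPS = 1e-8

"""
    scale(x, ϵ = $EPS)

Divide `x` by `x + ϵ`. The default shown above is interpolated from `EPS`
when the docstring is built, so it renders as `1.0e-8` in the docs.
"""
scale(x, ϵ = EPS) = x / (x + ϵ)
```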
@@ -110,7 +110,7 @@ function apply!(o::Nesterov, x, Δ)
 end
 
 """
-    RMSProp(η = 0.001, ρ = 0.9)
+    RMSProp(η = 0.001, ρ = 0.9, ϵ = $EPS)
 
 Optimizer using the
 [RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
@@ -136,8 +136,8 @@ mutable struct RMSProp <: AbstractOptimiser
   epsilon::Float64
   acc::IdDict
 end
-
-RMSProp(η = 0.001, ρ = 0.9, ϵ = ϵ) = RMSProp(η, ρ, ϵ, IdDict())
+RMSProp(η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = EPS) = RMSProp(η, ρ, ϵ, IdDict())
+RMSProp(η::Real, ρ::Real, acc::IdDict) = RMSProp(η, ρ, EPS, acc)
 
 function apply!(o::RMSProp, x, Δ)
   η, ρ = o.eta, o.rho
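For orientation, a quick usage sketch of the two constructor methods introduced above, assuming the exported API matches the signatures in this diff (the hyperparameter values are arbitrary examples):

```julia
using Flux

opt = RMSProp()                       # defaults: η = 0.001, ρ = 0.9, ϵ = EPS = 1e-8
opt = RMSProp(0.002, 0.95, 1e-7)      # the third positional argument now sets ϵ
opt = RMSProp(0.002, 0.95, IdDict())  # three-argument (η, ρ, acc) call still dispatches; ϵ falls back to EPS
```

The second method keeps three-argument `(η, ρ, acc)` call sites working; with it, ϵ simply falls back to `EPS`.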
@@ -147,7 +147,7 @@ function apply!(o::RMSProp, x, Δ)
 end
 
 """
-    ADAM(η = 0.001, β::Tuple = (0.9, 0.999))
+    ADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)
 
 [ADAM](https://arxiv.org/abs/1412.6980) optimiser.
 
@@ -170,8 +170,8 @@ mutable struct ADAM <: AbstractOptimiser
   epsilon::Float64
   state::IdDict
 end
-
-ADAM(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = ADAM(η, β, ϵ, IdDict())
+ADAM(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = ADAM(η, β, ϵ, IdDict())
+ADAM(η::Real, β::Tuple, state::IdDict) = ADAM(η, β, EPS, state)
 
 function apply!(o::ADAM, x, Δ)
   η, β = o.eta, o.beta
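The hunk above stops at the first line of `apply!(o::ADAM, x, Δ)`, so as background, here is a minimal sketch of the standard bias-corrected ADAM step and where the field-stored ϵ enters; the function name, signature, and defaults below are illustrative, not this file's implementation:

```julia
# Illustrative only: standard ADAM update, with ϵ guarding the division by √v̂.
function adam_step!(m, v, Δ, t; η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
  @. m = β[1] * m + (1 - β[1]) * Δ                            # first-moment estimate
  @. v = β[2] * v + (1 - β[2]) * Δ^2                          # second-moment estimate
  @. Δ = η * (m / (1 - β[1]^t)) / (√(v / (1 - β[2]^t)) + ϵ)   # ϵ keeps the denominator nonzero
  return Δ
end

m, v = zeros(3), zeros(3)
adam_step!(m, v, [0.1, -0.2, 0.3], 1)   # Δ is rewritten in place with the update
```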
@@ -189,7 +189,7 @@ function apply!(o::ADAM, x, Δ)
 end
 
 """
-    RADAM(η = 0.001, β::Tuple = (0.9, 0.999))
+    RADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)
 
 [Rectified ADAM](https://arxiv.org/abs/1908.03265) optimizer.
 
@@ -212,8 +212,8 @@ mutable struct RADAM <: AbstractOptimiser
   epsilon::Float64
   state::IdDict
 end
-
-RADAM(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = RADAM(η, β, ϵ, IdDict())
+RADAM(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RADAM(η, β, ϵ, IdDict())
+RADAM(η::Real, β::Tuple, state::IdDict) = RADAM(η, β, EPS, state)
 
 function apply!(o::RADAM, x, Δ)
   η, β = o.eta, o.beta
@@ -239,7 +239,7 @@ function apply!(o::RADAM, x, Δ)
 end
 
 """
-    AdaMax(η = 0.001, β::Tuple = (0.9, 0.999))
+    AdaMax(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)
 
 [AdaMax](https://arxiv.org/abs/1412.6980) is a variant of ADAM based on the ∞-norm.
 
@@ -262,8 +262,8 @@ mutable struct AdaMax <: AbstractOptimiser
   epsilon::Float64
   state::IdDict
 end
-
-AdaMax(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = AdaMax(η, β, ϵ, IdDict())
+AdaMax(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = AdaMax(η, β, ϵ, IdDict())
+AdaMax(η::Real, β::Tuple, state::IdDict) = AdaMax(η, β, EPS, state)
 
 function apply!(o::AdaMax, x, Δ)
   η, β = o.eta, o.beta
@@ -281,7 +281,7 @@ function apply!(o::AdaMax, x, Δ)
 end
 
 """
-    OADAM(η = 0.0001, β::Tuple = (0.5, 0.9))
+    OADAM(η = 0.0001, β::Tuple = (0.5, 0.9), ϵ = $EPS)
 
 [OADAM](https://arxiv.org/abs/1711.00141) (Optimistic ADAM)
 is a variant of ADAM adding an "optimistic" term suitable for adversarial training.
@@ -305,8 +305,8 @@ mutable struct OADAM <: AbstractOptimiser
   epsilon::Float64
   state::IdDict
 end
-
-OADAM(η = 0.001, β = (0.5, 0.9), ϵ = ϵ) = OADAM(η, β, ϵ, IdDict())
+OADAM(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OADAM(η, β, ϵ, IdDict())
+OADAM(η::Real, β::Tuple, state::IdDict) = OADAM(η, β, EPS, state)
 
 function apply!(o::OADAM, x, Δ)
   η, β = o.eta, o.beta
@@ -326,7 +326,7 @@ function apply!(o::OADAM, x, Δ)
 end
 
 """
-    ADAGrad(η = 0.1)
+    ADAGrad(η = 0.1, ϵ = $EPS)
 
 [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
 parameter specific learning rates based on how frequently it is updated.
@@ -348,8 +348,8 @@ mutable struct ADAGrad <: AbstractOptimiser
   epsilon::Float64
   acc::IdDict
 end
-
-ADAGrad(η = 0.1, ϵ = ϵ) = ADAGrad(η, ϵ, IdDict())
+ADAGrad(η::Real = 0.1, ϵ::Real = EPS) = ADAGrad(η, ϵ, IdDict())
+ADAGrad(η::Real, state::IdDict) = ADAGrad(η, EPS, state)
 
 function apply!(o::ADAGrad, x, Δ)
   η = o.eta
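For contrast with the ADAM family, a hedged sketch of the ADAGrad rule, where ϵ keeps the accumulated squared-gradient denominator away from zero; the names and defaults are again illustrative, not this file's `apply!`:

```julia
# Illustrative only: ADAGrad accumulates squared gradients to scale each parameter's step.
function adagrad_step!(acc, Δ; η = 0.1, ϵ = 1e-8)
  @. acc += Δ^2             # running sum of squared gradients, one entry per parameter
  @. Δ *= η / (√acc + ϵ)    # per-parameter effective learning rate; ϵ avoids division by zero
  return Δ
end
```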
@@ -359,7 +359,7 @@ function apply!(o::ADAGrad, x, Δ)
 end
 
 """
-    ADADelta(ρ = 0.9)
+    ADADelta(ρ = 0.9, ϵ = $EPS)
 
 [ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning
 rate based on a window of past gradient updates.
@@ -380,8 +380,8 @@ mutable struct ADADelta <: AbstractOptimiser
   epsilon::Float64
   state::IdDict
 end
-
-ADADelta(ρ = 0.9, ϵ = ϵ) = ADADelta(ρ, ϵ, IdDict())
+ADADelta(ρ::Real = 0.9, ϵ::Real = EPS) = ADADelta(ρ, ϵ, IdDict())
+ADADelta(ρ::Real, state::IdDict) = ADADelta(ρ, EPS, state)
 
 function apply!(o::ADADelta, x, Δ)
   ρ = o.rho
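ADADelta is the one optimiser here with no learning rate at all; its step is the ratio of two running RMS terms, so ϵ has to appear in both square roots to keep the ratio defined before the accumulators warm up. A hedged sketch with illustrative names, not this file's `apply!`:

```julia
# Illustrative only: ADADelta rescales the gradient by RMS(previous updates) / RMS(gradients).
function adadelta_step!(acc, Δacc, Δ; ρ = 0.9, ϵ = 1e-8)
  @. acc = ρ * acc + (1 - ρ) * Δ^2       # running average of squared gradients
  @. Δ *= √(Δacc + ϵ) / √(acc + ϵ)       # ϵ in both roots: the numerator also starts at zero
  @. Δacc = ρ * Δacc + (1 - ρ) * Δ^2     # running average of squared (rescaled) updates
  return Δ
end
```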
@@ -395,7 +395,7 @@ function apply!(o::ADADelta, x, Δ)
 end
 
 """
-    AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999))
+    AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)
 
 The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM
 optimiser. Parameters don't need tuning.
@@ -419,8 +419,8 @@ mutable struct AMSGrad <: AbstractOptimiser
   epsilon::Float64
   state::IdDict
 end
-
-AMSGrad(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = AMSGrad(η, β, ϵ, IdDict())
+AMSGrad(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AMSGrad(η, β, ϵ, IdDict())
+AMSGrad(η::Real, β::Tuple, state::IdDict) = AMSGrad(η, β, EPS, state)
 
 function apply!(o::AMSGrad, x, Δ)
   η, β = o.eta, o.beta
@@ -436,7 +436,7 @@ function apply!(o::AMSGrad, x, Δ)
 end
 
 """
-    NADAM(η = 0.001, β::Tuple = (0.9, 0.999))
+    NADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)
 
 [NADAM](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of ADAM.
 Parameters don't need tuning.
@@ -460,8 +460,8 @@ mutable struct NADAM <: AbstractOptimiser
   epsilon::Float64
   state::IdDict
 end
-
-NADAM(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = NADAM(η, β, ϵ, IdDict())
+NADAM(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NADAM(η, β, ϵ, IdDict())
+NADAM(η::Real, β::Tuple, state::IdDict) = NADAM(η, β, EPS, state)
 
 function apply!(o::NADAM, x, Δ)
   η, β = o.eta, o.beta
@@ -503,7 +503,7 @@ ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
   Optimiser(ADAM(1, β), WeightDecay(decay), Descent(η))
 
 """
-    AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999))
+    AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)
 
 The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known
 ADAM optimiser.
@@ -527,8 +527,8 @@ mutable struct AdaBelief
   epsilon::Float64
   state::IdDict
 end
-
-AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = ϵ) = AdaBelief(η, β, ϵ, IdDict())
+AdaBelief(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AdaBelief(η, β, ϵ, IdDict())
+AdaBelief(η::Real, β::Tuple, state::IdDict) = AdaBelief(η, β, EPS, state)
 
 function apply!(o::AdaBelief, x, Δ)
   η, β = o.eta, o.beta
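Finally, a short usage sketch of what the constructors above allow downstream: ϵ is stored on each optimiser instance (the `epsilon::Float64` field), so it can be chosen per instance or adjusted after construction. The values are arbitrary examples, assuming the exported names match this diff:

```julia
using Flux

opt = ADAM(0.001, (0.9, 0.999), 1e-4)   # pick a larger ϵ for this particular run
opt.epsilon                             # ⇒ 0.0001
opt.epsilon = 1e-8                      # mutable struct, so it can also be changed later
```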