@@ -3,7 +3,7 @@ using MacroTools: @forward

abstract type AbstractOptimiser end

-const ϵ = 1e-8
+const EPS = 1e-8

# TODO: should use weak refs

@@ -110,7 +110,7 @@ function apply!(o::Nesterov, x, Δ)
end

"""
-    RMSProp(η = 0.001, ρ = 0.9)
+    RMSProp(η = 0.001, ρ = 0.9, ϵ = $EPS)

Optimizer using the
[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
@@ -133,11 +133,11 @@ opt = RMSProp(0.002, 0.95)
mutable struct RMSProp <: AbstractOptimiser
  eta::Float64
  rho::Float64
-  acc::IdDict
  epsilon::Float64
+  acc::IdDict
end
-
-RMSProp(η = 0.001, ρ = 0.9; ϵ = ϵ) = RMSProp(η, ρ, IdDict(), ϵ)
+RMSProp(η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = EPS) = RMSProp(η, ρ, ϵ, IdDict())
+RMSProp(η::Real, ρ::Real, acc::IdDict) = RMSProp(η, ρ, EPS, acc)

function apply!(o::RMSProp, x, Δ)
  η, ρ = o.eta, o.rho
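
For callers, the change above makes ϵ a third positional argument defaulting to EPS, while the extra method keeps the old (η, ρ, acc) argument order constructible. Illustrative calls only, with arbitrary values not taken from the diff; the same pattern applies to the other optimisers changed below:

opt = RMSProp()                    # η = 0.001, ρ = 0.9, ϵ = EPS
opt = RMSProp(0.002, 0.95)         # custom η and ρ; ϵ falls back to EPS
opt = RMSProp(0.002, 0.95, 1e-7)   # ϵ is now passed positionally rather than as a keyword
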
@@ -147,7 +147,7 @@ function apply!(o::RMSProp, x, Δ)
end

"""
-    ADAM(η = 0.001, β::Tuple = (0.9, 0.999))
+    ADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

[ADAM](https://arxiv.org/abs/1412.6980) optimiser.

@@ -167,11 +167,11 @@ opt = ADAM(0.001, (0.9, 0.8))
mutable struct ADAM <: AbstractOptimiser
  eta::Float64
  beta::Tuple{Float64,Float64}
-  state::IdDict
  epsilon::Float64
+  state::IdDict
end
-
-ADAM(η = 0.001, β = (0.9, 0.999); ϵ = ϵ) = ADAM(η, β, IdDict(), ϵ)
+ADAM(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = ADAM(η, β, ϵ, IdDict())
+ADAM(η::Real, β::Tuple, state::IdDict) = ADAM(η, β, EPS, state)

function apply!(o::ADAM, x, Δ)
  η, β = o.eta, o.beta
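
The hunk cuts off before the body of apply!, but the role of the reordered epsilon field is the usual one in the ADAM update. A self-contained sketch of the standard rule on plain arrays, not this file's apply! (which also keeps per-parameter state in o.state):

# Illustrative ADAM step (assumed standard rule, not code from this diff).
function adam_step!(Δ, mt, vt, t, η, β, ϵ = 1e-8)
  @. mt = β[1] * mt + (1 - β[1]) * Δ        # first-moment estimate
  @. vt = β[2] * vt + (1 - β[2]) * Δ^2      # second-moment estimate
  @. Δ = η * (mt / (1 - β[1]^t)) / (√(vt / (1 - β[2]^t)) + ϵ)  # bias-corrected step; ϵ avoids division by zero
  return Δ
end
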
@@ -189,7 +189,7 @@ function apply!(o::ADAM, x, Δ)
end

"""
-    RADAM(η = 0.001, β::Tuple = (0.9, 0.999))
+    RADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

[Rectified ADAM](https://arxiv.org/abs/1908.03265) optimizer.

@@ -209,11 +209,11 @@ opt = RADAM(0.001, (0.9, 0.8))
mutable struct RADAM <: AbstractOptimiser
  eta::Float64
  beta::Tuple{Float64,Float64}
-  state::IdDict
  epsilon::Float64
+  state::IdDict
end
-
-RADAM(η = 0.001, β = (0.9, 0.999); ϵ = ϵ) = RADAM(η, β, IdDict(), ϵ)
+RADAM(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RADAM(η, β, ϵ, IdDict())
+RADAM(η::Real, β::Tuple, state::IdDict) = RADAM(η, β, EPS, state)

function apply!(o::RADAM, x, Δ)
  η, β = o.eta, o.beta
@@ -239,7 +239,7 @@ function apply!(o::RADAM, x, Δ)
end

"""
-    AdaMax(η = 0.001, β::Tuple = (0.9, 0.999))
+    AdaMax(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

[AdaMax](https://arxiv.org/abs/1412.6980) is a variant of ADAM based on the ∞-norm.

@@ -259,11 +259,11 @@ opt = AdaMax(0.001, (0.9, 0.995))
mutable struct AdaMax <: AbstractOptimiser
  eta::Float64
  beta::Tuple{Float64,Float64}
-  state::IdDict
  epsilon::Float64
+  state::IdDict
end
-
-AdaMax(η = 0.001, β = (0.9, 0.999); ϵ = ϵ) = AdaMax(η, β, IdDict(), ϵ)
+AdaMax(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = AdaMax(η, β, ϵ, IdDict())
+AdaMax(η::Real, β::Tuple, state::IdDict) = AdaMax(η, β, EPS, state)

function apply!(o::AdaMax, x, Δ)
  η, β = o.eta, o.beta
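
The "∞-norm" phrasing in the AdaMax docstring refers to replacing ADAM's second-moment average with a running maximum of gradient magnitudes. A standalone sketch of the standard rule, not code from this diff:

# Illustrative AdaMax step; ut is the exponentially weighted ∞-norm of the gradients.
function adamax_step!(Δ, mt, ut, t, η, β, ϵ = 1e-8)
  @. mt = β[1] * mt + (1 - β[1]) * Δ          # first-moment estimate, as in ADAM
  @. ut = max(β[2] * ut, abs(Δ))              # running max replaces the second moment
  @. Δ = (η / (1 - β[1]^t)) * mt / (ut + ϵ)   # ϵ guards against a zero denominator
  return Δ
end
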
@@ -281,7 +281,7 @@ function apply!(o::AdaMax, x, Δ)
end

"""
-    OADAM(η = 0.0001, β::Tuple = (0.5, 0.9))
+    OADAM(η = 0.0001, β::Tuple = (0.5, 0.9), ϵ = $EPS)

[OADAM](https://arxiv.org/abs/1711.00141) (Optimistic ADAM)
is a variant of ADAM adding an "optimistic" term suitable for adversarial training.
@@ -302,11 +302,11 @@ opt = OADAM(0.001, (0.9, 0.995))
mutable struct OADAM <: AbstractOptimiser
  eta::Float64
  beta::Tuple{Float64,Float64}
-  state::IdDict
  epsilon::Float64
+  state::IdDict
end
-
-OADAM(η = 0.001, β = (0.5, 0.9); ϵ = ϵ) = OADAM(η, β, IdDict(), ϵ)
+OADAM(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OADAM(η, β, ϵ, IdDict())
+OADAM(η::Real, β::Tuple, state::IdDict) = OADAM(η, β, EPS, state)

function apply!(o::OADAM, x, Δ)
  η, β = o.eta, o.beta
@@ -326,7 +326,7 @@ function apply!(o::OADAM, x, Δ)
end

"""
-    ADAGrad(η = 0.1)
+    ADAGrad(η = 0.1, ϵ = $EPS)

[ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
parameter specific learning rates based on how frequently it is updated.
@@ -345,11 +345,11 @@ opt = ADAGrad(0.001)
"""
mutable struct ADAGrad <: AbstractOptimiser
  eta::Float64
-  acc::IdDict
  epsilon::Float64
+  acc::IdDict
end
-
-ADAGrad(η = 0.1; ϵ = ϵ) = ADAGrad(η, IdDict(), ϵ)
+ADAGrad(η::Real = 0.1, ϵ::Real = EPS) = ADAGrad(η, ϵ, IdDict())
+ADAGrad(η::Real, state::IdDict) = ADAGrad(η, EPS, state)

function apply!(o::ADAGrad, x, Δ)
  η = o.eta
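
The "parameter specific learning rates" claim in the ADAGrad docstring comes from its ever-growing squared-gradient accumulator. A standalone sketch of the standard rule, not code from this diff:

# Illustrative ADAGrad step: entries with a large accumulated squared gradient
# receive proportionally smaller effective learning rates.
function adagrad_step!(Δ, acc, η, ϵ = 1e-8)
  @. acc += Δ^2           # accumulator only grows
  @. Δ *= η / (√acc + ϵ)  # per-parameter scaling; ϵ avoids division by zero
  return Δ
end
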
@@ -359,7 +359,7 @@ function apply!(o::ADAGrad, x, Δ)
end

"""
-    ADADelta(ρ = 0.9)
+    ADADelta(ρ = 0.9, ϵ = $EPS)

[ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning
rate based on a window of past gradient updates.
@@ -377,11 +377,11 @@ opt = ADADelta(0.89)
"""
mutable struct ADADelta <: AbstractOptimiser
  rho::Float64
-  state::IdDict
  epsilon::Float64
+  state::IdDict
end
-
-ADADelta(ρ = 0.9; ϵ = ϵ) = ADADelta(ρ, IdDict(), ϵ)
+ADADelta(ρ::Real = 0.9, ϵ::Real = EPS) = ADADelta(ρ, ϵ, IdDict())
+ADADelta(ρ::Real, state::IdDict) = ADADelta(ρ, EPS, state)

function apply!(o::ADADelta, x, Δ)
  ρ = o.rho
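
The "window of past gradient updates" in the ADADelta docstring is what removes the explicit learning rate: steps are scaled by the ratio of the RMS of recent updates to the RMS of recent gradients. A standalone sketch of the standard rule, not code from this diff:

# Illustrative ADADelta step; acc and Δacc are running averages of squared
# gradients and squared updates respectively, so no explicit η appears.
function adadelta_step!(Δ, acc, Δacc, ρ, ϵ = 1e-8)
  @. acc = ρ * acc + (1 - ρ) * Δ^2      # running average of squared gradients
  @. Δ *= √(Δacc + ϵ) / √(acc + ϵ)      # rescale the gradient in place
  @. Δacc = ρ * Δacc + (1 - ρ) * Δ^2    # running average of squared updates
  return Δ
end
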
@@ -395,7 +395,7 @@ function apply!(o::ADADelta, x, Δ)
end

"""
-    AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999))
+    AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM
optimiser. Parameters don't need tuning.
@@ -416,11 +416,11 @@ opt = AMSGrad(0.001, (0.89, 0.995))
mutable struct AMSGrad <: AbstractOptimiser
  eta::Float64
  beta::Tuple{Float64, Float64}
-  state::IdDict
  epsilon::Float64
+  state::IdDict
end
-
-AMSGrad(η = 0.001, β = (0.9, 0.999); ϵ = ϵ) = AMSGrad(η, β, IdDict(), ϵ)
+AMSGrad(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AMSGrad(η, β, ϵ, IdDict())
+AMSGrad(η::Real, β::Tuple, state::IdDict) = AMSGrad(η, β, EPS, state)

function apply!(o::AMSGrad, x, Δ)
  η, β = o.eta, o.beta
@@ -436,7 +436,7 @@ function apply!(o::AMSGrad, x, Δ)
end

"""
-    NADAM(η = 0.001, β::Tuple = (0.9, 0.999))
+    NADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

[NADAM](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of ADAM.
Parameters don't need tuning.
@@ -457,11 +457,11 @@ opt = NADAM(0.002, (0.89, 0.995))
mutable struct NADAM <: AbstractOptimiser
  eta::Float64
  beta::Tuple{Float64, Float64}
-  state::IdDict
  epsilon::Float64
+  state::IdDict
end
-
-NADAM(η = 0.001, β = (0.9, 0.999); ϵ = ϵ) = NADAM(η, β, IdDict(), ϵ)
+NADAM(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NADAM(η, β, ϵ, IdDict())
+NADAM(η::Real, β::Tuple, state::IdDict) = NADAM(η, β, EPS, state)

function apply!(o::NADAM, x, Δ)
  η, β = o.eta, o.beta
@@ -503,7 +503,7 @@ ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
  Optimiser(ADAM(1, β), WeightDecay(decay), Descent(η))

"""
-    AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999))
+    AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known
ADAM optimiser.
@@ -524,11 +524,11 @@ opt = AdaBelief(0.001, (0.9, 0.8))
mutable struct AdaBelief
  eta::Float64
  beta::Tuple{Float64,Float64}
-  state::IdDict
  epsilon::Float64
+  state::IdDict
end
-
-AdaBelief(η = 0.001, β = (0.9, 0.999); ϵ = ϵ) = AdaBelief(η, β, IdDict(), ϵ)
+AdaBelief(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AdaBelief(η, β, ϵ, IdDict())
+AdaBelief(η::Real, β::Tuple, state::IdDict) = AdaBelief(η, β, EPS, state)

function apply!(o::AdaBelief, x, Δ)
  η, β = o.eta, o.beta
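
AdaBelief differs from ADAM by tracking how far each gradient deviates from its running mean, rather than the raw second moment. A standalone sketch of the standard rule, not code from this diff:

# Illustrative AdaBelief step; st measures the spread of the gradient around mt,
# i.e. the "belief" in the current gradient direction.
function adabelief_step!(Δ, mt, st, η, β, ϵ = 1e-8)
  @. mt = β[1] * mt + (1 - β[1]) * Δ
  @. st = β[2] * st + (1 - β[2]) * (Δ - mt)^2
  @. Δ = η * mt / (√st + ϵ)
  return Δ
end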