@@ -133,16 +133,17 @@ opt = RMSProp(0.002, 0.95)
133
133
mutable struct RMSProp <: AbstractOptimiser
    eta::Float64      # step size; scales the rescaled gradient in `apply!`
    rho::Float64      # decay factor of the running average of squared gradients
    epsilon::Float64  # added to the square root in the denominator of the update
    acc::IdDict       # one accumulator array per parameter, keyed by the parameter object
end

# Positional convenience constructor: every argument is optional. The default for `ϵ`
# is whatever `ϵ` is bound to in the enclosing scope; the accumulator cache starts empty.
function RMSProp(η = 0.001, ρ = 0.9, ϵ = ϵ)
    return RMSProp(η, ρ, ϵ, IdDict())
end
140
141
141
142
# Rescale the gradient `Δ` of parameter `x` in place and return it.
# The running average of `Δ * conj(Δ)` stored for `x` is updated as a side effect.
function apply!(o::RMSProp, x, Δ)
    rate, decay = o.eta, o.rho
    # Look up the accumulator for this parameter; it is created as zeros on first use.
    sqavg = get!(o.acc, x) do
        zero(x)
    end::typeof(x)
    sqavg .= decay .* sqavg .+ (1 .- decay) .* Δ .* conj.(Δ)
    # `o.epsilon` keeps the denominator away from zero while the accumulator is small.
    Δ .*= rate ./ (sqrt.(sqavg) .+ o.epsilon)
    return Δ
end
147
148
148
149
"""
@@ -166,10 +167,11 @@ opt = ADAM(0.001, (0.9, 0.8))
166
167
mutable struct ADAM <: AbstractOptimiser
    eta::Float64                  # step size applied to the bias-corrected update
    beta::Tuple{Float64,Float64}  # decay factors for `mt` (first) and `vt` (second)
    epsilon::Float64              # added to the square root in the denominator
    state::IdDict                 # per-parameter state (presumably mt, vt, βp; lookup not shown here)
end

# Positional convenience constructor: every argument is optional. The default for `ϵ`
# is whatever `ϵ` is bound to in the enclosing scope; the state cache starts empty.
function ADAM(η = 0.001, β = (0.9, 0.999), ϵ = ϵ)
    return ADAM(η, β, ϵ, IdDict())
end
173
175
174
176
function apply! (o:: ADAM , x, Δ)
175
177
η, β = o. eta, o. beta
@@ -180,7 +182,7 @@ function apply!(o::ADAM, x, Δ)
180
182
181
183
@. mt = β[1 ] * mt + (1 - β[1 ]) * Δ
182
184
@. vt = β[2 ] * vt + (1 - β[2 ]) * Δ * conj (Δ)
183
- @. Δ = mt / (1 - βp[1 ]) / (√ (vt / (1 - βp[2 ])) + ϵ ) * η
185
+ @. Δ = mt / (1 - βp[1 ]) / (√ (vt / (1 - βp[2 ])) + o . epsilon ) * η
184
186
βp .= βp .* β
185
187
186
188
return Δ
@@ -207,10 +209,11 @@ opt = RADAM(0.001, (0.9, 0.8))
207
209
mutable struct RADAM <: AbstractOptimiser
    eta::Float64                  # step size applied to the update in both branches of `apply!`
    beta::Tuple{Float64,Float64}  # pair of decay factors read as `β` in `apply!`
    epsilon::Float64              # added to the square root in the denominator (ρ > 4 branch)
    state::IdDict                 # per-parameter state (lookup not shown in this excerpt)
end

# Positional convenience constructor: every argument is optional. The default for `ϵ`
# is whatever `ϵ` is bound to in the enclosing scope; the state cache starts empty.
function RADAM(η = 0.001, β = (0.9, 0.999), ϵ = ϵ)
    return RADAM(η, β, ϵ, IdDict())
end
214
217
215
218
function apply! (o:: RADAM , x, Δ)
216
219
η, β = o. eta, o. beta
@@ -225,7 +228,7 @@ function apply!(o::RADAM, x, Δ)
225
228
ρ = ρ∞ - 2 t[] * βp[2 ] / (1 - βp[2 ])
226
229
if ρ > 4
227
230
r = sqrt ((ρ- 4 )* (ρ- 2 )* ρ∞/ ((ρ∞- 4 )* (ρ∞- 2 )* ρ))
228
- @. Δ = mt / (1 - βp[1 ]) / (√ (vt / (1 - βp[2 ])) + ϵ ) * η * r
231
+ @. Δ = mt / (1 - βp[1 ]) / (√ (vt / (1 - βp[2 ])) + o . epsilon ) * η * r
229
232
else
230
233
@. Δ = mt / (1 - βp[1 ]) * η
231
234
end
@@ -256,10 +259,11 @@ opt = AdaMax(0.001, (0.9, 0.995))
256
259
mutable struct AdaMax <: AbstractOptimiser
    eta::Float64                  # step size, divided by the first-moment bias correction
    beta::Tuple{Float64,Float64}  # decay factors for `mt` (first) and `ut` (second)
    epsilon::Float64              # added to `ut` in the denominator of the update
    state::IdDict                 # per-parameter state (lookup not shown in this excerpt)
end

# Positional convenience constructor: every argument is optional. The default for `ϵ`
# is whatever `ϵ` is bound to in the enclosing scope; the state cache starts empty.
function AdaMax(η = 0.001, β = (0.9, 0.999), ϵ = ϵ)
    return AdaMax(η, β, ϵ, IdDict())
end
263
267
264
268
function apply! (o:: AdaMax , x, Δ)
265
269
η, β = o. eta, o. beta
@@ -270,7 +274,7 @@ function apply!(o::AdaMax, x, Δ)
270
274
271
275
@. mt = β[1 ] * mt + (1 - β[1 ]) * Δ
272
276
@. ut = max (β[2 ] * ut, abs (Δ))
273
- @. Δ = (η/ (1 - βp[1 ])) * mt/ (ut + ϵ )
277
+ @. Δ = (η/ (1 - βp[1 ])) * mt/ (ut + o . epsilon )
274
278
βp .= βp .* β
275
279
276
280
return Δ
@@ -298,10 +302,11 @@ opt = OADAM(0.001, (0.9, 0.995))
298
302
mutable struct OADAM <: AbstractOptimiser
    eta::Float64                  # step size applied to the bias-corrected step `Δ_`
    beta::Tuple{Float64,Float64}  # decay factors for `mt` (first) and `vt` (second)
    epsilon::Float64              # added to the square root in the denominator
    state::IdDict                 # per-parameter state (lookup not shown in this excerpt)
end

# Positional convenience constructor: every argument is optional. The default for `ϵ`
# is whatever `ϵ` is bound to in the enclosing scope; the state cache starts empty.
function OADAM(η = 0.001, β = (0.5, 0.9), ϵ = ϵ)
    return OADAM(η, β, ϵ, IdDict())
end
305
310
306
311
function apply! (o:: OADAM , x, Δ)
307
312
η, β = o. eta, o. beta
@@ -313,7 +318,7 @@ function apply!(o::OADAM, x, Δ)
313
318
@. mt = β[1 ] * mt + (1 - β[1 ]) * Δ
314
319
@. vt = β[2 ] * vt + (1 - β[2 ]) * Δ * conj (Δ)
315
320
@. Δ = - Δ_
316
- @. Δ_ = η * mt / (1 - βp[1 ]) / (√ (vt / (1 - βp[2 ])) + ϵ )
321
+ @. Δ_ = η * mt / (1 - βp[1 ]) / (√ (vt / (1 - βp[2 ])) + o . epsilon )
317
322
@. Δ += 2 Δ_
318
323
βp .= βp .* β
319
324
@@ -340,16 +345,17 @@ opt = ADAGrad(0.001)
340
345
"""
341
346
mutable struct ADAGrad <: AbstractOptimiser
    eta::Float64      # step size; scales the rescaled gradient in `apply!`
    epsilon::Float64  # initial fill value of each accumulator and denominator offset
    acc::IdDict       # one accumulator array per parameter, keyed by the parameter object
end

# Positional convenience constructor: every argument is optional. The default for `ϵ`
# is whatever `ϵ` is bound to in the enclosing scope; the accumulator cache starts empty.
function ADAGrad(η = 0.1, ϵ = ϵ)
    return ADAGrad(η, ϵ, IdDict())
end
347
353
348
354
# Rescale the gradient `Δ` of parameter `x` in place and return it.
# The sum of `Δ * conj(Δ)` stored for `x` grows on every call.
function apply!(o::ADAGrad, x, Δ)
    rate = o.eta
    # First use of `x`: start its accumulator at `o.epsilon` everywhere, not at zero.
    sumsq = get!(o.acc, x) do
        fill!(similar(x), o.epsilon)
    end::typeof(x)
    sumsq .+= Δ .* conj.(Δ)
    Δ .*= rate ./ (sqrt.(sumsq) .+ o.epsilon)
    return Δ
end
354
360
355
361
"""
@@ -371,18 +377,19 @@ opt = ADADelta(0.89)
371
377
"""
372
378
mutable struct ADADelta <: AbstractOptimiser
    rho::Float64      # decay factor shared by both running averages in `apply!`
    epsilon::Float64  # added inside both square roots of the update ratio
    state::IdDict     # per-parameter pair `(acc, Δacc)`, keyed by the parameter object
end

# Positional convenience constructor: every argument is optional. The default for `ϵ`
# is whatever `ϵ` is bound to in the enclosing scope; the state cache starts empty.
function ADADelta(ρ = 0.9, ϵ = ϵ)
    return ADADelta(ρ, ϵ, IdDict())
end
378
385
379
386
# Rescale the gradient `Δ` of parameter `x` in place and return it.
# Two running averages are kept per parameter: one of the squared incoming gradients
# and one of the squared rescaled updates.
function apply!(o::ADADelta, x, Δ)
    decay = o.rho
    # Both averages start as zeros the first time `x` is seen.
    gradavg, stepavg = get!(o.state, x) do
        (zero(x), zero(x))
    end::NTuple{2,typeof(x)}
    gradavg .= decay .* gradavg .+ (1 .- decay) .* Δ .* conj.(Δ)
    # DON'T remove epsilon from numerator
    # or even out of the square roots
    Δ .*= sqrt.(stepavg .+ o.epsilon) ./ sqrt.(gradavg .+ o.epsilon)
    # The second average is fed with the already rescaled `Δ`.
    stepavg .= decay .* stepavg .+ (1 .- decay) .* Δ .* conj.(Δ)
    return Δ
end
@@ -409,22 +416,23 @@ opt = AMSGrad(0.001, (0.89, 0.995))
409
416
mutable struct AMSGrad <: AbstractOptimiser
    eta::Float64                   # step size applied to the update
    beta::Tuple{Float64, Float64}  # decay factors for `mt` (first) and `vt` (second)
    epsilon::Float64               # initial fill value of the state arrays and denominator offset
    state::IdDict                  # per-parameter triple `(mt, vt, v̂t)`, keyed by the parameter
end

# Positional convenience constructor: every argument is optional. The default for `ϵ`
# is whatever `ϵ` is bound to in the enclosing scope; the state cache starts empty.
function AMSGrad(η = 0.001, β = (0.9, 0.999), ϵ = ϵ)
    return AMSGrad(η, β, ϵ, IdDict())
end
416
424
417
425
# Rescale the gradient `Δ` of parameter `x` in place and return it.
# Per parameter, three arrays are kept: `mt`, `vt`, and the element-wise running
# maximum `v̂t` of `vt`.
function apply!(o::AMSGrad, x, Δ)
    rate = o.eta
    β1, β2 = o.beta

    # First use of `x`: three independent arrays, each filled with `o.epsilon`.
    fresh() = fill!(similar(x), o.epsilon)
    mt, vt, v̂t = get!(() -> (fresh(), fresh(), fresh()), o.state, x)::NTuple{3,typeof(x)}

    mt .= β1 .* mt .+ (1 .- β1) .* Δ
    vt .= β2 .* vt .+ (1 .- β2) .* Δ .^ 2
    # `v̂t` never decreases, so the denominator below never shrinks between calls.
    v̂t .= max.(v̂t, vt)
    Δ .= rate .* mt ./ (sqrt.(v̂t) .+ o.epsilon)
    return Δ
end
429
437
430
438
"""
@@ -449,10 +457,11 @@ opt = NADAM(0.002, (0.89, 0.995))
449
457
mutable struct NADAM <: AbstractOptimiser
    eta::Float64                   # step size applied to the update
    beta::Tuple{Float64, Float64}  # decay factors for `mt` (first) and `vt` (second)
    epsilon::Float64               # added to the square root in the denominator
    state::IdDict                  # per-parameter state (lookup not shown in this excerpt)
end

# Positional convenience constructor: every argument is optional. The default for `ϵ`
# is whatever `ϵ` is bound to in the enclosing scope; the state cache starts empty.
function NADAM(η = 0.001, β = (0.9, 0.999), ϵ = ϵ)
    return NADAM(η, β, ϵ, IdDict())
end
456
465
457
466
function apply! (o:: NADAM , x, Δ)
458
467
η, β = o. eta, o. beta
@@ -464,7 +473,7 @@ function apply!(o::NADAM, x, Δ)
464
473
465
474
@. mt = β[1 ] * mt + (1 - β[1 ]) * Δ
466
475
@. vt = β[2 ] * vt + (1 - β[2 ]) * Δ * conj (Δ)
467
- @. Δ = (β[1 ] * mt / (1 - β[1 ] * β1p) + (1 - β[1 ]) * Δ / (1 - β1p)) / (√ (vt * β[2 ] / (1 - β2p)) + ϵ ) * η
476
+ @. Δ = (β[1 ] * mt / (1 - β[1 ] * β1p) + (1 - β[1 ]) * Δ / (1 - β1p)) / (√ (vt * β[2 ] / (1 - β2p)) + o . epsilon ) * η
468
477
βp .= βp .* β
469
478
470
479
return Δ
@@ -515,17 +524,18 @@ opt = AdaBelief(0.001, (0.9, 0.8))
515
524
# Declared as an `AbstractOptimiser` like every other optimiser type in this file, so
# that code dispatching on the abstract type also accepts `AdaBelief`.
mutable struct AdaBelief <: AbstractOptimiser
    eta::Float64                  # step size applied to the update
    beta::Tuple{Float64,Float64}  # decay factors for `mt` (first) and `st` (second)
    epsilon::Float64              # added to the square root in the denominator
    state::IdDict                 # per-parameter pair `(mt, st)`, keyed by the parameter object
end

# Positional convenience constructor: every argument is optional. The default for `ϵ`
# is whatever `ϵ` is bound to in the enclosing scope; the state cache starts empty.
function AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = ϵ)
    return AdaBelief(η, β, ϵ, IdDict())
end
522
532
523
533
# Rescale the gradient `Δ` of parameter `x` in place and return it.
# Per parameter, a running average `mt` of the gradient and a running average `st` of
# the squared deviation `(Δ - mt) * conj(Δ - mt)` are kept.
function apply!(o::AdaBelief, x, Δ)
    rate = o.eta
    β1, β2 = o.beta
    # Both averages start as zeros the first time `x` is seen.
    mt, st = get!(o.state, x) do
        (zero(x), zero(x))
    end::NTuple{2,typeof(x)}
    mt .= β1 .* mt .+ (1 .- β1) .* Δ
    # The deviation uses the `mt` that was just updated on the line above.
    st .= β2 .* st .+ (1 .- β2) .* (Δ .- mt) .* conj.(Δ .- mt)
    Δ .= rate .* mt ./ (sqrt.(st) .+ o.epsilon)
    return Δ
end
531
541
0 commit comments