@@ -147,9 +147,9 @@ function apply!(o::RMSProp, x, Δ)
end

"""
-     ADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)
+     Adam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

- [ADAM](https://arxiv.org/abs/1412.6980) optimiser.
+ [Adam](https://arxiv.org/abs/1412.6980) optimiser.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
@@ -159,21 +159,21 @@ end

# Examples
```julia
- opt = ADAM()
+ opt = Adam()

- opt = ADAM(0.001, (0.9, 0.8))
+ opt = Adam(0.001, (0.9, 0.8))
```
"""
- mutable struct ADAM <: AbstractOptimiser
+ mutable struct Adam <: AbstractOptimiser
  eta::Float64
  beta::Tuple{Float64,Float64}
  epsilon::Float64
  state::IdDict{Any, Any}
end
- ADAM(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = ADAM(η, β, ϵ, IdDict())
- ADAM(η::Real, β::Tuple, state::IdDict) = ADAM(η, β, EPS, state)
+ Adam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = Adam(η, β, ϵ, IdDict())
+ Adam(η::Real, β::Tuple, state::IdDict) = Adam(η, β, EPS, state)

- function apply!(o::ADAM, x, Δ)
+ function apply!(o::Adam, x, Δ)
  η, β = o.eta, o.beta

  mt, vt, βp = get!(o.state, x) do
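The hunk cuts off inside `apply!` at the state lookup. For orientation, here is a minimal standalone sketch of the bias-corrected Adam step that the rest of the method computes; `adam_step!` and its keyword defaults are illustrative names, not code from this file.

```julia
# Sketch only: the standard Adam update, written against plain arrays.
# `βp` carries the running powers of β used for bias correction.
function adam_step!(Δ, mt, vt, βp; η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
    @. mt = β[1] * mt + (1 - β[1]) * Δ              # first-moment estimate
    @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)    # second-moment estimate
    @. Δ  = η * mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + ϵ)
    βp .= βp .* β                                   # advance the bias-correction terms
    return Δ
end

g, m, v = randn(3), zeros(3), zeros(3)
adam_step!(g, m, v, [0.9, 0.999])
```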
@@ -189,9 +189,9 @@ function apply!(o::ADAM, x, Δ)
end

"""
-     RADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)
+     RAdam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

- [Rectified ADAM](https://arxiv.org/abs/1908.03265) optimizer.
+ [Rectified Adam](https://arxiv.org/abs/1908.03265) optimizer.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
@@ -201,21 +201,21 @@ end

# Examples
```julia
- opt = RADAM()
+ opt = RAdam()

- opt = RADAM(0.001, (0.9, 0.8))
+ opt = RAdam(0.001, (0.9, 0.8))
```
"""
- mutable struct RADAM <: AbstractOptimiser
+ mutable struct RAdam <: AbstractOptimiser
  eta::Float64
  beta::Tuple{Float64,Float64}
  epsilon::Float64
  state::IdDict{Any, Any}
end
- RADAM(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RADAM(η, β, ϵ, IdDict())
- RADAM(η::Real, β::Tuple, state::IdDict) = RADAM(η, β, EPS, state)
+ RAdam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RAdam(η, β, ϵ, IdDict())
+ RAdam(η::Real, β::Tuple, state::IdDict) = RAdam(η, β, EPS, state)

- function apply!(o::RADAM, x, Δ)
+ function apply!(o::RAdam, x, Δ)
  η, β = o.eta, o.beta
  ρ∞ = 2/(1-β[2])-1

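The rectification that gives RAdam its name happens further down in `apply!`, outside this hunk. As a hedged illustration following the RAdam paper (not this file's exact code), the per-step decision uses the `ρ∞` computed on the line above:

```julia
# Illustrative only: RAdam's rectification factor, per Liu et al. (2019).
# Returns `nothing` when the adaptive step is skipped in favour of plain momentum.
function radam_scale(t, β2, ρ∞)
    ρt = ρ∞ - 2t * β2^t / (1 - β2^t)   # length of the approximated SMA at step t
    ρt > 4 || return nothing           # variance not yet tractable
    sqrt((ρt - 4) * (ρt - 2) * ρ∞ / ((ρ∞ - 4) * (ρ∞ - 2) * ρt))
end

radam_scale(10, 0.999, 2 / (1 - 0.999) - 1)
```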
@@ -241,7 +241,7 @@
"""
    AdaMax(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

- [AdaMax](https://arxiv.org/abs/1412.6980) is a variant of ADAM based on the ∞-norm.
+ [AdaMax](https://arxiv.org/abs/1412.6980) is a variant of Adam based on the ∞-norm.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
@@ -281,10 +281,10 @@ function apply!(o::AdaMax, x, Δ)
end

"""
-     OADAM(η = 0.0001, β::Tuple = (0.5, 0.9), ϵ = $EPS)
+     OAdam(η = 0.0001, β::Tuple = (0.5, 0.9), ϵ = $EPS)

- [OADAM](https://arxiv.org/abs/1711.00141) (Optimistic ADAM)
- is a variant of ADAM adding an "optimistic" term suitable for adversarial training.
+ [OAdam](https://arxiv.org/abs/1711.00141) (Optimistic Adam)
+ is a variant of Adam adding an "optimistic" term suitable for adversarial training.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
@@ -294,21 +294,21 @@ is a variant of ADAM adding an "optimistic" term suitable for adversarial training.

# Examples
```julia
- opt = OADAM()
+ opt = OAdam()

- opt = OADAM(0.001, (0.9, 0.995))
+ opt = OAdam(0.001, (0.9, 0.995))
```
"""
- mutable struct OADAM <: AbstractOptimiser
+ mutable struct OAdam <: AbstractOptimiser
  eta::Float64
  beta::Tuple{Float64,Float64}
  epsilon::Float64
  state::IdDict{Any, Any}
end
- OADAM(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OADAM(η, β, ϵ, IdDict())
- OADAM(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state)
+ OAdam(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OAdam(η, β, ϵ, IdDict())
+ OAdam(η::Real, β::Tuple, state::IdDict) = OAdam(η, β, EPS, state)

- function apply!(o::OADAM, x, Δ)
+ function apply!(o::OAdam, x, Δ)
  η, β = o.eta, o.beta

  mt, vt, Δ_, βp = get!(o.state, x) do
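The extra `Δ_` slot in the state tuple stores the previous update, which is what makes the method "optimistic". A hedged sketch of that idea on raw gradients, deliberately simpler than the Adam-based code in this file:

```julia
# Illustrative only: optimistic gradient step x ← x - η * (2g_t - g_{t-1}).
# `prev` plays the role that Δ_ plays in the state tuple above.
function optimistic_step!(Δ, prev; η = 0.0001)
    step = @. η * (2Δ - prev)   # look ahead by replaying the latest change in gradient
    prev .= Δ                   # remember the current gradient for the next call
    return step
end

prev = zeros(3)
optimistic_step!(randn(3), prev)
optimistic_step!(randn(3), prev)
```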
@@ -326,9 +326,9 @@ function apply!(o::OADAM, x, Δ)
end

"""
-     ADAGrad(η = 0.1, ϵ = $EPS)
+     AdaGrad(η = 0.1, ϵ = $EPS)

- [ADAGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
+ [AdaGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has
parameter specific learning rates based on how frequently it is updated.
Parameters don't need tuning.

@@ -338,20 +338,20 @@ Parameters don't need tuning.

# Examples
```julia
- opt = ADAGrad()
+ opt = AdaGrad()

- opt = ADAGrad(0.001)
+ opt = AdaGrad(0.001)
```
"""
- mutable struct ADAGrad <: AbstractOptimiser
+ mutable struct AdaGrad <: AbstractOptimiser
  eta::Float64
  epsilon::Float64
  acc::IdDict
end
- ADAGrad(η::Real = 0.1, ϵ::Real = EPS) = ADAGrad(η, ϵ, IdDict())
- ADAGrad(η::Real, state::IdDict) = ADAGrad(η, EPS, state)
+ AdaGrad(η::Real = 0.1, ϵ::Real = EPS) = AdaGrad(η, ϵ, IdDict())
+ AdaGrad(η::Real, state::IdDict) = AdaGrad(η, EPS, state)

- function apply!(o::ADAGrad, x, Δ)
+ function apply!(o::AdaGrad, x, Δ)
  η = o.eta
  acc = get!(() -> fill!(similar(x), o.epsilon), o.acc, x)::typeof(x)
  @. acc += Δ * conj(Δ)
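The hunk stops right after the accumulator update. For reference, a self-contained sketch of the complete AdaGrad step (illustrative names, not copied from the file):

```julia
# Sketch only: accumulate squared gradients, then give every coordinate its own
# effective learning rate η / √acc, which shrinks as the accumulator grows.
function adagrad_step!(Δ, acc; η = 0.1, ϵ = 1e-8)
    @. acc += Δ * conj(Δ)       # same accumulator update as in the hunk above
    @. Δ   *= η / (√acc + ϵ)
    return Δ
end

adagrad_step!(randn(3), fill(1e-8, 3))
```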
@@ -361,7 +361,7 @@
"""
    ADADelta(ρ = 0.9, ϵ = $EPS)

- [ADADelta](https://arxiv.org/abs/1212.5701) is a version of ADAGrad adapting its learning
+ [ADADelta](https://arxiv.org/abs/1212.5701) is a version of AdaGrad adapting its learning
rate based on a window of past gradient updates.
Parameters don't need tuning.

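Only ADADelta's docstring is touched here. As a reminder of the rule the unchanged code implements, a hedged sketch following the paper, with illustrative names:

```julia
# Sketch only: ADADelta tracks decaying averages of squared gradients (acc) and of
# squared updates (Δacc); their ratio sets the step size, so no global η is needed.
function adadelta_step!(Δ, acc, Δacc; ρ = 0.9, ϵ = 1e-8)
    @. acc  = ρ * acc + (1 - ρ) * Δ * conj(Δ)
    @. Δ   *= √(Δacc + ϵ) / √(acc + ϵ)
    @. Δacc = ρ * Δacc + (1 - ρ) * Δ * conj(Δ)
    return Δ
end

adadelta_step!(randn(3), zeros(3), zeros(3))
```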
@@ -397,7 +397,7 @@
"""
    AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

- The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the ADAM
+ The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the Adam
optimiser. Parameters don't need tuning.

# Parameters
@@ -436,9 +436,9 @@ function apply!(o::AMSGrad, x, Δ)
end

"""
-     NADAM(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)
+     NAdam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

- [NADAM](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of ADAM.
+ [NAdam](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of Adam.
Parameters don't need tuning.

# Parameters
@@ -449,21 +449,21 @@ Parameters don't need tuning.

# Examples
```julia
- opt = NADAM()
+ opt = NAdam()

- opt = NADAM(0.002, (0.89, 0.995))
+ opt = NAdam(0.002, (0.89, 0.995))
```
"""
- mutable struct NADAM <: AbstractOptimiser
+ mutable struct NAdam <: AbstractOptimiser
  eta::Float64
  beta::Tuple{Float64, Float64}
  epsilon::Float64
  state::IdDict{Any, Any}
end
- NADAM(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NADAM(η, β, ϵ, IdDict())
- NADAM(η::Real, β::Tuple, state::IdDict) = NADAM(η, β, EPS, state)
+ NAdam(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NAdam(η, β, ϵ, IdDict())
+ NAdam(η::Real, β::Tuple, state::IdDict) = NAdam(η, β, EPS, state)

- function apply!(o::NADAM, x, Δ)
+ function apply!(o::NAdam, x, Δ)
  η, β = o.eta, o.beta

  mt, vt, βp = get!(o.state, x) do
@@ -480,9 +480,9 @@ function apply!(o::NADAM, x, Δ)
end

"""
-     ADAMW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0)
+     AdamW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0)

- [ADAMW](https://arxiv.org/abs/1711.05101) is a variant of ADAM fixing (as in repairing) its
+ [AdamW](https://arxiv.org/abs/1711.05101) is a variant of Adam fixing (as in repairing) its
weight decay regularization.

# Parameters
@@ -494,19 +494,19 @@ weight decay regularization.

# Examples
```julia
- opt = ADAMW()
+ opt = AdamW()

- opt = ADAMW(0.001, (0.89, 0.995), 0.1)
+ opt = AdamW(0.001, (0.89, 0.995), 0.1)
```
"""
- ADAMW(η = 0.001, β = (0.9, 0.999), decay = 0) =
-   Optimiser(ADAM(η, β), WeightDecay(decay))
+ AdamW(η = 0.001, β = (0.9, 0.999), decay = 0) =
+   Optimiser(Adam(η, β), WeightDecay(decay))

"""
    AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS)

The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known
- ADAM optimiser.
+ Adam optimiser.

# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
@@ -537,7 +537,7 @@ function apply!(o::AdaBelief, x, Δ)
    (zero(x), zero(x), Float64[β[1], β[2]])
  end :: Tuple{typeof(x), typeof(x), Vector{Float64}}

-   #= st is a variance and can go to zero. This is in contrast to ADAM, which uses the
+   #= st is a variance and can go to zero. This is in contrast to Adam, which uses the
  second moment which is usually far enough from zero. This is problematic, since st
  can be slightly negative due to numerical error, and the square root below will fail.
  Also, if we want to differentiate through the optimizer, √0 is not differentiable.
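The comment explains why `ϵ` must sit inside the square root for AdaBelief. A tiny hedged illustration of the point (not code from the file):

```julia
# st can be exactly zero, or marginally negative after rounding; moving ϵ inside
# the square root keeps the scale finite and differentiable at st = 0.
unsafe_scale(st, ϵ) = √st + ϵ         # DomainError for st < 0, infinite slope at 0
safe_scale(st, ϵ)   = √(st + ϵ) + ϵ   # well-defined for st ≥ -ϵ

safe_scale(0.0, 1e-8)
```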
@@ -643,10 +643,10 @@ for more general scheduling techniques.
`ExpDecay` is typically composed with other optimizers
as the last transformation of the gradient:
```julia
- opt = Optimiser(ADAM(), ExpDecay(1.0))
+ opt = Optimiser(Adam(), ExpDecay(1.0))
```
Note: you may want to start with `η=1` in `ExpDecay` when combined with other
- optimizers (`ADAM` in this case) that have their own learning rate.
+ optimizers (`Adam` in this case) that have their own learning rate.
"""
mutable struct ExpDecay <: AbstractOptimiser
  eta::Float64
@@ -681,7 +681,7 @@ with coefficient ``λ`` to the loss.
# Examples

```julia
- opt = Optimiser(WeightDecay(1f-4), ADAM())
+ opt = Optimiser(WeightDecay(1f-4), Adam())
```
"""
mutable struct WeightDecay <: AbstractOptimiser
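To round off the section, a hedged end-to-end usage sketch of the composed optimiser from the docstring above, assuming the implicit-parameters `Flux.train!` API that these optimisers target; the model, loss, and data are throwaway placeholders:

```julia
using Flux

model = Dense(4 => 1)
loss(x, y) = Flux.Losses.mse(model(x), y)
data = [(rand(Float32, 4, 8), rand(Float32, 1, 8))]

# WeightDecay adds λ .* x to the gradient first, then Adam rescales the result.
opt = Flux.Optimise.Optimiser(WeightDecay(1f-4), Adam())
Flux.train!(loss, Flux.params(model), data, opt)
```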