# Aliases
export sigmoid, hardsigmoid, logsigmoid, thresholdrelu

-
+# of type float
+oftf(x, y) = oftype(float(x), y)

"""
    σ(x) = 1 / (1 + exp(-x))
@@ -33,13 +34,14 @@
const sigmoid = σ

"""
-    hardσ(x, a=0.2) = max(0, min(1, a * x + 0.5))
+    hardσ(x) = max(0, min(1, (x + 3) / 6))

-Segment-wise linear approximation of sigmoid.
-See [BinaryConnect: Training Deep Neural Networks with binary weights during propagations](https://arxiv.org/abs/1511.00363).
+Piecewise linear approximation of sigmoid.
"""
-hardσ(x, a=0.2) = oftype(x/1, max(zero(x/1), min(one(x/1), oftype(x/1, a) * x + oftype(x/1, 0.5))))
-
+hardσ(x) = max(0, min(1, (x + 3) / 6))
+
+# https://pytorch.org/docs/stable/generated/torch.nn.Hardsigmoid.html
+
const hardsigmoid = hardσ

"""
@@ -56,7 +58,7 @@ const logsigmoid = logσ
Segment-wise linear approximation of tanh. Cheaper and more computationally efficient version of tanh.
See [Large Scale Machine Learning](https://ronan.collobert.com/pub/matos/2004_phdthesis_lip6.pdf).
"""
-hardtanh(x) = max(-one(x), min( one(x), x))
+hardtanh(x) = max(-one(x), min(one(x), x))

"""
    relu(x) = max(0, x)
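Not part of the diff (the change above is whitespace only): `hardtanh` simply clamps its input to [-1, 1].

```julia
# Sketch: hardtanh clamps to [-1, 1].
hardtanh(x) = max(-one(x), min(one(x), x))

hardtanh(-2.5), hardtanh(0.3), hardtanh(4)   # (-1.0, 0.3, 1)
```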
@@ -73,7 +75,7 @@ Leaky [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_ne
activation function.
You can also specify the coefficient explicitly, e.g. `leakyrelu(x, 0.01)`.
"""
-leakyrelu(x, a = oftype(x / 1, 0.01)) = max(a * x, x/1)
+leakyrelu(x, a=oftf(x, 0.01)) = max(a * x, x)

"""
    relu6(x) = min(max(0, x), 6)
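A small check, not part of the diff: the `oftf`-based default keeps the slope in the input's float type. Sample inputs are arbitrary.

```julia
# Sketch: leakyrelu with the new oftf-based default slope.
oftf(x, y) = oftype(float(x), y)
leakyrelu(x, a=oftf(x, 0.01)) = max(a * x, x)

leakyrelu(2f0)        # 2.0f0   -- positive inputs pass through
leakyrelu(-1f0)       # -0.01f0 -- scaled by the Float32 slope
leakyrelu(-1.0, 0.2)  # -0.2    -- explicit coefficient
```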
@@ -93,8 +95,8 @@ Randomized Leaky [Rectified Linear Unit](https://arxiv.org/abs/1505.00853)
activation function.
You can also specify the bound explicitly, e.g. `rrelu(x, 0.0, 1.0)`.
"""
-function rrelu(x, l = 1 / 8.0, u = 1 / 3.0)
-    a = oftype(x / 1, (u - l) * rand() + l)
+function rrelu(x::T, l=1//8, u=1//3) where T<:Number
+    a = (u - l) * rand(float(T)) + l
    return leakyrelu(x, a)
end

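Not part of the diff: with the new signature the random slope is drawn in `float(T)`, so for negative inputs the result keeps the input's float type and lies between `u*x` and `l*x`. The `leakyrelu` below is a simplified two-argument stand-in.

```julia
# Sketch: rrelu draws a slope a in [l, u] and falls back to leakyrelu.
leakyrelu(x, a) = max(a * x, x)
function rrelu(x::T, l=1//8, u=1//3) where T<:Number
    a = (u - l) * rand(float(T)) + l
    return leakyrelu(x, a)
end

y = rrelu(-6f0)
y isa Float32         # true -- rand(float(T)) keeps everything in Float32
-2f0 ≤ y ≤ -6f0/8     # true -- slope stays within [1/8, 1/3]
```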
@@ -105,10 +107,9 @@ Exponential Linear Unit activation function.
See [Fast and Accurate Deep Network Learning by Exponential Linear Units](https://arxiv.org/abs/1511.07289).
You can also specify the coefficient explicitly, e.g. `elu(x, 1)`.
"""
-elu(x, α=1) = ifelse(x ≥ 0, x/1, α * (exp(x) - 1))
-
-deriv_elu(x, Ω, α=1) = ifelse(x ≥ 0, one(x), Ω + α)
+elu(x, α=1) = ifelse(x ≥ 0, float(x), α * (exp(x) - 1))

+deriv_elu(Ω, α=1) = ifelse(Ω ≥ 0, 1, Ω + α)

"""
    gelu(x) = 0.5x * (1 + tanh(√(2/π) * (x + 0.044715x^3)))
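Not part of the diff: a quick check that the refactored `deriv_elu(Ω)` reproduces the analytic derivative, which for x < 0 is `α*exp(x) = Ω + α`, so the input `x` is no longer needed.

```julia
# Sketch: deriv_elu now takes only the output Ω.
elu(x, α=1) = ifelse(x ≥ 0, float(x), α * (exp(x) - 1))
deriv_elu(Ω, α=1) = ifelse(Ω ≥ 0, 1, Ω + α)

x = -1.3
Ω = elu(x)
deriv_elu(Ω) ≈ exp(x)   # true: d/dx (exp(x) - 1) = exp(x) = Ω + 1
deriv_elu(elu(2.0))     # 1 for non-negative inputs
```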
@@ -117,11 +118,13 @@ deriv_elu(x, Ω, α=1) = ifelse(x ≥ 0, one(x), Ω + α)
activation function.
"""
function gelu(x)
-    λ = oftype(x / 1, √(2 / π))
-    α = oftype(x / 1, 0.044715)
+    α = oftf(x, 0.044715)
+    λ = oftf(x, gelu_λ)
    x/2 * (1 + tanh(λ * (x + α * x^3)))
end

+const gelu_λ = √(2 / π)
+
"""
    swish(x) = x * σ(x)

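Not part of the diff: the hoisted `gelu_λ = √(2/π)` constant is converted per call with `oftf`, so Float32 inputs give Float32 outputs.

```julia
# Sketch: tanh-approximation GELU with the hoisted constant.
const gelu_λ = √(2 / π)
oftf(x, y) = oftype(float(x), y)
function gelu(x)
    α = oftf(x, 0.044715)
    λ = oftf(x, gelu_λ)
    x/2 * (1 + tanh(λ * (x + α * x^3)))
end

gelu(0.0)               # 0.0
gelu(3f0) isa Float32   # true -- constants are converted, not promoted to Float64
```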
@@ -148,15 +151,18 @@ Scaled exponential linear units.
See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515).
"""
function selu(x)
-    λ = oftype(x / 1, 1.0507009873554804934193349852946)
-    α = oftype(x / 1, 1.6732632423543772848170429916717)
-    λ * ifelse(x > 0, x/1, α * (exp(x) - 1))
+    λ = oftf(x, selu_λ)
+    α = oftf(x, selu_α)
+    λ * ifelse(x > 0, x, α * (exp(x) - 1))
end

+const selu_λ = 1.0507009873554804934193349852946
+const selu_α = 1.6732632423543772848170429916717
+
function deriv_selu(Ω)
-    λ = oftype(Ω / 1, 1.0507009873554804934193349852946)
-    α = oftype(Ω / 1, 1.6732632423543772848170429916717)
-    return ifelse(Ω > 0, λ, Ω + α*λ)
+    λ = oftf(Ω, selu_λ)
+    α = oftf(Ω, selu_α)
+    ifelse(Ω > 0, λ, Ω + α * λ)
end

"""
@@ -165,7 +171,7 @@
Continuously Differentiable Exponential Linear Units
See [Continuously Differentiable Exponential Linear Units](https://arxiv.org/abs/1704.07483).
"""
-celu(x, α=1) = ifelse(x ≥ 0, x / 1, α * (exp(x/α) - 1))
+celu(x, α=1) = ifelse(x ≥ 0, float(x), α * (exp(x/α) - 1))

"""
    trelu(x, theta=1) = x > theta ? x : 0
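Not part of the diff: `celu` divides by α inside the exponential, which keeps it continuously differentiable at 0 for any α; `trelu` gates at the threshold. Example values are arbitrary.

```julia
# Sketch: celu with an explicit α, plus the threshold-gated relu.
celu(x, α=1) = ifelse(x ≥ 0, float(x), α * (exp(x/α) - 1))
trelu(x, theta=1) = ifelse(x > theta, x, zero(x))

celu(-2.0, 2) ≈ 2 * (exp(-1.0) - 1)   # true
trelu(0.9), trelu(1.5)                # (0.0, 1.5)
```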
@@ -174,14 +180,15 @@ Threshold Gated Rectified Linear.
See [ThresholdRelu](https://arxiv.org/abs/1402.3337)
"""
trelu(x, theta=1) = ifelse(x > theta, x, zero(x))
+
const thresholdrelu = trelu

"""
    softsign(x) = x / (1 + |x|)

See [Quadratic Polynomials Learn Better Image Features](http://www.iro.umontreal.ca/~lisa/publications2/index.php/attachments/single/205).
"""
-softsign(x) = x / (one(x) + abs(x))
+softsign(x) = x / (1 + abs(x))

"""
    softplus(x) = log(exp(x) + 1)
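Not part of the diff: `softsign` squashes to (-1, 1) with just an `abs` and a division, and the stable `softplus` form quoted in the hunk header above avoids overflow for large x.

```julia
# Sketch: softsign and the numerically stable softplus shown in the context above.
softsign(x) = x / (1 + abs(x))
softplus(x) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x)))

softsign(-4.0)           # -0.8
softplus(1000.0)         # 1000.0 -- no overflow, unlike log(exp(1000) + 1)
softplus(0.0) ≈ log(2)   # true
```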
@@ -195,8 +202,9 @@ softplus(x) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x)))

Return `log(cosh(x))` which is computed in a numerically stable way.
"""
-logcosh(x) = x + softplus(-2x) - log(oftype(x, 2))
+logcosh(x) = x + softplus(-2x) - oftf(x, log2)

+const log2 = log(2)

"""
    mish(x) = x * tanh(softplus(x))
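Not part of the diff: the identity log(cosh(x)) = x + softplus(-2x) - log(2) keeps `logcosh` finite for large |x|, and the hoisted `log2` constant is now converted once per call with `oftf`.

```julia
# Sketch: stable log(cosh(x)) via softplus, with the hoisted log(2) constant.
const log2 = log(2)   # shadows Base.log2 here, as it does inside the module
oftf(x, y) = oftype(float(x), y)
softplus(x) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x)))
logcosh(x) = x + softplus(-2x) - oftf(x, log2)

logcosh(0.5) ≈ log(cosh(0.5))   # true
logcosh(1000.0)                 # ≈ 999.307 -- log(cosh(1000)) would overflow
```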
@@ -219,7 +227,7 @@ tanhshrink(x) = x - tanh(x)

See [Softshrink Activation Function](https://www.gabormelli.com/RKB/Softshrink_Activation_Function).
"""
-softshrink(x, λ = oftype(x / 1, 0.5)) = min(max(zero(x), x - λ), x + λ)
+softshrink(x, λ=oftf(x, 0.5)) = min(max(0, x - λ), x + λ)

# Provide an informative error message if activation functions are called with an array
for f in ACTIVATIONS
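Not part of the diff: `softshrink` zeroes inputs inside [-λ, λ] and shrinks everything else toward zero by λ.

```julia
# Sketch: soft shrinkage with the default λ = 0.5.
oftf(x, y) = oftype(float(x), y)
softshrink(x, λ=oftf(x, 0.5)) = min(max(0, x - λ), x + λ)

softshrink(1.2), softshrink(0.3), softshrink(-1.2)   # (0.7, 0.0, -0.7)
```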
@@ -241,7 +249,7 @@ UNARY_ACTS = [ # f, df
    (:hardtanh, :(-1 < x < 1)),
    (:selu, :(deriv_selu(Ω))),
    (:σ, :(conj(Ω * (1 - Ω)))),
-    (:elu, :(deriv_elu(x, Ω))),
+    (:elu, :(deriv_elu(Ω))),
]

for (f, df) in UNARY_ACTS
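Not part of the diff: each `(f, df)` pair expresses the derivative in terms of the input `x` and/or the output `Ω`; `conj` is a no-op for real inputs. For example, the σ entry `Ω * (1 - Ω)` agrees with a finite-difference check.

```julia
# Sketch: the tabulated σ derivative Ω * (1 - Ω), checked by central differences.
σ(x) = 1 / (1 + exp(-x))

x = 0.3
Ω = σ(x)
analytic = Ω * (1 - Ω)
numeric = (σ(x + 1e-6) - σ(x - 1e-6)) / 2e-6
isapprox(analytic, numeric; atol=1e-8)   # true
```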
@@ -260,7 +268,7 @@


BINARY_ACTS = [ # f, df1, df2
-    (:elu, :(deriv_elu(x1, Ω, x2)), :(DoesNotExist())), # TODO use real deriv instead of DNE
+    (:elu, :(deriv_elu(Ω, x2)), :(DoesNotExist())), # TODO use real deriv instead of DNE
]

for (f, df1, df2) in BINARY_ACTS