@@ -1,5 +1,5 @@
 """
-    MHAttention(nheads::Integer, qkv_layer, attn_drop, projection)
+    MHAttention(nheads::Integer, qkv_layer, attn_drop_rate, projection)
 
 Multi-head self-attention layer.
 
@@ -34,9 +34,9 @@ function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = fals
                      attn_drop_rate = 0.0, proj_drop_rate = 0.0)
     @assert planes % nheads == 0 "planes should be divisible by nheads"
     qkv_layer = Dense(planes, planes * 3; bias = qkv_bias)
-    attn_drop = Dropout(attn_drop_rate)
+    attn_drop_rate = Dropout(attn_drop_rate)
     proj = Chain(Dense(planes, planes), Dropout(proj_drop_rate))
-    return MHAttention(nheads, qkv_layer, attn_drop, proj)
+    return MHAttention(nheads, qkv_layer, attn_drop_rate, proj)
 end
 
 @functor MHAttention
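For reference, a minimal construction sketch under the renamed field; the module path and the hyperparameter values below are illustrative assumptions, not part of this change.

using Flux, Metalhead

# Illustrative sizes: 64 planes across 8 heads (64 % 8 == 0, so the assert passes).
m = Metalhead.MHAttention(64, 8; qkv_bias = false, attn_drop_rate = 0.1, proj_drop_rate = 0.1)

# After the rename, the field `attn_drop_rate` stores the Dropout layer built from
# the keyword argument of the same name, rather than the raw rate.
m.attn_drop_rate isa Flux.Dropout   # expected: true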
@@ -52,7 +52,7 @@ function (m::MHAttention)(x::AbstractArray{T, 3}) where {T}
                            seq_len * batch_size)
     query_reshaped = reshape(permutedims(query, (1, 2, 3, 4)), nfeatures ÷ m.nheads,
                              m.nheads, seq_len * batch_size)
-    attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale))
+    attention = m.attn_drop_rate(softmax(batched_mul(query_reshaped, key_reshaped) .* scale))
     value_reshaped = reshape(permutedims(value, (1, 2, 3, 4)), nfeatures ÷ m.nheads,
                              m.nheads, seq_len * batch_size)
     pre_projection = reshape(batched_mul(attention, value_reshaped),
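A matching forward-pass sketch, assuming the Flux-style (features, sequence, batch) layout implied by the method above; the sizes and variable names are arbitrary.

using Flux, Metalhead

planes, seq_len, batch_size = 64, 16, 4
m = Metalhead.MHAttention(planes, 8)             # defaults: qkv_bias = false, both drop rates 0.0
x = rand(Float32, planes, seq_len, batch_size)

# The softmaxed attention weights pass through the Dropout stored in the renamed
# attn_drop_rate field; the output keeps the input shape.
y = m(x)
size(y)                                          # expected: (64, 16, 4)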