Merge pull request #169 from theabhirath/vit-hotfix

ToucheSir · web-flow · commit 5bd0bf351ea8 · 2022-06-15T23:09:58.000-07:00
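Hotfix for ViT on GPU: wrap `query` and `value` in an identity `permutedims` before `reshape` in `MHAttention`, and bump the version to 0.7.2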
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "Metalhead"
 uuid = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
-version = "0.7.2-DEV"
+version = "0.7.2"
 
 [deps]
 Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
diff --git a/src/layers/attention.jl b/src/layers/attention.jl
@@ -50,9 +50,9 @@ function (m::MHAttention)(x::AbstractArray{T, 3}) where {T}
     scale = convert(T, sqrt(size(query, 1) / m.nheads))
     key_reshaped = reshape(permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads,
                            seq_len * batch_size)
-    query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size)
+    query_reshaped = reshape(permutedims(query, (1, 2, 3, 4)), nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size)
     attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale))
-    value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size)
+    value_reshaped = reshape(permutedims(value, (1, 2, 3, 4)), nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size)
     pre_projection = reshape(batched_mul(attention, value_reshaped),
                              (nfeatures, seq_len, batch_size))
     y = m.projection(reshape(pre_projection, size(pre_projection, 1), :))