@@ -5,7 +5,7 @@ const AA{N,T} = AbstractArray{T,N}
"""
    dot_product_attention(query, key, value, [bias]; [fdrop, mask, nheads])

- Multihead dot product attention used in transformer architectures.
+ Multihead dot product attention used in transformer architectures.

The input arrays must have the first two dimensions given by the number of features
and the sequence length, then an arbitrary number of batch dimensions or none.
@@ -23,15 +23,15 @@ See also [`dot_product_attention_scores`](@ref) if you only need the attention s
- `value`: Value array of size `(v_dim, kv_len, batch_size...)`.
- `bias`: Either `nothing` or an array broadcastable to size `(kv_len, q_len, nheads, batch_size)`.
  It will be added to the attention scores before applying the softmax. Default `nothing`.
- - `fdrop`: A dropout function or layer to be applied on the attention scores right after the softmax.
-   Default `identity` (no dropout).
+ - `fdrop`: A dropout function or layer to be applied on the attention scores right after the softmax.
+   Default `identity` (no dropout).
- `mask`: Either `nothing` or a boolean array broadcastable to size `(kv_len, q_len, nheads, batch_size)`.
  The mask is applied to the attention scores just before the softmax.
  See [`make_causal_mask`](@ref) for creating causal masks. Default `nothing`.
- `nheads`: Number of heads to split the input arrays into. Default `1`.

# Examples
-
+
```julia
q, k, v = rand(10, 20, 2), rand(10, 30, 2), rand(20, 30, 2)
y, α = dot_product_attention(q, k, v)
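
A hedged sketch extending the example above with the `nheads` and `mask` keywords from the argument list; the sizes are made up, and the expected shapes follow from the size conventions stated in the docstring.

```julia
using NNlib

q, k, v = rand(12, 20, 2), rand(12, 20, 2), rand(12, 20, 2)   # (features, seq_len, batch)
mask = make_causal_mask(q)                                    # 20×20 Bool, m[i, j] == i ≤ j
y, α = dot_product_attention(q, k, v; nheads=4, mask)

size(y)   # (12, 20, 2): (v_dim, q_len, batch_size)
size(α)   # (20, 20, 4, 2): (kv_len, q_len, nheads, batch_size)
```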
@@ -49,13 +49,34 @@ function dot_product_attention(q::AA{N}, k::AA{N}, v::AA{N}, args...; kws...) wh
    return x, α
end

- function dot_product_attention(q::AA3, k::AA3, v::AA3, bias=nothing;
+ function dot_product_attention(q::AA3, k::AA3, v::AA3, bias=nothing;
        fdrop=identity, mask=nothing, nheads=1)

-     (size(q, 3) == size(k, 3) == size(v, 3)) || throw(ArgumentError("Batch dimensions have to be the same."))
-     size(q, 1) == size(k, 1) || throw(ArgumentError("First dimension in query and key has to be the same."))
-     size(k, 2) == size(v, 2) || throw(ArgumentError("Second dimension in key and value has to be the same."))
-
+     (all(size.((q, k, v), 1) .% nheads .== 0)) || throw(ArgumentError("""
+         First dimension in query, key and value must be divisible by `nheads`.
+         Instead:
+         - size(q): $(size(q))
+         - size(k): $(size(k))
+         - size(v): $(size(v))
+         - nheads: $nheads
+         """))
+     (size(q, 3) == size(k, 3) == size(v, 3)) || throw(ArgumentError("""
+         Batch dimensions have to be the same. Instead:
+         - size(q): $(size(q))
+         - size(k): $(size(k))
+         - size(v): $(size(v))
+         """))
+     size(q, 1) == size(k, 1) || throw(ArgumentError("""
+         First dimension in query and key has to be the same. Instead:
+         - size(q): $(size(q))
+         - size(k): $(size(k))
+         """))
+     size(k, 2) == size(v, 2) || throw(ArgumentError("""
+         Second dimension in key and value has to be the same. Instead:
+         - size(k): $(size(k))
+         - size(v): $(size(v))
+         """))
+
    # Multihead attention. TODO create fastpath for singlehead attention.
    q, k, v = split_heads.((q, k, v), nheads)
    x, α = _dot_product_attention(q, k, v, bias, fdrop, mask)
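
The divisibility check above fires before `split_heads` is ever reached; a hedged sketch of how the new error surfaces, with sizes and an `nheads` value made up for illustration.

```julia
using NNlib

q = k = rand(10, 8, 1)    # qk_dim = 10, q_len = kv_len = 8, batch = 1
v = rand(6, 8, 1)         # v_dim = 6

try
    dot_product_attention(q, k, v; nheads=3)    # neither 10 nor 6 is divisible by 3
catch err
    err isa ArgumentError && print(err.msg)     # reports size(q), size(k), size(v) and nheads
end
```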
@@ -69,7 +90,7 @@ function _dot_product_attention(q::AA4, k::AA4, v::AA4, bias, fdrop, mask)

    α = dot_product_attention_scores(q, k, bias; fdrop, mask)
    # [α] = [kv_len, q_len, nheads, batch_size]
-
+
    # The following permutedims and batched_mul are equivalent to
    # @tullio x[d, h, i, b] := α[j, i, h, b] * v[d, h, j, b]
    vt = permutedims(v, (1, 3, 2, 4))
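
The `@tullio` line in the comment is documentation only; here is a plain-Julia check of that contraction, written with loops and one matrix product per head and batch, under made-up sizes and independent of the library's `batched_mul` path.

```julia
# Loop and per-slice matrix-product versions of
# x[d, h, i, b] := α[j, i, h, b] * v[d, h, j, b]  (summed over j); sizes are made up.
d, h, jlen, ilen, b = 4, 2, 5, 3, 1
v = rand(d, h, jlen, b)          # [v] = [d, h, j, b]
α = rand(jlen, ilen, h, b)       # [α] = [j, i, h, b]

x_loop = zeros(d, h, ilen, b)
for bi in 1:b, hi in 1:h, ii in 1:ilen, ji in 1:jlen, di in 1:d
    x_loop[di, hi, ii, bi] += α[ji, ii, hi, bi] * v[di, hi, ji, bi]
end

x_mat = similar(x_loop)
for bi in 1:b, hi in 1:h
    x_mat[:, hi, :, bi] = v[:, hi, :, bi] * α[:, :, hi, bi]   # (d, j) * (j, i) -> (d, i)
end

x_loop ≈ x_mat   # true
```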
@@ -83,12 +104,12 @@
    dot_product_attention_scores(query, key, [bias]; [fdrop, mask])

Return the attention scores for the [`dot_product_attention`](@ref).
- Input arrays must have dimensions
+ Input arrays must have dimensions
`(num_features ÷ nheads, nheads, sequence_length, batch_size)`.

See [`dot_product_attention`](@ref) for more details.
"""
- function dot_product_attention_scores(q::AA4{T}, k::AA4{T}, bias=nothing;
+ function dot_product_attention_scores(q::AA4{T}, k::AA4{T}, bias=nothing;
        fdrop=identity, mask=nothing) where T

    # The following permutedims and batched_mul are equivalent to
@@ -100,7 +121,7 @@ function dot_product_attention_scores(q::AA4{T}, k::AA4{T}, bias=nothing;

    logits = apply_attn_bias(logits, bias)
    logits = apply_attn_mask(logits, mask)
-
+
    α = softmax(logits, dims=1)
    return fdrop(α)
end
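
A hedged usage sketch for `dot_product_attention_scores`: the inputs are already split into heads, with the `(num_features ÷ nheads, nheads, sequence_length, batch_size)` layout stated in the docstring; the sizes here are illustrative.

```julia
using NNlib

q = rand(Float32, 8, 2, 3, 1)    # (qk_dim ÷ nheads, nheads, q_len, batch_size)
k = rand(Float32, 8, 2, 5, 1)    # (qk_dim ÷ nheads, nheads, kv_len, batch_size)

α = dot_product_attention_scores(q, k)
size(α)                          # (5, 3, 2, 1) = (kv_len, q_len, nheads, batch_size)
all(sum(α, dims=1) .≈ 1)         # scores are a softmax over the kv dimension
```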
@@ -109,7 +130,6 @@ apply_attn_bias(logits, bias::Nothing) = logits

apply_attn_bias(logits, bias) = logits .+ bias

-
apply_attn_mask(logits, mask::Nothing) = logits

function apply_attn_mask(logits, mask)
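
The body of `apply_attn_mask` lies outside this hunk; a hedged sketch of the effect it has on the scores (masked positions pushed to `-Inf` so that the subsequent softmax assigns them zero weight; an illustration, not the actual implementation).

```julia
using NNlib: softmax

logits = randn(Float32, 4, 4)
mask = [i <= j for i in 1:4, j in 1:4]             # same pattern as make_causal_mask
masked = ifelse.(mask, logits, typemin(Float32))   # typemin(Float32) == -Inf32
α = softmax(masked, dims=1)
all(α[.!mask] .== 0)                               # masked positions get zero weight
```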
@@ -118,11 +138,11 @@ function apply_attn_mask(logits, mask)
end


- """
+ """
    make_causal_mask(x, dims=2)

Return a boolean square matrix `m` of the same type as `x` and of side `size(x, dims)`.
- Its elements are set such that `m[i, j] == i ≤ j`.
+ Its elements are set such that `m[i, j] == i ≤ j`.

Can be used to mask the attention scores in [`dot_product_attention`](@ref).
"""
@@ -141,4 +161,3 @@ join_heads(x) = reshape(x, :, size(x)[3:end]...)
@non_differentiable make_causal_mask(::Any...)
@non_differentiable trues_like(::Any...)
@non_differentiable falses_like(::Any...)
-