diff --git a/docs/src/darray.md b/docs/src/darray.md index 956436b6..93399486 100644 --- a/docs/src/darray.md +++ b/docs/src/darray.md @@ -684,6 +684,9 @@ From `Base`: From `Random`: - `rand!`/`randn!` +From `SparseArrays`: +- `sprand` + From `Statistics`: - `mean` - `var` @@ -694,7 +697,7 @@ From `LinearAlgebra`: - `*` (Out-of-place Matrix-(Matrix/Vector) multiply) - `mul!` (In-place Matrix-Matrix multiply) - `cholesky`/`cholesky!` (In-place/Out-of-place Cholesky factorization) -- `lu`/`lu!` (In-place/Out-of-place LU factorization (`NoPivot` only)) +- `lu`/`lu!` (In-place/Out-of-place LU factorization (`NoPivot` and `RowMaximum` only)) From `AbstractFFTs`: - `fft`/`fft!` diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl index 6b8c61f9..09554425 100644 --- a/ext/CUDAExt.jl +++ b/ext/CUDAExt.jl @@ -111,6 +111,12 @@ struct AllocateUndef{S} end (::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = CuArray{S,N}(undef, dims) Dagger.allocate_array_func(::CuArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}() +# Indexing +Base.getindex(arr::CuArray, d::Dagger.ArrayDomain) = arr[Dagger.indexes(d)...] + +# Views +Base.view(A::CuArray{T,N}, p::Dagger.Blocks{N}) where {T,N} = Dagger._view(A, p) + # In-place # N.B. These methods assume that later operations will implicitly or # explicitly synchronize with their associated stream diff --git a/ext/IntelExt.jl b/ext/IntelExt.jl index 74253007..afe170df 100644 --- a/ext/IntelExt.jl +++ b/ext/IntelExt.jl @@ -106,6 +106,9 @@ struct AllocateUndef{S} end (::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = oneArray{S,N}(undef, dims) Dagger.allocate_array_func(::oneArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}() +# Indexing +Base.getindex(arr::oneArray, d::Dagger.ArrayDomain) = arr[Dagger.indexes(d)...] + # In-place # N.B. These methods assume that later operations will implicitly or # explicitly synchronize with their associated stream diff --git a/ext/MetalExt.jl b/ext/MetalExt.jl index 50cfc890..9743a57f 100644 --- a/ext/MetalExt.jl +++ b/ext/MetalExt.jl @@ -94,6 +94,9 @@ struct AllocateUndef{S} end (::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = MtlArray{S,N}(undef, dims) Dagger.allocate_array_func(::MtlArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}() +# Indexing +Base.getindex(arr::MtlArray, d::Dagger.ArrayDomain) = arr[Dagger.indexes(d)...] + # In-place # N.B. These methods assume that later operations will implicitly or # explicitly synchronize with their associated stream diff --git a/ext/OpenCLExt.jl b/ext/OpenCLExt.jl index fbf73de7..1d3a8016 100644 --- a/ext/OpenCLExt.jl +++ b/ext/OpenCLExt.jl @@ -107,6 +107,9 @@ struct AllocateUndef{S} end (::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = CLArray{S,N}(undef, dims) Dagger.allocate_array_func(::CLArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}() +# Indexing +Base.getindex(arr::CLArray, d::Dagger.ArrayDomain) = arr[Dagger.indexes(d)...] + # In-place # N.B. These methods assume that later operations will implicitly or # explicitly synchronize with their associated stream diff --git a/ext/ROCExt.jl b/ext/ROCExt.jl index 288c4744..758ccdc0 100644 --- a/ext/ROCExt.jl +++ b/ext/ROCExt.jl @@ -108,6 +108,9 @@ struct AllocateUndef{S} end (::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = ROCArray{S,N}(undef, dims) Dagger.allocate_array_func(::ROCArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}() +# Indexing +Base.getindex(arr::ROCArray, d::Dagger.ArrayDomain) = arr[Dagger.indexes(d)...] + # In-place # N.B. These methods assume that later operations will implicitly or # explicitly synchronize with their associated stream diff --git a/src/Dagger.jl b/src/Dagger.jl index 0c3761c4..68fd0d89 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -7,7 +7,7 @@ import SparseArrays: sprand, SparseMatrixCSC import MemPool import MemPool: DRef, FileRef, poolget, poolset -import Base: collect, reduce +import Base: collect, reduce, view import LinearAlgebra import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, diagind, ishermitian, issymmetric diff --git a/src/array/alloc.jl b/src/array/alloc.jl index 921e1dd0..21186d9a 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -167,7 +167,8 @@ function Base.zero(x::DArray{T,N}) where {T,N} return _to_darray(a) end -function Base.view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N} +Base.view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N} = _view(A, p) +function _view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N} d = ArrayDomain(Base.index_shape(A)) dc = partition(p, d) # N.B. We use `tochunk` because we only want to take the view locally, and diff --git a/src/array/lu.jl b/src/array/lu.jl index 6a0b1468..4c58d9c2 100644 --- a/src/array/lu.jl +++ b/src/array/lu.jl @@ -7,8 +7,9 @@ function LinearAlgebra.lu!(A::DMatrix{T}, ::LinearAlgebra.NoPivot; check::Bool=t mzone = -one(T) Ac = A.chunks mt, nt = size(Ac) - iscomplex = T <: Complex - trans = iscomplex ? 'C' : 'T' + mb, nb = A.partitioning.blocksize + + mb != nb && throw(ArgumentError("Unequal block sizes are not supported: mb = $mb, nb = $nb")) Dagger.spawn_datadeps() do for k in range(1, min(mt, nt)) @@ -29,3 +30,95 @@ function LinearAlgebra.lu!(A::DMatrix{T}, ::LinearAlgebra.NoPivot; check::Bool=t return LinearAlgebra.LU{T,DMatrix{T},DVector{Int}}(A, ipiv, 0) end + +function searchmax_pivot!(piv_idx::AbstractVector{Int}, piv_val::AbstractVector{T}, A::AbstractMatrix{T}, offset::Int=0) where T + max_idx = LinearAlgebra.BLAS.iamax(A[:]) + piv_idx[1] = offset+max_idx + piv_val[1] = A[max_idx] +end + +function update_ipiv!(ipivl::AbstractVector{Int}, piv_idx::AbstractVector{Int}, piv_val::AbstractVector{T}, k::Int, nb::Int) where T + max_piv_idx = LinearAlgebra.BLAS.iamax(piv_val) + max_piv_val = piv_val[max_piv_idx] + abs_max_piv_val = max_piv_val isa Real ? abs(max_piv_val) : abs(real(max_piv_val)) + abs(imag(max_piv_val)) + isapprox(abs_max_piv_val, zero(T); atol=eps(real(T))) && throw(LinearAlgebra.SingularException(k)) + ipivl[1] = (max_piv_idx+k-2)*nb + piv_idx[max_piv_idx] +end + +function swaprows_panel!(A::AbstractMatrix{T}, M::AbstractMatrix{T}, ipivl::AbstractVector{Int}, m::Int, p::Int, nb::Int) where T + q = div(ipivl[1]-1,nb) + 1 + r = (ipivl[1]-1)%nb+1 + if m == q + A[p,:], M[r,:] = M[r,:], A[p,:] + end +end + +function update_panel!(M::AbstractMatrix{T}, A::AbstractMatrix{T}, p::Int) where T + Acinv = one(T) / A[p,p] + LinearAlgebra.BLAS.scal!(Acinv, view(M, :, p)) + LinearAlgebra.BLAS.ger!(-one(T), view(M, :, p), conj.(view(A, p, p+1:size(A,2))), view(M, :, p+1:size(M,2))) +end + +function swaprows_trail!(A::AbstractMatrix{T}, M::AbstractMatrix{T}, ipiv::AbstractVector{Int}, m::Int, nb::Int) where T + for p in eachindex(ipiv) + q = div(ipiv[p]-1,nb) + 1 + r = (ipiv[p]-1)%nb+1 + if m == q + A[p,:], M[r,:] = M[r,:], A[p,:] + end + end +end + +function LinearAlgebra.lu(A::DMatrix{T}, ::LinearAlgebra.RowMaximum; check::Bool=true) where T + A_copy = LinearAlgebra._lucopy(A, LinearAlgebra.lutype(T)) + return LinearAlgebra.lu!(A_copy, LinearAlgebra.RowMaximum(); check=check) +end +function LinearAlgebra.lu!(A::DMatrix{T}, ::LinearAlgebra.RowMaximum; check::Bool=true) where T + zone = one(T) + mzone = -one(T) + + Ac = A.chunks + mt, nt = size(Ac) + m, n = size(A) + mb, nb = A.partitioning.blocksize + + mb != nb && throw(ArgumentError("Unequal block sizes are not supported: mb = $mb, nb = $nb")) + + ipiv = DVector(collect(1:min(m, n)), Blocks(mb)) + ipivc = ipiv.chunks + + max_piv_idx = zeros(Int,mt) + max_piv_val = zeros(T, mt) + + Dagger.spawn_datadeps() do + for k in 1:min(mt, nt) + for p in 1:min(nb, m-(k-1)*nb, n-(k-1)*nb) + Dagger.@spawn searchmax_pivot!(Out(view(max_piv_idx, k:k)), Out(view(max_piv_val, k:k)), In(view(Ac[k,k],p:min(nb,m-(k-1)*nb),p:p)), p-1) + for i in k+1:mt + Dagger.@spawn searchmax_pivot!(Out(view(max_piv_idx, i:i)), Out(view(max_piv_val, i:i)), In(view(Ac[i,k],:,p:p))) + end + Dagger.@spawn update_ipiv!(InOut(view(ipivc[k],p:p)), In(view(max_piv_idx, k:mt)), In(view(max_piv_val, k:mt)), k, nb) + for i in k:mt + Dagger.@spawn swaprows_panel!(InOut(Ac[k, k]), InOut(Ac[i, k]), In(view(ipivc[k],p:p)), i, p, nb) + end + Dagger.@spawn update_panel!(InOut(view(Ac[k,k],p+1:min(nb,m-(k-1)*nb),:)), In(Ac[k,k]), p) + for i in k+1:mt + Dagger.@spawn update_panel!(InOut(Ac[i, k]), In(Ac[k,k]), p) + end + end + for j in Iterators.flatten((1:k-1, k+1:nt)) + for i in k:mt + Dagger.@spawn swaprows_trail!(InOut(Ac[k, j]), InOut(Ac[i, j]), In(ipivc[k]), i, mb) + end + end + for j in k+1:nt + Dagger.@spawn BLAS.trsm!('L', 'L', 'N', 'U', zone, In(Ac[k, k]), InOut(Ac[k, j])) + for i in k+1:mt + Dagger.@spawn BLAS.gemm!('N', 'N', mzone, In(Ac[i, k]), In(Ac[k, j]), zone, InOut(Ac[i, j])) + end + end + end + end + + return LinearAlgebra.LU{T,DMatrix{T},DVector{Int}}(A, ipiv, 0) +end diff --git a/src/array/mul.jl b/src/array/mul.jl index 44e5ab94..5bafbca1 100644 --- a/src/array/mul.jl +++ b/src/array/mul.jl @@ -384,9 +384,7 @@ end m, n = size(A) C = B' - for i = 1:m, j = 1:n - A[i, j] = C[i, j] - end + A[1:m, 1:n] .= view(C, 1:m, 1:n) end @inline function copydiagtile!(A, uplo) @@ -401,7 +399,5 @@ end C[diagind(C)] .= A[diagind(A)] end - for i = 1:m, j = 1:n - A[i, j] = C[i, j] - end + A[1:m, 1:n] .= view(C, 1:m, 1:n) end diff --git a/test/array/linalg/cholesky.jl b/test/array/linalg/cholesky.jl index 95b8fdac..e263d767 100644 --- a/test/array/linalg/cholesky.jl +++ b/test/array/linalg/cholesky.jl @@ -1,49 +1,60 @@ -@testset "$T" for T in (Float32, Float64, ComplexF32, ComplexF64) - D = rand(Blocks(4, 4), T, 32, 32) - if !(T <: Complex) - @test !issymmetric(D) - end - @test !ishermitian(D) +function test_cholesky(AT) + @testset "$T" for T in (Float32, Float64, ComplexF32, ComplexF64) + D = rand(Blocks(4, 4), T, 32, 32) + if !(T <: Complex) + @test !issymmetric(D) + end + @test !ishermitian(D) - A = rand(T, 128, 128) - A = A * A' - A[diagind(A)] .+= size(A, 1) - B = copy(A) - DA = view(A, Blocks(32, 32)) - if !(T <: Complex) - @test issymmetric(DA) - end - @test ishermitian(DA) + A = AT(rand(T, 128, 128)) + A = A * A' + A[diagind(A)] .+= size(A, 1) + B = copy(A) + DA = view(A, Blocks(32, 32)) + if !(T <: Complex) + @test issymmetric(DA) + end + @test ishermitian(DA) - # Out-of-place - chol_A = cholesky(A) - chol_DA = cholesky(DA) - @test chol_DA isa Cholesky - @test chol_A.L ≈ chol_DA.L - @test chol_A.U ≈ chol_DA.U - # Check that cholesky did not modify A or DA - @test A ≈ DA ≈ B + # Out-of-place + chol_A = cholesky(A) + chol_DA = cholesky(DA) + @test chol_DA isa Cholesky + @test chol_A.L ≈ chol_DA.L + @test chol_A.U ≈ chol_DA.U + # Check that cholesky did not modify A or DA + @test A ≈ DA ≈ B - # In-place - A_copy = copy(A) - chol_A = cholesky!(A_copy) - chol_DA = cholesky!(DA) - @test chol_DA isa Cholesky - @test chol_A.L ≈ chol_DA.L - @test chol_A.U ≈ chol_DA.U - # Check that changes propagated to A - @test UpperTriangular(collect(DA)) ≈ UpperTriangular(collect(A)) + # In-place + A_copy = copy(A) + chol_A = cholesky!(A_copy) + chol_DA = cholesky!(DA) + @test chol_DA isa Cholesky + @test chol_A.L ≈ chol_DA.L + @test chol_A.U ≈ chol_DA.U + # Check that changes propagated to A + @test UpperTriangular(collect(DA)) ≈ UpperTriangular(collect(A)) - # Non-PosDef matrix - A = rand(T, 128, 128) - A = A * A' - A[diagind(A)] .+= size(A, 1) - A[1, 1] = -100 - DA = view(A, Blocks(32, 32)) - if !(T <: Complex) - @test issymmetric(DA) + # Non-PosDef matrix + A = AT(rand(T, 128, 128)) + A = A * A' + A[diagind(A)] .+= size(A, 1) + A[1, 1] = -100 + DA = view(A, Blocks(32, 32)) + if !(T <: Complex) + @test issymmetric(DA) + end + @test ishermitian(DA) + @test_broken cholesky(DA).U == 42 # This should throw PosDefException + #@test_throws_unwrap PosDefException cholesky(DA).U end - @test ishermitian(DA) - @test_broken cholesky(DA).U == 42 # This should throw PosDefException - #@test_throws_unwrap PosDefException cholesky(DA).U end + +for (kind, AT, scope) in ALL_SCOPES + kind == :oneAPI || kind == :Metal || kind == :OpenCL && continue + @testset "$kind" begin + Dagger.with_options(;scope) do + test_cholesky(AT) + end + end +end \ No newline at end of file diff --git a/test/array/linalg/core.jl b/test/array/linalg/core.jl index 87c3d3d7..1418de60 100644 --- a/test/array/linalg/core.jl +++ b/test/array/linalg/core.jl @@ -1,11 +1,22 @@ -@testset "isapprox" begin - A = rand(16, 16) +function test_linalg_core(AT) + @testset "isapprox" begin + A = AT(rand(16, 16)) - U1 = UpperTriangular(DArray(A, Blocks(16, 16))) - U2 = UpperTriangular(DArray(A, Blocks(16, 16))) - @test isapprox(U1, U2) + U1 = UpperTriangular(DArray(A, Blocks(16, 16))) + U2 = UpperTriangular(DArray(A, Blocks(16, 16))) + @test isapprox(U1, U2) - L1 = LowerTriangular(DArray(A, Blocks(16, 16))) - L2 = LowerTriangular(DArray(A, Blocks(16, 16))) - @test isapprox(L1, L2) + L1 = LowerTriangular(DArray(A, Blocks(16, 16))) + L2 = LowerTriangular(DArray(A, Blocks(16, 16))) + @test isapprox(L1, L2) + end end + +for (kind, AT, scope) in ALL_SCOPES + kind == :oneAPI || kind == :Metal || kind == :OpenCL && continue + @testset "$kind" begin + Dagger.with_options(;scope) do + test_linalg_core(AT) + end + end +end \ No newline at end of file diff --git a/test/array/linalg/lu.jl b/test/array/linalg/lu.jl index 7f6993a0..d9b613bc 100644 --- a/test/array/linalg/lu.jl +++ b/test/array/linalg/lu.jl @@ -1,33 +1,48 @@ -@testset "$T" for T in (Float32, Float64, ComplexF32, ComplexF64) - A = rand(T, 128, 128) - B = copy(A) - DA = view(A, Blocks(64, 64)) +function test_lu(AT) + @testset "$T with $pivot" for T in (Float32, Float64, ComplexF32, ComplexF64), pivot in (NoPivot(), RowMaximum()) + A = AT(rand(T, 128, 128)) + B = copy(A) + DA = view(A, Blocks(64, 64)) - # Out-of-place - lu_A = lu(A, NoPivot()) - lu_DA = lu(DA, NoPivot()) - @test lu_DA isa LU{T,DMatrix{T},DVector{Int}} - if !(T in (Float32, ComplexF32)) # FIXME: NoPivot is unstable for FP32 - @test lu_A.L ≈ lu_DA.L - @test lu_A.U ≈ lu_DA.U - end - @test lu_A.P ≈ lu_DA.P - @test lu_A.p ≈ lu_DA.p - # Check that lu did not modify A or DA - @test A ≈ DA ≈ B + # Out-of-place + lu_A = lu(A, pivot) + lu_DA = lu(DA, pivot) + @test lu_DA isa LU{T,DMatrix{T},DVector{Int}} + if !(T in (Float32, ComplexF32) && pivot == NoPivot()) # FIXME: NoPivot is unstable for FP32 + @test lu_A.L ≈ lu_DA.L + @test lu_A.U ≈ lu_DA.U + end + @test lu_A.P ≈ lu_DA.P + @test lu_A.p ≈ lu_DA.p + # Check that lu did not modify A or DA + @test A ≈ DA ≈ B + + # In-place + A_copy = copy(A) + lu_A = lu!(A_copy, pivot) + lu_DA = lu!(DA, pivot) + @test lu_DA isa LU{T,DMatrix{T},DVector{Int}} + if !(T in (Float32, ComplexF32) && pivot == NoPivot()) # FIXME: NoPivot is unstable for FP32 + @test lu_A.L ≈ lu_DA.L + @test lu_A.U ≈ lu_DA.U + end + @test lu_A.P ≈ lu_DA.P + @test lu_A.p ≈ lu_DA.p + # Check that changes propagated to A + @test DA ≈ A + @test !(B ≈ A) - # In-place - A_copy = copy(A) - lu_A = lu!(A_copy, NoPivot()) - lu_DA = lu!(DA, NoPivot()) - @test lu_DA isa LU{T,DMatrix{T},DVector{Int}} - if !(T in (Float32, ComplexF32)) # FIXME: NoPivot is unstable for FP32 - @test lu_A.L ≈ lu_DA.L - @test lu_A.U ≈ lu_DA.U + # Non-square block sizes are not supported + @test_throws ArgumentError lu(rand(Blocks(64, 32), T, 128, 128), pivot) + @test_throws ArgumentError lu!(rand(Blocks(64, 32), T, 128, 128), pivot) end - @test lu_A.P ≈ lu_DA.P - @test lu_A.p ≈ lu_DA.p - # Check that changes propagated to A - @test DA ≈ A - @test !(B ≈ A) end + +for (kind, AT, scope) in ALL_SCOPES + kind == :oneAPI || kind == :Metal || kind == :OpenCL && continue + @testset "$kind" begin + Dagger.with_options(;scope) do + test_lu(AT) + end + end +end \ No newline at end of file diff --git a/test/array/linalg/matmul.jl b/test/array/linalg/matmul.jl index e15f7432..6e8a3751 100644 --- a/test/array/linalg/matmul.jl +++ b/test/array/linalg/matmul.jl @@ -17,14 +17,14 @@ end @test collect(x^3) == collect(x*x*x) end -function test_gemm!(T, szA, szB, partA, partB) +function test_gemm!(AT, T, szA, szB, partA, partB) @assert szA[1] == szB[2] szC = (szA[1], szA[1]) @assert partA.blocksize[1] == partB.blocksize[2] partC = Blocks(partA.blocksize[1], partB.blocksize[2]) - A = rand(T, szA...) - B = rand(T, szB...) + A = AT(rand(T, szA...)) + B = AT(rand(T, szB...)) DA = distribute(A, partA) DB = distribute(B, partB) @@ -33,82 +33,82 @@ function test_gemm!(T, szA, szB, partA, partB) # No transA, No transB DC = DA * DB C = A * B - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C if szA == szB # No transA, transB DC = DA * DB' C = A * B' - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C # transA, No transB DC = DA' * DB C = A' * B - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C end # transA, transB DC = DA' * DB' C = A' * B' - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C ## In-place gemm # No transA, No transB - C = zeros(T, szC...) + C = AT(zeros(T, szC...)) DC = distribute(C, partC) mul!(C, A, B) mul!(DC, DA, DB) - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C if szA == szB # No transA, transB - C = zeros(T, szC...) + C = AT(zeros(T, szC...)) DC = distribute(C, partC) mul!(C, A, B') mul!(DC, DA, DB') - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C # transA, No transB - C = zeros(T, szC...) + C = AT(zeros(T, szC...)) DC = distribute(C, partC) mul!(C, A', B) mul!(DC, DA', DB) - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C end # transA, transB - C = zeros(T, szA[2], szA[2]) + C = AT(zeros(T, szA[2], szA[2])) DC = distribute(C, partC) mul!(C, A', B') mul!(DC, DA', DB') - collect(DC) ≈ C + @test AT(collect(DC)) ≈ C if szA == szB ## Out-of-place syrk # No trans, trans DC = DA * DA' C = A * A' - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C # trans, No trans DC = DA' * DA C = A' * A - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C ## In-place syrk # No trans, trans - C = zeros(T, szC...) + C = AT(zeros(T, szC...)) DC = distribute(C, partC) mul!(C, A, A') mul!(DC, DA, DA') - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C # trans, No trans - C = zeros(T, szC...) + C = AT(zeros(T, szC...)) DC = distribute(C, partC) mul!(C, A', A) mul!(DC, DA', DA) - @test collect(DC) ≈ C + @test AT(collect(DC)) ≈ C end end @@ -136,10 +136,19 @@ part_sets_to_test = map(_sizes_to_test) do sz ] end parts_to_test = vcat(part_sets_to_test...) -@testset "Size=$szA*$szB" for (szA, szB) in sizes_to_test - @testset "Partitioning=$partA*$partB" for (partA,partB) in parts_to_test - @testset "T=$T" for T in (Float32, Float64, ComplexF32, ComplexF64) - test_gemm!(T, szA, szB, partA, partB) + +for (kind, AT, scope) in ALL_SCOPES + kind == :oneAPI || kind == :Metal || kind == :OpenCL && continue + @testset "$kind" begin + Dagger.with_options(;scope) do + @testset "Size=$szA*$szB" for (szA, szB) in sizes_to_test + @testset "Partitioning=$partA*$partB" for (partA,partB) in parts_to_test + @testset "T=$T" for T in (Float32, Float64, ComplexF32, ComplexF64) + kind == :ROCm && (T == ComplexF32 || T == ComplexF64) && continue # missing herk! + test_gemm!(AT, T, szA, szB, partA, partB) + end + end + end end end -end +end \ No newline at end of file diff --git a/test/array/stencil.jl b/test/array/stencil.jl index 69b7bb06..e128c196 100644 --- a/test/array/stencil.jl +++ b/test/array/stencil.jl @@ -1,6 +1,6 @@ import Dagger: @stencil, Wrap, Pad -function test_stencil() +function test_stencil(AT) @testset "Simple assignment" begin A = zeros(Blocks(2, 2), Int, 4, 4) Dagger.spawn_datadeps() do @@ -12,7 +12,7 @@ function test_stencil() end @testset "Wrap boundary" begin - A = zeros(Int, 4, 4) + A = AT(zeros(Int, 4, 4)) A[1,1] = 10 A = DArray(A, Blocks(2, 2)) B = zeros(Blocks(2, 2), Int, 4, 4) @@ -84,7 +84,7 @@ function test_stencil() @testset "Multiple DArrays" begin A = ones(Blocks(2, 2), Int, 4, 4) - B = DArray(fill(2, 4, 4), Blocks(2, 2)) + B = DArray(AT(fill(2, 4, 4)), Blocks(2, 2)) C = zeros(Blocks(2, 2), Int, 4, 4) Dagger.spawn_datadeps() do @stencil begin @@ -115,7 +115,7 @@ function test_stencil() # Sum = 5*5 (for the padded values) + 1*4 (for the actual values from A) = 25 + 4 = 29. # This logic applies to all elements in B because the array A is small (2x2) and the neighborhood is 1. # Every element's 3x3 neighborhood will include 5 padded values and the 4 values of A. - expected_B_pad_val = fill(pad_value*5 + 1*4, 2, 2) + expected_B_pad_val = AT(fill(pad_value*5 + 1*4, 2, 2)) @test collect(B) == expected_B_pad_val end @@ -163,18 +163,12 @@ function test_stencil() end end -@testset "CPU" begin - test_stencil() -end - -@testset "GPU" begin - for (kind, scope) in GPU_SCOPES - # FIXME - kind == :oneAPI && continue - @testset "$kind" begin - Dagger.with_options(;scope) do - test_stencil() - end +for (kind, AT, scope) in ALL_SCOPES + # FIXME: These both crash in LLVM + kind == :oneAPI || kind == :OpenCL && continue + @testset "$kind" begin + Dagger.with_options(;scope) do + test_stencil(AT) end end -end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 39860cf2..ac80cf95 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,49 +7,47 @@ USE_METAL = parse(Bool, get(ENV, "CI_USE_METAL", "0")) USE_OPENCL = parse(Bool, get(ENV, "CI_USE_OPENCL", "0")) USE_GPU = USE_CUDA || USE_ROCM || USE_ONEAPI || USE_METAL || USE_OPENCL +# (title, filename, supports_gpu) tests = [ - ("Thunk", "thunk.jl"), - ("Scheduler", "scheduler.jl"), - ("Processors", "processors.jl"), - ("Memory Spaces", "memory-spaces.jl"), - ("Logging", "logging.jl"), - ("Checkpointing", "checkpoint.jl"), - ("Scopes", "scopes.jl"), - ("Options", "options.jl"), - ("Mutation", "mutation.jl"), - ("Task Queues", "task-queues.jl"), - ("Task Affinity", "task-affinity.jl"), - ("Datadeps", "datadeps.jl"), - ("Streaming", "streaming.jl"), - ("Domain Utilities", "domain.jl"), - ("Array - Allocation", "array/allocation.jl"), - ("Array - Indexing", "array/indexing.jl"), - ("Array - Core", "array/core.jl"), - ("Array - Copyto", "array/copyto.jl"), - ("Array - MapReduce", "array/mapreduce.jl"), - ("Array - LinearAlgebra - Core", "array/linalg/core.jl"), - ("Array - LinearAlgebra - Matmul", "array/linalg/matmul.jl"), - ("Array - LinearAlgebra - Cholesky", "array/linalg/cholesky.jl"), - ("Array - LinearAlgebra - LU", "array/linalg/lu.jl"), - ("Array - Random", "array/random.jl"), - ("Array - Stencils", "array/stencil.jl"), - ("Array - FFT", "array/fft.jl"), - ("GPU", "gpu.jl"), - ("Caching", "cache.jl"), - ("Disk Caching", "diskcaching.jl"), - ("File IO", "file-io.jl"), - ("External Languages - Python", "extlang/python.jl"), - ("Preferences", "preferences.jl"), - #("Fault Tolerance", "fault-tolerance.jl"), + ("Thunk", "thunk.jl", false), + ("Scheduler", "scheduler.jl", false), + ("Processors", "processors.jl", false), + ("Memory Spaces", "memory-spaces.jl", false), + ("Logging", "logging.jl", false), + ("Checkpointing", "checkpoint.jl", false), + ("Scopes", "scopes.jl", false), + ("Options", "options.jl", false), + ("Mutation", "mutation.jl", false), + ("Task Queues", "task-queues.jl", false), + ("Task Affinity", "task-affinity.jl", false), + ("Datadeps", "datadeps.jl", false), + ("Streaming", "streaming.jl", false), + ("Domain Utilities", "domain.jl", false), + ("Array - Allocation", "array/allocation.jl", false), + ("Array - Indexing", "array/indexing.jl", false), + ("Array - Core", "array/core.jl", false), + ("Array - Copyto", "array/copyto.jl", false), + ("Array - MapReduce", "array/mapreduce.jl", false), + ("Array - LinearAlgebra - Core", "array/linalg/core.jl", true), + ("Array - LinearAlgebra - Matmul", "array/linalg/matmul.jl", true), + ("Array - LinearAlgebra - Cholesky", "array/linalg/cholesky.jl", true), + ("Array - LinearAlgebra - LU", "array/linalg/lu.jl", true), + ("Array - Random", "array/random.jl", false), + ("Array - Stencils", "array/stencil.jl", true), + ("Array - FFT", "array/fft.jl", false), + ("GPU", "gpu.jl", true), + ("Caching", "cache.jl", false), + ("Disk Caching", "diskcaching.jl", false), + ("File IO", "file-io.jl", false), + ("External Languages - Python", "extlang/python.jl", false), + ("Preferences", "preferences.jl", false), + #("Fault Tolerance", "fault-tolerance.jl", false), ] if USE_GPU # Only run GPU tests - tests = [ - ("GPU", "gpu.jl"), - ("Array - Stencils", "array/stencil.jl"), - ] + filter!(test->test[3], tests) end -all_test_names = map(test -> replace(last(test), ".jl"=>""), tests) +all_test_names = map(test -> replace(test[2], ".jl"=>""), tests) additional_workers::Int = 3 @@ -91,7 +89,7 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__ to_test = String[] for test in parsed_args["test"] if isdir(joinpath(@__DIR__, test)) - for (_, other_test) in tests + for (_, other_test, _) in tests if startswith(other_test, test) push!(to_test, other_test) continue @@ -102,7 +100,7 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__ else println(stderr, "Unknown test: $test") println(stderr, "Available tests:") - for ((test_title, _), test_name) in zip(tests, all_test_names) + for ((test_title, _, _), test_name) in zip(tests, all_test_names) println(stderr, " $test_name: $test_title") end exit(1) @@ -145,10 +143,17 @@ using UUIDs @everywhere MemPool.MEM_RESERVED[] = 0 -GPU_SCOPES = Pair{Symbol, Dagger.AbstractScope}[] +CPU_SCOPES = Tuple{Symbol, Type, Dagger.AbstractScope}[] +GPU_SCOPES = Tuple{Symbol, Type, Dagger.AbstractScope}[] if USE_GPU include("setup_gpu.jl") +else + push!(CPU_SCOPES, (:OneWorker_SingleThreaded, Array, Dagger.scope(;worker=1, thread=1))) + push!(CPU_SCOPES, (:OneWorker_MultiThreaded, Array, Dagger.scope(;worker=1, threads=:))) + push!(CPU_SCOPES, (:MultiWorker_SingleThreaded, Array, Dagger.scope(;workers=:, thread=1))) + push!(CPU_SCOPES, (:MultiWorker_MultiThreaded, Array, Dagger.scope(;workers=:, threads=:))) end +ALL_SCOPES = vcat(CPU_SCOPES, GPU_SCOPES) try for test in to_test diff --git a/test/setup_gpu.jl b/test/setup_gpu.jl index 93116185..cc060ec5 100644 --- a/test/setup_gpu.jl +++ b/test/setup_gpu.jl @@ -58,32 +58,32 @@ end end if USE_CUDA - push!(GPU_SCOPES, :CUDA => Dagger.scope(;worker=1, cuda_gpu=1)) + push!(GPU_SCOPES, (:CUDA, CuArray, Dagger.scope(;worker=1, cuda_gpu=1))) if length(CUDA.devices()) > 1 - push!(GPU_SCOPES, :CUDA => Dagger.scope(;worker=1, cuda_gpu=2)) + push!(GPU_SCOPES, (:CUDA, CuArray, Dagger.scope(;worker=1, cuda_gpu=2))) end end if USE_ROCM - push!(GPU_SCOPES, :ROCm => Dagger.scope(;worker=1, rocm_gpu=1)) + push!(GPU_SCOPES, (:ROCm, ROCArray, Dagger.scope(;worker=1, rocm_gpu=1))) if length(AMDGPU.devices()) > 1 - push!(GPU_SCOPES, :ROCm => Dagger.scope(;worker=1, rocm_gpu=2)) + push!(GPU_SCOPES, (:ROCm, ROCmrray, Dagger.scope(;worker=1, rocm_gpu=2))) end end if USE_ONEAPI - push!(GPU_SCOPES, :oneAPI => Dagger.scope(;worker=1, intel_gpu=1)) + push!(GPU_SCOPES, (:oneAPI, oneArray, Dagger.scope(;worker=1, intel_gpu=1))) if length(oneAPI.devices()) > 1 - push!(GPU_SCOPES, :oneAPI => Dagger.scope(;worker=1, intel_gpu=2)) + push!(GPU_SCOPES, (:oneAPI, oneArray, Dagger.scope(;worker=1, intel_gpu=2))) end end if USE_METAL - push!(GPU_SCOPES, :Metal => Dagger.scope(;worker=1, metal_gpu=1)) + push!(GPU_SCOPES, (:Metal, MtlArray, Dagger.scope(;worker=1, metal_gpu=1))) if length(Metal.devices()) > 1 - push!(GPU_SCOPES, :Metal => Dagger.scope(;worker=1, metal_gpu=2)) + push!(GPU_SCOPES, (:Metal, MtlArray, Dagger.scope(;worker=1, metal_gpu=2))) end end if USE_OPENCL - push!(GPU_SCOPES, :OpenCL => Dagger.scope(;worker=1, cl_device=1)) + push!(GPU_SCOPES, (:OpenCL, CLArray, Dagger.scope(;worker=1, cl_device=1))) if length(cl.devices(cl.default_platform())) > 1 - push!(GPU_SCOPES, :OpenCL => Dagger.scope(;worker=1, cl_device=2)) + push!(GPU_SCOPES, (:OpenCL, CLArray, Dagger.scope(;worker=1, cl_device=2))) end end