diff --git a/docs/src/darray.md b/docs/src/darray.md
index 956436b6..93399486 100644
--- a/docs/src/darray.md
+++ b/docs/src/darray.md
@@ -684,6 +684,9 @@ From `Base`:
 From `Random`:
 - `rand!`/`randn!`
 
+From `SparseArrays`:
+- `sprand`
+
 From `Statistics`:
 - `mean`
 - `var`
@@ -694,7 +697,7 @@ From `LinearAlgebra`:
 - `*` (Out-of-place Matrix-(Matrix/Vector) multiply)
 - `mul!` (In-place Matrix-Matrix multiply)
 - `cholesky`/`cholesky!` (In-place/Out-of-place Cholesky factorization)
-- `lu`/`lu!` (In-place/Out-of-place LU factorization (`NoPivot` only))
+- `lu`/`lu!` (In-place/Out-of-place LU factorization (`NoPivot` and `RowMaximum` only))
 
 From `AbstractFFTs`:
 - `fft`/`fft!`
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
index 6b8c61f9..09554425 100644
--- a/ext/CUDAExt.jl
+++ b/ext/CUDAExt.jl
@@ -111,6 +111,12 @@ struct AllocateUndef{S} end
 (::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = CuArray{S,N}(undef, dims)
 Dagger.allocate_array_func(::CuArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}()
 
+# Indexing
+Base.getindex(arr::CuArray, d::Dagger.ArrayDomain) = arr[Dagger.indexes(d)...]
+
+# Views
+Base.view(A::CuArray{T,N}, p::Dagger.Blocks{N}) where {T,N} = Dagger._view(A, p)
+
 # In-place
 # N.B. These methods assume that later operations will implicitly or
 # explicitly synchronize with their associated stream
diff --git a/ext/IntelExt.jl b/ext/IntelExt.jl
index 74253007..afe170df 100644
--- a/ext/IntelExt.jl
+++ b/ext/IntelExt.jl
@@ -106,6 +106,9 @@ struct AllocateUndef{S} end
 (::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = oneArray{S,N}(undef, dims)
 Dagger.allocate_array_func(::oneArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}()
 
+# Indexing
+Base.getindex(arr::oneArray, d::Dagger.ArrayDomain) = arr[Dagger.indexes(d)...]
+
 # In-place
 # N.B. These methods assume that later operations will implicitly or
 # explicitly synchronize with their associated stream
diff --git a/ext/MetalExt.jl b/ext/MetalExt.jl
index 50cfc890..9743a57f 100644
--- a/ext/MetalExt.jl
+++ b/ext/MetalExt.jl
@@ -94,6 +94,9 @@ struct AllocateUndef{S} end
 (::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = MtlArray{S,N}(undef, dims)
 Dagger.allocate_array_func(::MtlArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}()
 
+# Indexing
+Base.getindex(arr::MtlArray, d::Dagger.ArrayDomain) = arr[Dagger.indexes(d)...]
+
 # In-place
 # N.B. These methods assume that later operations will implicitly or
 # explicitly synchronize with their associated stream
diff --git a/ext/OpenCLExt.jl b/ext/OpenCLExt.jl
index fbf73de7..1d3a8016 100644
--- a/ext/OpenCLExt.jl
+++ b/ext/OpenCLExt.jl
@@ -107,6 +107,9 @@ struct AllocateUndef{S} end
 (::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = CLArray{S,N}(undef, dims)
 Dagger.allocate_array_func(::CLArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}()
 
+# Indexing
+Base.getindex(arr::CLArray, d::Dagger.ArrayDomain) = arr[Dagger.indexes(d)...]
+
 # In-place
 # N.B. These methods assume that later operations will implicitly or
 # explicitly synchronize with their associated stream
diff --git a/ext/ROCExt.jl b/ext/ROCExt.jl
index 288c4744..758ccdc0 100644
--- a/ext/ROCExt.jl
+++ b/ext/ROCExt.jl
@@ -108,6 +108,9 @@ struct AllocateUndef{S} end
 (::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = ROCArray{S,N}(undef, dims)
 Dagger.allocate_array_func(::ROCArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}()
 
+# Indexing
+Base.getindex(arr::ROCArray, d::Dagger.ArrayDomain) = arr[Dagger.indexes(d)...]
+
 # In-place
 # N.B. These methods assume that later operations will implicitly or
 # explicitly synchronize with their associated stream
diff --git a/src/Dagger.jl b/src/Dagger.jl
index 0c3761c4..68fd0d89 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -7,7 +7,7 @@ import SparseArrays: sprand, SparseMatrixCSC
 import MemPool
 import MemPool: DRef, FileRef, poolget, poolset
 
-import Base: collect, reduce
+import Base: collect, reduce, view
 
 import LinearAlgebra
 import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, diagind, ishermitian, issymmetric
diff --git a/src/array/alloc.jl b/src/array/alloc.jl
index 921e1dd0..21186d9a 100644
--- a/src/array/alloc.jl
+++ b/src/array/alloc.jl
@@ -167,7 +167,8 @@ function Base.zero(x::DArray{T,N}) where {T,N}
     return _to_darray(a)
 end
 
-function Base.view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N}
+Base.view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N} = _view(A, p)
+function _view(A::AbstractArray{T,N}, p::Blocks{N}) where {T,N}
     d = ArrayDomain(Base.index_shape(A))
     dc = partition(p, d)
     # N.B. We use `tochunk` because we only want to take the view locally, and
diff --git a/src/array/lu.jl b/src/array/lu.jl
index 6a0b1468..4c58d9c2 100644
--- a/src/array/lu.jl
+++ b/src/array/lu.jl
@@ -7,8 +7,9 @@ function LinearAlgebra.lu!(A::DMatrix{T}, ::LinearAlgebra.NoPivot; check::Bool=t
     mzone = -one(T)
     Ac = A.chunks
     mt, nt = size(Ac)
-    iscomplex = T <: Complex
-    trans = iscomplex ? 'C' : 'T'
+    mb, nb = A.partitioning.blocksize
+
+    mb != nb && throw(ArgumentError("Unequal block sizes are not supported: mb = $mb, nb = $nb"))
 
     Dagger.spawn_datadeps() do
         for k in range(1, min(mt, nt))
@@ -29,3 +30,95 @@ function LinearAlgebra.lu!(A::DMatrix{T}, ::LinearAlgebra.NoPivot; check::Bool=t
 
     return LinearAlgebra.LU{T,DMatrix{T},DVector{Int}}(A, ipiv, 0)
 end
+
+function searchmax_pivot!(piv_idx::AbstractVector{Int}, piv_val::AbstractVector{T}, A::AbstractMatrix{T}, offset::Int=0) where T
+    max_idx = LinearAlgebra.BLAS.iamax(A[:])
+    piv_idx[1] = offset+max_idx
+    piv_val[1] = A[max_idx]
+end
+
+function update_ipiv!(ipivl::AbstractVector{Int}, piv_idx::AbstractVector{Int}, piv_val::AbstractVector{T}, k::Int, nb::Int) where T
+    max_piv_idx = LinearAlgebra.BLAS.iamax(piv_val)
+    max_piv_val = piv_val[max_piv_idx]
+    abs_max_piv_val = max_piv_val isa Real ? abs(max_piv_val) : abs(real(max_piv_val)) + abs(imag(max_piv_val))
+    isapprox(abs_max_piv_val, zero(T); atol=eps(real(T))) && throw(LinearAlgebra.SingularException(k))
+    ipivl[1] = (max_piv_idx+k-2)*nb + piv_idx[max_piv_idx]
+end
+
+function swaprows_panel!(A::AbstractMatrix{T}, M::AbstractMatrix{T}, ipivl::AbstractVector{Int}, m::Int, p::Int, nb::Int) where T
+    q = div(ipivl[1]-1,nb) + 1
+    r = (ipivl[1]-1)%nb+1
+    if m == q
+        A[p,:], M[r,:] = M[r,:], A[p,:]
+    end
+end
+
+function update_panel!(M::AbstractMatrix{T}, A::AbstractMatrix{T}, p::Int) where T
+    Acinv = one(T) / A[p,p]
+    LinearAlgebra.BLAS.scal!(Acinv, view(M, :, p))
+    LinearAlgebra.BLAS.ger!(-one(T), view(M, :, p), conj.(view(A, p, p+1:size(A,2))), view(M, :, p+1:size(M,2)))
+end
+
+function swaprows_trail!(A::AbstractMatrix{T}, M::AbstractMatrix{T}, ipiv::AbstractVector{Int}, m::Int, nb::Int) where T
+    for p in eachindex(ipiv)
+        q = div(ipiv[p]-1,nb) + 1
+        r = (ipiv[p]-1)%nb+1
+        if m == q
+            A[p,:], M[r,:] = M[r,:], A[p,:]
+        end
+    end
+end
+
+function LinearAlgebra.lu(A::DMatrix{T}, ::LinearAlgebra.RowMaximum; check::Bool=true) where T
+    A_copy = LinearAlgebra._lucopy(A, LinearAlgebra.lutype(T))
+    return LinearAlgebra.lu!(A_copy, LinearAlgebra.RowMaximum(); check=check)
+end
+function LinearAlgebra.lu!(A::DMatrix{T}, ::LinearAlgebra.RowMaximum; check::Bool=true) where T
+    zone = one(T)
+    mzone = -one(T)
+
+    Ac = A.chunks
+    mt, nt = size(Ac)
+    m,  n  = size(A)
+    mb, nb = A.partitioning.blocksize
+
+    mb != nb && throw(ArgumentError("Unequal block sizes are not supported: mb = $mb, nb = $nb"))
+
+    ipiv = DVector(collect(1:min(m, n)), Blocks(mb))
+    ipivc = ipiv.chunks
+
+    max_piv_idx = zeros(Int,mt)
+    max_piv_val = zeros(T, mt)
+
+    Dagger.spawn_datadeps() do
+        for k in 1:min(mt, nt)
+            for p in 1:min(nb, m-(k-1)*nb, n-(k-1)*nb)
+                Dagger.@spawn searchmax_pivot!(Out(view(max_piv_idx, k:k)), Out(view(max_piv_val, k:k)), In(view(Ac[k,k],p:min(nb,m-(k-1)*nb),p:p)), p-1)
+                for i in k+1:mt
+                    Dagger.@spawn searchmax_pivot!(Out(view(max_piv_idx, i:i)), Out(view(max_piv_val, i:i)), In(view(Ac[i,k],:,p:p)))
+                end
+                Dagger.@spawn update_ipiv!(InOut(view(ipivc[k],p:p)), In(view(max_piv_idx, k:mt)), In(view(max_piv_val, k:mt)), k, nb)
+                for i in k:mt
+                    Dagger.@spawn swaprows_panel!(InOut(Ac[k, k]), InOut(Ac[i, k]), In(view(ipivc[k],p:p)), i, p, nb)
+                end
+                Dagger.@spawn update_panel!(InOut(view(Ac[k,k],p+1:min(nb,m-(k-1)*nb),:)), In(Ac[k,k]), p)
+                for i in k+1:mt
+                    Dagger.@spawn update_panel!(InOut(Ac[i, k]), In(Ac[k,k]), p)
+                end
+            end
+            for j in Iterators.flatten((1:k-1, k+1:nt))
+                for i in k:mt
+                    Dagger.@spawn swaprows_trail!(InOut(Ac[k, j]), InOut(Ac[i, j]), In(ipivc[k]), i, mb)
+                end
+            end
+            for j in k+1:nt
+                Dagger.@spawn BLAS.trsm!('L', 'L', 'N', 'U', zone, In(Ac[k, k]), InOut(Ac[k, j]))
+                for i in k+1:mt
+                    Dagger.@spawn BLAS.gemm!('N', 'N', mzone, In(Ac[i, k]), In(Ac[k, j]), zone, InOut(Ac[i, j]))
+                end
+            end
+        end
+    end
+
+    return LinearAlgebra.LU{T,DMatrix{T},DVector{Int}}(A, ipiv, 0)
+end
diff --git a/src/array/mul.jl b/src/array/mul.jl
index 44e5ab94..5bafbca1 100644
--- a/src/array/mul.jl
+++ b/src/array/mul.jl
@@ -384,9 +384,7 @@ end
     m, n = size(A)
     C = B'
 
-    for i = 1:m, j = 1:n
-        A[i, j] = C[i, j]
-    end
+    A[1:m, 1:n] .= view(C, 1:m, 1:n)
 end
 
 @inline function copydiagtile!(A, uplo)
@@ -401,7 +399,5 @@ end
         C[diagind(C)] .= A[diagind(A)]
     end
 
-    for i = 1:m, j = 1:n
-        A[i, j] = C[i, j]
-    end
+    A[1:m, 1:n] .= view(C, 1:m, 1:n)
 end
diff --git a/test/array/linalg/cholesky.jl b/test/array/linalg/cholesky.jl
index 95b8fdac..e263d767 100644
--- a/test/array/linalg/cholesky.jl
+++ b/test/array/linalg/cholesky.jl
@@ -1,49 +1,60 @@
-@testset "$T" for T in (Float32, Float64, ComplexF32, ComplexF64)
-    D = rand(Blocks(4, 4), T, 32, 32)
-    if !(T <: Complex)
-        @test !issymmetric(D)
-    end
-    @test !ishermitian(D)
+function test_cholesky(AT)
+    @testset "$T" for T in (Float32, Float64, ComplexF32, ComplexF64)
+        D = rand(Blocks(4, 4), T, 32, 32)
+        if !(T <: Complex)
+            @test !issymmetric(D)
+        end
+        @test !ishermitian(D)
 
-    A = rand(T, 128, 128)
-    A = A * A'
-    A[diagind(A)] .+= size(A, 1)
-    B = copy(A)
-    DA = view(A, Blocks(32, 32))
-    if !(T <: Complex)
-        @test issymmetric(DA)
-    end
-    @test ishermitian(DA)
+        A = AT(rand(T, 128, 128))
+        A = A * A'
+        A[diagind(A)] .+= size(A, 1)
+        B = copy(A)
+        DA = view(A, Blocks(32, 32))
+        if !(T <: Complex)
+            @test issymmetric(DA)
+        end
+        @test ishermitian(DA)
 
-    # Out-of-place
-    chol_A = cholesky(A)
-    chol_DA = cholesky(DA)
-    @test chol_DA isa Cholesky
-    @test chol_A.L ≈ chol_DA.L
-    @test chol_A.U ≈ chol_DA.U
-    # Check that cholesky did not modify A or DA
-    @test A ≈ DA ≈ B
+        # Out-of-place
+        chol_A = cholesky(A)
+        chol_DA = cholesky(DA)
+        @test chol_DA isa Cholesky
+        @test chol_A.L ≈ chol_DA.L
+        @test chol_A.U ≈ chol_DA.U
+        # Check that cholesky did not modify A or DA
+        @test A ≈ DA ≈ B
 
-    # In-place
-    A_copy = copy(A)
-    chol_A = cholesky!(A_copy)
-    chol_DA = cholesky!(DA)
-    @test chol_DA isa Cholesky
-    @test chol_A.L ≈ chol_DA.L
-    @test chol_A.U ≈ chol_DA.U
-    # Check that changes propagated to A
-    @test UpperTriangular(collect(DA)) ≈ UpperTriangular(collect(A))
+        # In-place
+        A_copy = copy(A)
+        chol_A = cholesky!(A_copy)
+        chol_DA = cholesky!(DA)
+        @test chol_DA isa Cholesky
+        @test chol_A.L ≈ chol_DA.L
+        @test chol_A.U ≈ chol_DA.U
+        # Check that changes propagated to A
+        @test UpperTriangular(collect(DA)) ≈ UpperTriangular(collect(A))
 
-    # Non-PosDef matrix
-    A = rand(T, 128, 128)
-    A = A * A'
-    A[diagind(A)] .+= size(A, 1)
-    A[1, 1] = -100
-    DA = view(A, Blocks(32, 32))
-    if !(T <: Complex)
-        @test issymmetric(DA)
+        # Non-PosDef matrix
+        A = AT(rand(T, 128, 128))
+        A = A * A'
+        A[diagind(A)] .+= size(A, 1)
+        A[1, 1] = -100
+        DA = view(A, Blocks(32, 32))
+        if !(T <: Complex)
+            @test issymmetric(DA)
+        end
+        @test ishermitian(DA)
+        @test_broken cholesky(DA).U == 42 # This should throw PosDefException
+        #@test_throws_unwrap PosDefException cholesky(DA).U
     end
-    @test ishermitian(DA)
-    @test_broken cholesky(DA).U == 42 # This should throw PosDefException
-    #@test_throws_unwrap PosDefException cholesky(DA).U
 end
+
+for (kind, AT, scope) in ALL_SCOPES
+    kind == :oneAPI || kind == :Metal || kind == :OpenCL && continue
+    @testset "$kind" begin
+        Dagger.with_options(;scope) do
+            test_cholesky(AT)
+        end
+    end
+end
\ No newline at end of file
diff --git a/test/array/linalg/core.jl b/test/array/linalg/core.jl
index 87c3d3d7..1418de60 100644
--- a/test/array/linalg/core.jl
+++ b/test/array/linalg/core.jl
@@ -1,11 +1,22 @@
-@testset "isapprox" begin
-    A = rand(16, 16)
+function test_linalg_core(AT)
+    @testset "isapprox" begin
+        A = AT(rand(16, 16))
 
-    U1 = UpperTriangular(DArray(A, Blocks(16, 16)))
-    U2 = UpperTriangular(DArray(A, Blocks(16, 16)))
-    @test isapprox(U1, U2)
+        U1 = UpperTriangular(DArray(A, Blocks(16, 16)))
+        U2 = UpperTriangular(DArray(A, Blocks(16, 16)))
+        @test isapprox(U1, U2)
 
-    L1 = LowerTriangular(DArray(A, Blocks(16, 16)))
-    L2 = LowerTriangular(DArray(A, Blocks(16, 16)))
-    @test isapprox(L1, L2)
+        L1 = LowerTriangular(DArray(A, Blocks(16, 16)))
+        L2 = LowerTriangular(DArray(A, Blocks(16, 16)))
+        @test isapprox(L1, L2)
+    end
 end
+
+for (kind, AT, scope) in ALL_SCOPES
+    kind == :oneAPI || kind == :Metal || kind == :OpenCL && continue
+    @testset "$kind" begin
+        Dagger.with_options(;scope) do
+            test_linalg_core(AT)
+        end
+    end
+end
\ No newline at end of file
diff --git a/test/array/linalg/lu.jl b/test/array/linalg/lu.jl
index 7f6993a0..d9b613bc 100644
--- a/test/array/linalg/lu.jl
+++ b/test/array/linalg/lu.jl
@@ -1,33 +1,48 @@
-@testset "$T" for T in (Float32, Float64, ComplexF32, ComplexF64)
-    A = rand(T, 128, 128)
-    B = copy(A)
-    DA = view(A, Blocks(64, 64))
+function test_lu(AT)
+    @testset "$T with $pivot" for T in (Float32, Float64, ComplexF32, ComplexF64), pivot in (NoPivot(), RowMaximum())
+        A = AT(rand(T, 128, 128))
+        B = copy(A)
+        DA = view(A, Blocks(64, 64))
 
-    # Out-of-place
-    lu_A = lu(A, NoPivot())
-    lu_DA = lu(DA, NoPivot())
-    @test lu_DA isa LU{T,DMatrix{T},DVector{Int}}
-    if !(T in (Float32, ComplexF32)) # FIXME: NoPivot is unstable for FP32
-        @test lu_A.L ≈ lu_DA.L
-        @test lu_A.U ≈ lu_DA.U
-    end
-    @test lu_A.P ≈ lu_DA.P
-    @test lu_A.p ≈ lu_DA.p
-    # Check that lu did not modify A or DA
-    @test A ≈ DA ≈ B
+        # Out-of-place
+        lu_A = lu(A, pivot)
+        lu_DA = lu(DA, pivot)
+        @test lu_DA isa LU{T,DMatrix{T},DVector{Int}}
+        if !(T in (Float32, ComplexF32) && pivot == NoPivot()) # FIXME: NoPivot is unstable for FP32
+            @test lu_A.L ≈ lu_DA.L
+            @test lu_A.U ≈ lu_DA.U
+        end
+        @test lu_A.P ≈ lu_DA.P
+        @test lu_A.p ≈ lu_DA.p
+        # Check that lu did not modify A or DA
+        @test A ≈ DA ≈ B
+
+        # In-place
+        A_copy = copy(A)
+        lu_A = lu!(A_copy, pivot)
+        lu_DA = lu!(DA, pivot)
+        @test lu_DA isa LU{T,DMatrix{T},DVector{Int}}
+        if !(T in (Float32, ComplexF32) && pivot == NoPivot()) # FIXME: NoPivot is unstable for FP32
+            @test lu_A.L ≈ lu_DA.L
+            @test lu_A.U ≈ lu_DA.U
+        end
+        @test lu_A.P ≈ lu_DA.P
+        @test lu_A.p ≈ lu_DA.p
+        # Check that changes propagated to A
+        @test DA ≈ A
+        @test !(B ≈ A)
 
-    # In-place
-    A_copy = copy(A)
-    lu_A = lu!(A_copy, NoPivot())
-    lu_DA = lu!(DA, NoPivot())
-    @test lu_DA isa LU{T,DMatrix{T},DVector{Int}}
-    if !(T in (Float32, ComplexF32)) # FIXME: NoPivot is unstable for FP32
-        @test lu_A.L ≈ lu_DA.L
-        @test lu_A.U ≈ lu_DA.U
+        # Non-square block sizes are not supported
+        @test_throws ArgumentError lu(rand(Blocks(64, 32), T, 128, 128), pivot)
+        @test_throws ArgumentError lu!(rand(Blocks(64, 32), T, 128, 128), pivot)
     end
-    @test lu_A.P ≈ lu_DA.P
-    @test lu_A.p ≈ lu_DA.p
-    # Check that changes propagated to A
-    @test DA ≈ A
-    @test !(B ≈ A)
 end
+
+for (kind, AT, scope) in ALL_SCOPES
+    kind == :oneAPI || kind == :Metal || kind == :OpenCL && continue
+    @testset "$kind" begin
+        Dagger.with_options(;scope) do
+            test_lu(AT)
+        end
+    end
+end
\ No newline at end of file
diff --git a/test/array/linalg/matmul.jl b/test/array/linalg/matmul.jl
index e15f7432..6e8a3751 100644
--- a/test/array/linalg/matmul.jl
+++ b/test/array/linalg/matmul.jl
@@ -17,14 +17,14 @@ end
     @test collect(x^3) == collect(x*x*x)
 end
 
-function test_gemm!(T, szA, szB, partA, partB)
+function test_gemm!(AT, T, szA, szB, partA, partB)
     @assert szA[1] == szB[2]
     szC = (szA[1], szA[1])
     @assert partA.blocksize[1] == partB.blocksize[2]
     partC = Blocks(partA.blocksize[1], partB.blocksize[2])
 
-    A = rand(T, szA...)
-    B = rand(T, szB...)
+    A = AT(rand(T, szA...))
+    B = AT(rand(T, szB...))
 
     DA = distribute(A, partA)
     DB = distribute(B, partB)
@@ -33,82 +33,82 @@ function test_gemm!(T, szA, szB, partA, partB)
     # No transA, No transB
     DC = DA * DB
     C = A * B
-    @test collect(DC) ≈ C
+    @test AT(collect(DC)) ≈ C
 
     if szA == szB
         # No transA, transB
         DC = DA * DB'
         C = A * B'
-        @test collect(DC) ≈ C
+        @test AT(collect(DC)) ≈ C
 
         # transA, No transB
         DC = DA' * DB
         C = A' * B
-        @test collect(DC) ≈ C
+        @test AT(collect(DC)) ≈ C
     end
 
     # transA, transB
     DC = DA' * DB'
     C = A' * B'
-    @test collect(DC) ≈ C
+    @test AT(collect(DC)) ≈ C
 
     ## In-place gemm
     # No transA, No transB
-    C = zeros(T, szC...)
+    C = AT(zeros(T, szC...))
     DC = distribute(C, partC)
     mul!(C, A, B)
     mul!(DC, DA, DB)
-    @test collect(DC) ≈ C
+    @test AT(collect(DC)) ≈ C
 
     if szA == szB
         # No transA, transB
-        C = zeros(T, szC...)
+        C = AT(zeros(T, szC...))
         DC = distribute(C, partC)
         mul!(C, A, B')
         mul!(DC, DA, DB')
-        @test collect(DC) ≈ C
+        @test AT(collect(DC)) ≈ C
 
         # transA, No transB
-        C = zeros(T, szC...)
+        C = AT(zeros(T, szC...))
         DC = distribute(C, partC)
         mul!(C, A', B)
         mul!(DC, DA', DB)
-        @test collect(DC) ≈ C
+        @test AT(collect(DC)) ≈ C
     end
 
     # transA, transB
-    C = zeros(T, szA[2], szA[2])
+    C = AT(zeros(T, szA[2], szA[2]))
     DC = distribute(C, partC)
     mul!(C, A', B')
     mul!(DC, DA', DB')
-    collect(DC) ≈ C
+    @test AT(collect(DC)) ≈ C
 
     if szA == szB
         ## Out-of-place syrk
         # No trans, trans
         DC = DA * DA'
         C = A * A'
-        @test collect(DC) ≈ C
+        @test AT(collect(DC)) ≈ C
 
         # trans, No trans
         DC = DA' * DA
         C = A' * A
-        @test collect(DC) ≈ C
+        @test AT(collect(DC)) ≈ C
 
         ## In-place syrk
         # No trans, trans
-        C = zeros(T, szC...)
+        C = AT(zeros(T, szC...))
         DC = distribute(C, partC)
         mul!(C, A, A')
         mul!(DC, DA, DA')
-        @test collect(DC) ≈ C
+        @test AT(collect(DC)) ≈ C
 
         # trans, No trans
-        C = zeros(T, szC...)
+        C = AT(zeros(T, szC...))
         DC = distribute(C, partC)
         mul!(C, A', A)
         mul!(DC, DA', DA)
-        @test collect(DC) ≈ C
+        @test AT(collect(DC)) ≈ C
     end
 end
 
@@ -136,10 +136,19 @@ part_sets_to_test = map(_sizes_to_test) do sz
     ]
 end
 parts_to_test = vcat(part_sets_to_test...)
-@testset "Size=$szA*$szB" for (szA, szB) in sizes_to_test
-    @testset "Partitioning=$partA*$partB" for (partA,partB) in parts_to_test
-        @testset "T=$T" for T in (Float32, Float64, ComplexF32, ComplexF64)
-            test_gemm!(T, szA, szB, partA, partB)
+
+for (kind, AT, scope) in ALL_SCOPES
+    kind == :oneAPI || kind == :Metal || kind == :OpenCL && continue
+    @testset "$kind" begin
+        Dagger.with_options(;scope) do
+            @testset "Size=$szA*$szB" for (szA, szB) in sizes_to_test
+                @testset "Partitioning=$partA*$partB" for (partA,partB) in parts_to_test
+                    @testset "T=$T" for T in (Float32, Float64, ComplexF32, ComplexF64)
+                        kind == :ROCm && (T == ComplexF32 || T == ComplexF64) && continue # missing herk!
+                        test_gemm!(AT, T, szA, szB, partA, partB)
+                    end
+                end
+            end
         end
     end
-end
+end
\ No newline at end of file
diff --git a/test/array/stencil.jl b/test/array/stencil.jl
index 69b7bb06..e128c196 100644
--- a/test/array/stencil.jl
+++ b/test/array/stencil.jl
@@ -1,6 +1,6 @@
 import Dagger: @stencil, Wrap, Pad
 
-function test_stencil()
+function test_stencil(AT)
     @testset "Simple assignment" begin
         A = zeros(Blocks(2, 2), Int, 4, 4)
         Dagger.spawn_datadeps() do
@@ -12,7 +12,7 @@ function test_stencil()
     end
 
     @testset "Wrap boundary" begin
-        A = zeros(Int, 4, 4)
+        A = AT(zeros(Int, 4, 4))
         A[1,1] = 10
         A = DArray(A, Blocks(2, 2))
         B = zeros(Blocks(2, 2), Int, 4, 4)
@@ -84,7 +84,7 @@ function test_stencil()
 
     @testset "Multiple DArrays" begin
         A = ones(Blocks(2, 2), Int, 4, 4)
-        B = DArray(fill(2, 4, 4), Blocks(2, 2))
+        B = DArray(AT(fill(2, 4, 4)), Blocks(2, 2))
         C = zeros(Blocks(2, 2), Int, 4, 4)
         Dagger.spawn_datadeps() do
             @stencil begin
@@ -115,7 +115,7 @@ function test_stencil()
         # Sum = 5*5 (for the padded values) + 1*4 (for the actual values from A) = 25 + 4 = 29.
         # This logic applies to all elements in B because the array A is small (2x2) and the neighborhood is 1.
         # Every element's 3x3 neighborhood will include 5 padded values and the 4 values of A.
-        expected_B_pad_val = fill(pad_value*5 + 1*4, 2, 2)
+        expected_B_pad_val = AT(fill(pad_value*5 + 1*4, 2, 2))
         @test collect(B) == expected_B_pad_val
     end
 
@@ -163,18 +163,12 @@ function test_stencil()
     end
 end
 
-@testset "CPU" begin
-    test_stencil()
-end
-
-@testset "GPU" begin
-    for (kind, scope) in GPU_SCOPES
-        # FIXME
-        kind == :oneAPI && continue
-        @testset "$kind" begin
-            Dagger.with_options(;scope) do
-                test_stencil()
-            end
+for (kind, AT, scope) in ALL_SCOPES
+    # FIXME: These both crash in LLVM
+    kind == :oneAPI || kind == :OpenCL && continue
+    @testset "$kind" begin
+        Dagger.with_options(;scope) do
+            test_stencil(AT)
         end
     end
-end
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 39860cf2..ac80cf95 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,49 +7,47 @@ USE_METAL = parse(Bool, get(ENV, "CI_USE_METAL", "0"))
 USE_OPENCL = parse(Bool, get(ENV, "CI_USE_OPENCL", "0"))
 USE_GPU = USE_CUDA || USE_ROCM || USE_ONEAPI || USE_METAL || USE_OPENCL
 
+# (title, filename, supports_gpu)
 tests = [
-    ("Thunk", "thunk.jl"),
-    ("Scheduler", "scheduler.jl"),
-    ("Processors", "processors.jl"),
-    ("Memory Spaces", "memory-spaces.jl"),
-    ("Logging", "logging.jl"),
-    ("Checkpointing", "checkpoint.jl"),
-    ("Scopes", "scopes.jl"),
-    ("Options", "options.jl"),
-    ("Mutation", "mutation.jl"),
-    ("Task Queues", "task-queues.jl"),
-    ("Task Affinity", "task-affinity.jl"),
-    ("Datadeps", "datadeps.jl"),
-    ("Streaming", "streaming.jl"),
-    ("Domain Utilities", "domain.jl"),
-    ("Array - Allocation", "array/allocation.jl"),
-    ("Array - Indexing", "array/indexing.jl"),
-    ("Array - Core", "array/core.jl"),
-    ("Array - Copyto", "array/copyto.jl"),
-    ("Array - MapReduce", "array/mapreduce.jl"),
-    ("Array - LinearAlgebra - Core", "array/linalg/core.jl"),
-    ("Array - LinearAlgebra - Matmul", "array/linalg/matmul.jl"),
-    ("Array - LinearAlgebra - Cholesky", "array/linalg/cholesky.jl"),
-    ("Array - LinearAlgebra - LU", "array/linalg/lu.jl"),
-    ("Array - Random", "array/random.jl"),
-    ("Array - Stencils", "array/stencil.jl"),
-    ("Array - FFT", "array/fft.jl"),
-    ("GPU", "gpu.jl"),
-    ("Caching", "cache.jl"),
-    ("Disk Caching", "diskcaching.jl"),
-    ("File IO", "file-io.jl"),
-    ("External Languages - Python", "extlang/python.jl"),
-    ("Preferences", "preferences.jl"),
-    #("Fault Tolerance", "fault-tolerance.jl"),
+    ("Thunk", "thunk.jl", false),
+    ("Scheduler", "scheduler.jl", false),
+    ("Processors", "processors.jl", false),
+    ("Memory Spaces", "memory-spaces.jl", false),
+    ("Logging", "logging.jl", false),
+    ("Checkpointing", "checkpoint.jl", false),
+    ("Scopes", "scopes.jl", false),
+    ("Options", "options.jl", false),
+    ("Mutation", "mutation.jl", false),
+    ("Task Queues", "task-queues.jl", false),
+    ("Task Affinity", "task-affinity.jl", false),
+    ("Datadeps", "datadeps.jl", false),
+    ("Streaming", "streaming.jl", false),
+    ("Domain Utilities", "domain.jl", false),
+    ("Array - Allocation", "array/allocation.jl", false),
+    ("Array - Indexing", "array/indexing.jl", false),
+    ("Array - Core", "array/core.jl", false),
+    ("Array - Copyto", "array/copyto.jl", false),
+    ("Array - MapReduce", "array/mapreduce.jl", false),
+    ("Array - LinearAlgebra - Core", "array/linalg/core.jl", true),
+    ("Array - LinearAlgebra - Matmul", "array/linalg/matmul.jl", true),
+    ("Array - LinearAlgebra - Cholesky", "array/linalg/cholesky.jl", true),
+    ("Array - LinearAlgebra - LU", "array/linalg/lu.jl", true),
+    ("Array - Random", "array/random.jl", false),
+    ("Array - Stencils", "array/stencil.jl", true),
+    ("Array - FFT", "array/fft.jl", false),
+    ("GPU", "gpu.jl", true),
+    ("Caching", "cache.jl", false),
+    ("Disk Caching", "diskcaching.jl", false),
+    ("File IO", "file-io.jl", false),
+    ("External Languages - Python", "extlang/python.jl", false),
+    ("Preferences", "preferences.jl", false),
+    #("Fault Tolerance", "fault-tolerance.jl", false),
 ]
 if USE_GPU
     # Only run GPU tests
-    tests = [
-        ("GPU", "gpu.jl"),
-        ("Array - Stencils", "array/stencil.jl"),
-    ]
+    filter!(test->test[3], tests)
 end
-all_test_names = map(test -> replace(last(test), ".jl"=>""), tests)
+all_test_names = map(test -> replace(test[2], ".jl"=>""), tests)
 
 additional_workers::Int = 3
 
@@ -91,7 +89,7 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__
     to_test = String[]
     for test in parsed_args["test"]
         if isdir(joinpath(@__DIR__, test))
-            for (_, other_test) in tests
+            for (_, other_test, _) in tests
                 if startswith(other_test, test)
                     push!(to_test, other_test)
                     continue
@@ -102,7 +100,7 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__
         else
             println(stderr, "Unknown test: $test")
             println(stderr, "Available tests:")
-            for ((test_title, _), test_name) in zip(tests, all_test_names)
+            for ((test_title, _, _), test_name) in zip(tests, all_test_names)
                 println(stderr, "  $test_name: $test_title")
             end
             exit(1)
@@ -145,10 +143,17 @@ using UUIDs
 
 @everywhere MemPool.MEM_RESERVED[] = 0
 
-GPU_SCOPES = Pair{Symbol, Dagger.AbstractScope}[]
+CPU_SCOPES = Tuple{Symbol, Type, Dagger.AbstractScope}[]
+GPU_SCOPES = Tuple{Symbol, Type, Dagger.AbstractScope}[]
 if USE_GPU
     include("setup_gpu.jl")
+else
+    push!(CPU_SCOPES, (:OneWorker_SingleThreaded, Array, Dagger.scope(;worker=1, thread=1)))
+    push!(CPU_SCOPES, (:OneWorker_MultiThreaded, Array, Dagger.scope(;worker=1, threads=:)))
+    push!(CPU_SCOPES, (:MultiWorker_SingleThreaded, Array, Dagger.scope(;workers=:, thread=1)))
+    push!(CPU_SCOPES, (:MultiWorker_MultiThreaded, Array, Dagger.scope(;workers=:, threads=:)))
 end
+ALL_SCOPES = vcat(CPU_SCOPES, GPU_SCOPES)
 
 try
     for test in to_test
diff --git a/test/setup_gpu.jl b/test/setup_gpu.jl
index 93116185..cc060ec5 100644
--- a/test/setup_gpu.jl
+++ b/test/setup_gpu.jl
@@ -58,32 +58,32 @@ end
 end
 
 if USE_CUDA
-    push!(GPU_SCOPES, :CUDA => Dagger.scope(;worker=1, cuda_gpu=1))
+    push!(GPU_SCOPES, (:CUDA, CuArray, Dagger.scope(;worker=1, cuda_gpu=1)))
     if length(CUDA.devices()) > 1
-        push!(GPU_SCOPES, :CUDA => Dagger.scope(;worker=1, cuda_gpu=2))
+        push!(GPU_SCOPES, (:CUDA, CuArray, Dagger.scope(;worker=1, cuda_gpu=2)))
     end
 end
 if USE_ROCM
-    push!(GPU_SCOPES, :ROCm => Dagger.scope(;worker=1, rocm_gpu=1))
+    push!(GPU_SCOPES, (:ROCm, ROCArray, Dagger.scope(;worker=1, rocm_gpu=1)))
     if length(AMDGPU.devices()) > 1
-        push!(GPU_SCOPES, :ROCm => Dagger.scope(;worker=1, rocm_gpu=2))
+        push!(GPU_SCOPES, (:ROCm, ROCmrray, Dagger.scope(;worker=1, rocm_gpu=2)))
     end
 end
 if USE_ONEAPI
-    push!(GPU_SCOPES, :oneAPI => Dagger.scope(;worker=1, intel_gpu=1))
+    push!(GPU_SCOPES, (:oneAPI, oneArray, Dagger.scope(;worker=1, intel_gpu=1)))
     if length(oneAPI.devices()) > 1
-        push!(GPU_SCOPES, :oneAPI => Dagger.scope(;worker=1, intel_gpu=2))
+        push!(GPU_SCOPES, (:oneAPI, oneArray, Dagger.scope(;worker=1, intel_gpu=2)))
     end
 end
 if USE_METAL
-    push!(GPU_SCOPES, :Metal => Dagger.scope(;worker=1, metal_gpu=1))
+    push!(GPU_SCOPES, (:Metal, MtlArray, Dagger.scope(;worker=1, metal_gpu=1)))
     if length(Metal.devices()) > 1
-        push!(GPU_SCOPES, :Metal => Dagger.scope(;worker=1, metal_gpu=2))
+        push!(GPU_SCOPES, (:Metal, MtlArray, Dagger.scope(;worker=1, metal_gpu=2)))
     end
 end
 if USE_OPENCL
-    push!(GPU_SCOPES, :OpenCL => Dagger.scope(;worker=1, cl_device=1))
+    push!(GPU_SCOPES, (:OpenCL, CLArray, Dagger.scope(;worker=1, cl_device=1)))
     if length(cl.devices(cl.default_platform())) > 1
-        push!(GPU_SCOPES, :OpenCL => Dagger.scope(;worker=1, cl_device=2))
+        push!(GPU_SCOPES, (:OpenCL, CLArray, Dagger.scope(;worker=1, cl_device=2)))
     end
 end