JuliaGPU
diff --git a/‎lib/cusparse/CUSPARSE.jl
Lines changed: 3 additions & 0 deletions b/‎lib/cusparse/CUSPARSE.jl
Lines changed: 3 additions & 0 deletions
diff --git a/‎lib/cusparse/array.jl
Lines changed: 87 additions & 2 deletions b/‎lib/cusparse/array.jl
Lines changed: 87 additions & 2 deletions
diff --git a/‎lib/cusparse/batched.jl
Lines changed: 64 additions & 0 deletions b/‎lib/cusparse/batched.jl
Lines changed: 64 additions & 0 deletions
diff --git a/‎lib/cusparse/device.jl
Lines changed: 23 additions & 0 deletions b/‎lib/cusparse/device.jl
Lines changed: 23 additions & 0 deletions
diff --git a/‎lib/cusparse/generic.jl
Lines changed: 84 additions & 0 deletions b/‎lib/cusparse/generic.jl
Lines changed: 84 additions & 0 deletions
@@ -32,6 +32,7 @@ include("util.jl")
 include("types.jl")
 include("linalg.jl")
 
+
 # low-level wrappers
 include("helpers.jl")
 include("management.jl")
@@ -51,6 +52,8 @@ include("device.jl")
 include("broadcast.jl")
 include("reduce.jl")
 
+include("batched.jl")
+
 # cache for created, but unused handles
 const idle_handles = HandleCache{CuContext,cusparseHandle_t}()
 
 
@@ -3,6 +3,7 @@
 
 export CuSparseMatrixCSC, CuSparseMatrixCSR, CuSparseMatrixBSR, CuSparseMatrixCOO,
        CuSparseMatrix, AbstractCuSparseMatrix,
+       CuSparseArrayCSR,
        CuSparseVector,
        CuSparseVecOrMat
 
@@ -141,6 +142,32 @@ end
 
 CuSparseMatrixCOO(A::CuSparseMatrixCOO) = A
 
+# Sparse array in CSR layout with value type `Tv`, index type `Ti` and `N`
+# dimensions, stored as three device arrays plus the logical size.
+# NOTE(review): the first two dimensions appear to be the matrix dimensions and
+# the remaining ones batch dimensions (see the `getindex`/`reshape` methods) — confirm.
+mutable struct CuSparseArrayCSR{Tv, Ti, N} <: AbstractCuSparseArray{Tv, Ti, N}
+    # row pointer array (index type `Ti`)
+    rowPtr::CuArray{Ti}
+    # column index array (index type `Ti`)
+    colVal::CuArray{Ti}
+    # stored values (value type `Tv`)
+    nzVal::CuArray{Tv}
+    # logical size of the N-dimensional array
+    dims::NTuple{N,Int}
+    # set by the inner constructor to `length(nzVal)`, i.e. the total number of
+    # elements held in `nzVal`
+    nnz::Ti
+
+    # Inner constructor: the three component arrays must share the same
+    # dimensionality `M`, and `M` must equal `N - 1` (asserted below).
+    function CuSparseArrayCSR{Tv, Ti, N}(rowPtr::CuArray{<:Integer, M}, colVal::CuArray{<:Integer, M}, nzVal::CuArray{Tv, M}, dims::NTuple{N,<:Integer}) where {Tv, Ti<:Integer, M, N}
+        @assert M == N - 1 "CuSparseArrayCSR requires ndims(rowPtr) == ndims(colVal) == ndims(nzVal) == length(dims) - 1"
+        new{Tv, Ti, N}(rowPtr, colVal, nzVal, dims, length(nzVal))
+    end
+end
+
+# Identity conversion: an existing CuSparseArrayCSR is returned as-is, without copying.
+CuSparseArrayCSR(A::CuSparseArrayCSR) = A
+
+# Eagerly release the three device buffers backing `xs`
+# (row pointers, column indices and stored values).
+function CUDA.unsafe_free!(xs::CuSparseArrayCSR)
+    for buffer in (xs.rowPtr, xs.colVal, nonzeros(xs))
+        unsafe_free!(buffer)
+    end
+    return
+end
+
+# Batch strides of the row-pointer and value buffers.
+# A stride of 0 is reported when the second dimension has extent 1, so that a
+# single matrix is broadcast over the batch dimension (batchsize == 1).
+function ptrstride(A::CuSparseArrayCSR)
+    size(A.rowPtr, 2) > 1 || return 0
+    return stride(A.rowPtr, 2)
+end
+function valstride(A::CuSparseArrayCSR)
+    size(A.nzVal, 2) > 1 || return 0
+    return stride(A.nzVal, 2)
+end
+
 """
 Utility union type of [`CuSparseMatrixCSC`](@ref), [`CuSparseMatrixCSR`](@ref),
 [`CuSparseMatrixBSR`](@ref), [`CuSparseMatrixCOO`](@ref).
@@ -154,7 +181,6 @@ const CuSparseMatrix{Tv, Ti} = Union{
 
 const CuSparseVecOrMat = Union{CuSparseVector,CuSparseMatrix}
 
-
 # NOTE: we use Cint as default Ti on CUDA instead of Int to provide
 # maximum compatiblity to old CUSPARSE APIs
 function CuSparseVector{Tv}(iPtr::CuVector{<:Integer}, nzVal::CuVector, len::Integer) where {Tv}
@@ -183,6 +209,11 @@ function CuSparseMatrixCOO{Tv}(rowInd::CuVector{<:Integer}, colInd::CuVector{<:I
     CuSparseMatrixCOO{Tv, Cint}(rowInd,colInd,nzVal,dims,nnz)
 end
 
+# Outer constructor for callers that only specify the value type:
+# the index type is fixed to `Cint`.
+CuSparseArrayCSR{Tv}(rowPtr::CuArray{<:Integer, M}, colVal::CuArray{<:Integer, M},
+                     nzVal::CuArray{Tv, M}, dims::NTuple{N,<:Integer}) where {Tv, M, N} =
+    CuSparseArrayCSR{Tv, Cint, N}(rowPtr, colVal, nzVal, dims)
+
 ## convenience constructors
 CuSparseVector(iPtr::DenseCuArray{<:Integer}, nzVal::DenseCuArray{T}, len::Integer) where {T} =
     CuSparseVector{T}(iPtr, nzVal, len)
@@ -201,6 +232,9 @@ CuSparseMatrixBSR(rowPtr::DenseCuArray, colVal::DenseCuArray, nzVal::DenseCuArra
 CuSparseMatrixCOO(rowInd::DenseCuArray, colInd::DenseCuArray, nzVal::DenseCuArray{T}, dims::NTuple{2,<:Integer}, nnz::Integer=length(nzVal)) where T =
     CuSparseMatrixCOO{T}(rowInd, colInd, nzVal, dims, nnz)
 
+# Convenience constructor: the value type is taken from the element type of `nzVal`.
+function CuSparseArrayCSR(rowPtr::DenseCuArray, colVal::DenseCuArray, nzVal::DenseCuArray{T},
+                          dims::NTuple{N,<:Integer}) where {T,N}
+    return CuSparseArrayCSR{T}(rowPtr, colVal, nzVal, dims)
+end
+
 Base.similar(Vec::CuSparseVector) = CuSparseVector(copy(nonzeroinds(Vec)), similar(nonzeros(Vec)), length(Vec))
 Base.similar(Mat::CuSparseMatrixCSC) = CuSparseMatrixCSC(copy(Mat.colPtr), copy(rowvals(Mat)), similar(nonzeros(Mat)), size(Mat))
 Base.similar(Mat::CuSparseMatrixCSR) = CuSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(nonzeros(Mat)), size(Mat))
@@ -216,6 +250,7 @@ Base.similar(Mat::CuSparseMatrixCOO, T::Type) = CuSparseMatrixCOO(copy(Mat.rowIn
 Base.similar(Mat::CuSparseMatrixCSC, T::Type, N::Int, M::Int) =  CuSparseMatrixCSC(CuVector{Int32}(undef, M+1), CuVector{Int32}(undef, nnz(Mat)), CuVector{T}(undef, nnz(Mat)), (N,M))
 Base.similar(Mat::CuSparseMatrixCSR, T::Type, N::Int, M::Int) =  CuSparseMatrixCSR(CuVector{Int32}(undef, N+1), CuVector{Int32}(undef, nnz(Mat)), CuVector{T}(undef, nnz(Mat)), (N,M))
 Base.similar(Mat::CuSparseMatrixCOO, T::Type, N::Int, M::Int) =  CuSparseMatrixCOO(CuVector{Int32}(undef, nnz(Mat)), CuVector{Int32}(undef, nnz(Mat)), CuVector{T}(undef, nnz(Mat)), (N,M))
+# New array with the same sparsity structure (row pointers and column indices
+# are copied) and an uninitialized value buffer of matching shape.
+function Base.similar(Mat::CuSparseArrayCSR)
+    rowptr = copy(Mat.rowPtr)
+    colval = copy(Mat.colVal)
+    vals = similar(nonzeros(Mat))
+    return CuSparseArrayCSR(rowptr, colval, vals, size(Mat))
+end
 
 ## array interface
 
@@ -225,6 +260,9 @@ Base.size(g::CuSparseVector) = (g.len,)
 Base.length(g::CuSparseMatrix) = prod(g.dims)
 Base.size(g::CuSparseMatrix) = g.dims
 
+# Logical element count (product of all dimensions) and logical size of the array;
+# both are read from the stored `dims` tuple.
+Base.length(g::CuSparseArrayCSR) = prod(g.dims)
+Base.size(g::CuSparseArrayCSR) = g.dims
+
 function Base.size(g::CuSparseVector, d::Integer)
     if d == 1
         return g.len
@@ -245,6 +283,15 @@ function Base.size(g::CuSparseMatrix, d::Integer)
     end
 end
 
+# Extent along dimension `d`. Dimensions beyond `N` have extent 1;
+# a dimension below 1 raises an `ArgumentError`.
+function Base.size(g::CuSparseArrayCSR{Tv,Ti,N}, d::Integer) where {Tv,Ti,N}
+    1 <= d <= N && return g.dims[d]
+    d > 1 || throw(ArgumentError("dimension must be ≥ 1, got $d"))
+    return 1
+end
 
 ## sparse array interface
 
@@ -348,6 +395,16 @@ function Base.getindex(A::CuSparseMatrixBSR{T}, i0::Integer, i1::Integer) where
     nonzeros(A)[c1+block_idx]
 end
 
+# matrix slices
+# `A[:, :, idxs...]` builds a CuSparseMatrixCSR from the slices of the component
+# arrays selected by the trailing indices `idxs`.
+function Base.getindex(A::CuSparseArrayCSR{Tv, Ti, N}, ::Colon, ::Colon, idxs::Integer...) where {Tv, Ti, N}
+    @boundscheck checkbounds(A, :, :, idxs...)
+    rowptr = A.rowPtr[:, idxs...]
+    colval = A.colVal[:, idxs...]
+    vals = nonzeros(A)[:, idxs...]
+    return CuSparseMatrixCSR(rowptr, colval, vals, size(A)[1:2])
+end
+
+# Scalar lookup `A[i0, i1, idxs...]`: build the matrix selected by the trailing
+# indices `idxs`, then index that matrix at `(i0, i1)`.
+function Base.getindex(A::CuSparseArrayCSR{Tv, Ti, N}, i0::Integer, i1::Integer, idxs::Integer...) where {Tv, Ti, N}
+    @boundscheck checkbounds(A, i0, i1, idxs...)
+    rowptr = A.rowPtr[:, idxs...]
+    colval = A.colVal[:, idxs...]
+    vals = nonzeros(A)[:, idxs...]
+    slice = CuSparseMatrixCSR(rowptr, colval, vals, size(A)[1:2])
+    return slice[i0, i1]
+end
 
 ## interop with sparse CPU arrays
 
@@ -502,7 +559,7 @@ Base.copy(Mat::CuSparseMatrixCSC) = copyto!(similar(Mat), Mat)
 Base.copy(Mat::CuSparseMatrixCSR) = copyto!(similar(Mat), Mat)
 Base.copy(Mat::CuSparseMatrixBSR) = copyto!(similar(Mat), Mat)
 Base.copy(Mat::CuSparseMatrixCOO) = copyto!(similar(Mat), Mat)
-
+# Copy of the array: all three component buffers are duplicated.
+function Base.copy(Mat::CuSparseArrayCSR)
+    rowptr = copy(Mat.rowPtr)
+    colval = copy(Mat.colVal)
+    vals = copy(nonzeros(Mat))
+    return CuSparseArrayCSR(rowptr, colval, vals, size(Mat))
+end
 
 # input/output
 
@@ -543,6 +600,24 @@ for (gpu, cpu) in [:CuSparseMatrixCSC => :SparseMatrixCSC,
     end
 end
 
+# Multi-line display: a one-line summary (size, type, stored-entry count) followed,
+# for arrays without empty dimensions, by every `[:, :, …]` slice printed as a
+# CPU `SparseMatrixCSC`.
+function Base.show(io::IOContext, ::MIME"text/plain", A::CuSparseArrayCSR)
+    stored = nnz(A)
+    shape = join(size(A), "×")
+    noun = stored == 1 ? "entry" : "entries"
+
+    print(io, shape, " ", typeof(A), " with ", stored, " stored ", noun)
+
+    # nothing more to print when any dimension is empty
+    all(size(A) .> 0) || return
+
+    println(io, ":")
+    io = IOContext(io, :typeinfo => eltype(A))
+    for (k, c) in enumerate(CartesianIndices(size(A)[3:end]))
+        # blank separation between consecutive slices
+        k > 1 && println(io, "\n")
+        dims = join(c.I, ", ")
+        println(io, "[:, :, $dims] =")
+        Base.print_array(io, SparseMatrixCSC(A[:,:,c.I...]))
+    end
+end
+
 
 # interop with device arrays
 
@@ -590,3 +665,13 @@ function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCOO)
         size(x), x.nnz
     )
 end
+
+# Kernel-side conversion: adapt each component array with `to` and wrap the
+# results, together with the size and `nnz`, in a CuSparseDeviceArrayCSR.
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseArrayCSR)
+    rowptr = adapt(to, x.rowPtr)
+    colval = adapt(to, x.colVal)
+    vals = adapt(to, x.nzVal)
+    return CuSparseDeviceArrayCSR(rowptr, colval, vals, size(x), x.nnz)
+end
+
@@ -0,0 +1,64 @@
+# Concatenate CSR matrices along dimension `dims` (default 3).
+#
+# * `dims == 1` forwards to `vcat`, `dims == 2` forwards to `hcat`.
+#   NOTE(review): relies on `vcat`/`hcat` methods applicable to CuSparseMatrixCSR,
+#   which are not visible here — confirm.
+# * `dims >= 3` stacks the matrices as batch entries of a CuSparseArrayCSR whose
+#   size is `(m, n, 1, …, 1, length(As))`.
+function Base.cat(As::CuSparseMatrixCSR...; dims=3)
+    # Dimension 1 is the vertical (row) direction and dimension 2 the horizontal
+    # (column) direction. The matrices are splatted so that they, and not the
+    # tuple holding them, are concatenated.
+    if dims == 1
+        return vcat(As...)
+    elseif dims == 2
+        return hcat(As...)
+    end
+    # singleton dimensions fill the gap between the matrix dims and `dims`
+    newsize = (size(As[1])..., ones(Int, dims-3)..., length(As))
+    # the component arrays have one dimension less than the sparse array,
+    # hence concatenation along `dims-1`
+    return CuSparseArrayCSR(cat([A.rowPtr for A in As]...; dims=dims-1),
+                            cat([A.colVal for A in As]...; dims=dims-1),
+                            cat([A.nzVal  for A in As]...; dims=dims-1),
+                            newsize)
+end
+
+# Concatenate batched CSR arrays along dimension `dims` (default 3).
+#
+# * `dims == 1` forwards to `vcat`, `dims == 2` forwards to `hcat`.
+#   NOTE(review): relies on `vcat`/`hcat` methods applicable to CuSparseArrayCSR,
+#   which are not visible here — confirm that concatenating along the matrix
+#   dimensions is actually supported.
+# * `dims >= 3` concatenates the component arrays along `dims-1`; the batch shape
+#   of the result is read back from the concatenated row-pointer array.
+function Base.cat(As::CuSparseArrayCSR...; dims=3)
+    # Dimension 1 is the vertical (row) direction and dimension 2 the horizontal
+    # (column) direction. The arrays are splatted so that they, and not the
+    # tuple holding them, are concatenated.
+    if dims == 1
+        return vcat(As...)
+    elseif dims == 2
+        return hcat(As...)
+    end
+    rowPtr = cat([A.rowPtr for A in As]...; dims=dims-1)
+    return CuSparseArrayCSR(rowPtr,
+                            cat([A.colVal for A in As]...; dims=dims-1),
+                            cat([A.nzVal  for A in As]...; dims=dims-1),
+                            (size(As[1])[1:2]..., size(rowPtr)[2:end]...))
+end
+
+# we can't reshape the first two dimensions
+# `reshape(A, :, :, bshape...)` keeps the matrix dimensions and gives the
+# component arrays the trailing shape `bshape`.
+function Base.reshape(A::Union{CuSparseArrayCSR, CuSparseMatrixCSR}, ::Colon, ::Colon, bshape::Int64...)
+    rowptr = reshape(A.rowPtr, :, bshape...)
+    colval = reshape(A.colVal, :, bshape...)
+    vals   = reshape(A.nzVal,  :, bshape...)
+    return CuSparseArrayCSR(rowptr, colval, vals, (size(A)[1:2]..., bshape...))
+end
+
+# Reshape with explicit sizes: the first two entries of `dims` must equal the
+# current matrix dimensions (asserted); the remaining entries become the new
+# trailing shape of the component arrays.
+function Base.reshape(A::CuSparseArrayCSR, dims::Int64...)
+    s1, s2 = dims[1], dims[2]
+    bshape = dims[3:end]
+    @assert s1 == size(A, 1) && s2 == size(A, 2)
+    rowptr = reshape(A.rowPtr, :, bshape...)
+    colval = reshape(A.colVal, :, bshape...)
+    vals   = reshape(A.nzVal,  :, bshape...)
+    return CuSparseArrayCSR(rowptr, colval, vals, (size(A)[1:2]..., bshape...))
+end
+
+# reshape to have a single batch dimension
+# All dimensions after the second are collapsed into one of extent `b`.
+function Base.reshape(A::CuSparseArrayCSR, ::Colon, ::Colon, ::Colon)
+    b = prod(size(A)[3:end])
+    rowptr = reshape(A.rowPtr, :, b)
+    colval = reshape(A.colVal, :, b)
+    vals   = reshape(A.nzVal,  :, b)
+    return CuSparseArrayCSR(rowptr, colval, vals, (size(A)[1:2]..., b))
+end
+
+# repeat non-matrix dimensions
+# `r1` and `r2` must both be 1 (asserted); `rs` gives the repetition counts for
+# the dimensions after the second.
+function Base.repeat(A::Union{CuSparseArrayCSR, CuSparseMatrixCSR}, r1::Int64, r2::Int64, rs::Int64...)
+    @assert r1 == 1 && r2 == 1 "Cannot repeat matrix dimensions of CuSparseCSR"
+    rowptr = repeat(A.rowPtr, 1, rs...)
+    colval = repeat(A.colVal, 1, rs...)
+    vals   = repeat(A.nzVal,  1, rs...)
+    # each trailing extent is multiplied by its repetition count
+    batchdims = ntuple(i -> size(A, i + 2) * rs[i], length(rs))
+    return CuSparseArrayCSR(rowptr, colval, vals, (size(A)[1:2]..., batchdims...))
+end
+
+# scalar addition/subtraction, scalar mul/div (see interfaces.jl +412)
+
+# chkmmdims (see util.jl)
@@ -72,6 +72,22 @@ Base.length(g::CuSparseDeviceMatrixCOO) = prod(g.dims)
 Base.size(g::CuSparseDeviceMatrixCOO) = g.dims
 SparseArrays.nnz(g::CuSparseDeviceMatrixCOO) = g.nnz
 
+# Immutable CSR sparse array whose component arrays are `CuDeviceArray`s.
+# Type parameters: value type `Tv`, index type `Ti`, dimensionality `N` of the
+# sparse array, dimensionality `M` of the component arrays, and `A`, the third
+# `CuDeviceArray` type parameter shared by all three component arrays.
+struct CuSparseDeviceArrayCSR{Tv, Ti, N, M, A} <: AbstractSparseArray{Tv, Ti, N}
+    # row pointer array
+    rowPtr::CuDeviceArray{Ti, M, A} 
+    # column index array
+    colVal::CuDeviceArray{Ti, M, A} 
+    # stored values
+    nzVal::CuDeviceArray{Tv, M, A} 
+    # logical size of the N-dimensional array
+    dims::NTuple{N, Int}
+    # stored-entry count, as supplied by the caller
+    nnz::Ti
+end
+
+# Outer constructor that asserts `M == N - 1` and sets `nnz` to `length(nzVal)`.
+# NOTE(review): the struct declares five type parameters {Tv, Ti, N, M, A}, but
+# this method is attached to `CuSparseDeviceArrayCSR{Tv, Ti, N, A}`, so `A` here
+# appears to occupy the fourth (M) slot. It also accepts host `CuArray`s while
+# the fields are `CuDeviceArray`s. Confirm this signature is intended.
+function CuSparseDeviceArrayCSR{Tv, Ti, N, A}(rowPtr::CuArray{<:Integer, M}, colVal::CuArray{<:Integer, M}, nzVal::CuArray{Tv, M}, dims::NTuple{N,<:Integer}) where {Tv, Ti<:Integer, M, N, A}
+    @assert M == N - 1 "CuSparseDeviceArrayCSR requires ndims(rowPtr) == ndims(colVal) == ndims(nzVal) == length(dims) - 1"
+    CuSparseDeviceArrayCSR{Tv, Ti, N, M, A}(rowPtr, colVal, nzVal, dims, length(nzVal))
+end
+
+# Array-interface accessors for the device type: logical element count, logical
+# size and stored-entry count, all read from the struct's fields.
+Base.length(g::CuSparseDeviceArrayCSR) = prod(g.dims)
+Base.size(g::CuSparseDeviceArrayCSR) = g.dims
+SparseArrays.nnz(g::CuSparseDeviceArrayCSR) = g.nnz
 
 # input/output
 
@@ -108,3 +124,10 @@ function Base.show(io::IO, ::MIME"text/plain", A::CuSparseDeviceMatrixCOO)
     println(io, "  colInd: $(A.colInd)")
     print(io,   "  nzVal:  $(A.nzVal)")
 end
+
+# Plain-text display of the device array: a header line with the logical element
+# count, then one line per component array (the last one without a trailing newline).
+function Base.show(io::IO, ::MIME"text/plain", A::CuSparseDeviceArrayCSR)
+    println(io, "$(length(A))-element device sparse array CSR at:")
+    println(io, "  rowPtr: $(A.rowPtr)")
+    println(io, "  colVal: $(A.colVal)")
+    print(io,   "  nzVal:  $(A.nzVal)")
+end
@@ -2,6 +2,7 @@
 
 export gather!, scatter!, axpby!, rot!
 export vv!, sv!, sm!, gemm, gemm!, sddmm!
+export bmm!
 
 ## API functions
 
@@ -227,6 +228,89 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::Union{CuS
     descB = CuDenseMatrixDescriptor(B)
     descC = CuDenseMatrixDescriptor(C)
 
+    # cusparseDnMatSetStridedBatch(descB, size(B,3), size(B,1)*size(B,2))
+    # cusparseDnMatSetStridedBatch(descB, size(B,3), size(B,1)*size(B,2))
+    # batchsize = length(nonzeros(A)) ÷ nnz(A)
+    # if batchsize > 1
+    #     cusparseCsrSetStridedBatch(obj, batchsize, 0, nnz(A))
+    # end
+
+    function bufferSize()
+        out = Ref{Csize_t}()
+        cusparseSpMM_bufferSize(
+            handle(), transa, transb, Ref{T}(alpha), descA, descB, Ref{T}(beta),
+            descC, T, algo, out)
+        return out[]
+    end
+    with_workspace(bufferSize) do buffer
+        # Uncomment if we find a way to reuse the buffer (issue #1362)
+        # cusparseSpMM_preprocess(
+        #     handle(), transa, transb, Ref{T}(alpha), descA, descB, Ref{T}(beta),
+        #     descC, T, algo, buffer)
+        # end
+        cusparseSpMM(
+            handle(), transa, transb, Ref{T}(alpha), descA, descB, Ref{T}(beta),
+            descC, T, algo, buffer)
+    end
+    return C
+end
+
+# N-dimensional entry point for batched sparse * dense multiplication.
+# All dimensions after the second are collapsed into a single batch dimension,
+# the call is forwarded to `bmm!` with the 3-dimensional arrays, and the
+# caller's `C` (original shape) is returned.
+function bmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseArrayCSR{T,Ti,N},
+              B::DenseCuArray{T,N}, beta::Number, C::DenseCuArray{T,N}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T,Ti,N}
+    A3 = reshape(A, :, :, :)
+    B3 = reshape(B, size(B,1), size(B,2), :)
+    C3 = reshape(C, size(C,1), size(C,2), :)
+    bmm!(transa, transb, alpha, A3, B3, beta, C3, index, algo)
+    return C
+end
+
+# batched sparse * dense -> dense matmul
+function bmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseArrayCSR{T,Ti,3},
+              B::DenseCuArray{T,3}, beta::Number, C::DenseCuArray{T,3}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T,Ti}
+
+    if CUSPARSE.version() < v"11.7.2"
+        throw(ErrorException("Batched dense-matrix times batched sparse-matrix (bmm!) requires a CUSPARSE version ≥ 11.7.2 (yours: $(CUSPARSE.version()))."))
+    end
+
+
+    # Support transa = 'C' and `transb = 'C' for real matrices
+    transa = T <: Real && transa == 'C' ? 'T' : transa
+    transb = T <: Real && transb == 'C' ? 'T' : transb
+
+    m, k = size(A)[1:2]
+    n, bc = size(C)[2:3]
+    b = max(size(A, 3), size(B, 3))
+
+    if b != bc
+        throw(ArgumentError("C must have same batch-dimension as max(size(A,3)=$(size(A,3)), size(B,3)=$(size(B,3))), got $(size(C,3))."))
+    end
+
+    if n == 1 && b > 1
+        throw(ArgumentError("bmm! does not work for n==1 and b>1 due to CUDA error."))
+    end
+
+    if transa == 'N' && transb == 'N'
+        chkbmmdims(B,C,k,n,m,n)
+    elseif transa == 'N' && transb != 'N'
+        chkbmmdims(B,C,n,k,m,n)
+    elseif transa != 'N' && transb == 'N'
+        chkbmmdims(B,C,m,n,k,n)
+    elseif transa != 'N' && transb != 'N'
+        chkbmmdims(B,C,n,m,k,n)
+    end
+
+    descA = CuSparseMatrixDescriptor(A, index)
+    descB = CuDenseMatrixDescriptor(B)
+    descC = CuDenseMatrixDescriptor(C)
+
+    cusparseCsrSetStridedBatch(descA, b, ptrstride(A), valstride(A))
+
+    strideB = stride(B, 3) 
+    cusparseDnMatSetStridedBatch(descB, b, strideB)
+
+    strideC = stride(C, 3)
+    cusparseDnMatSetStridedBatch(descC, b, strideC)
+
     function bufferSize()
         out = Ref{Csize_t}()
         cusparseSpMM_bufferSize(