From 18be296961c051b5857ae2400b2b9f3ce05ed322 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sat, 4 May 2019 14:46:00 +0200 Subject: [PATCH 1/5] Use unified memory for array allocations. --- src/indexing.jl | 32 +++++++++++++++++++++++++------- src/memory.jl | 2 +- src/solver/CUSOLVER.jl | 3 ++- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/indexing.jl b/src/indexing.jl index 5965972a..26d2ad0f 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -1,13 +1,31 @@ import GPUArrays: allowscalar, @allowscalar -function _getindex(xs::CuArray{T}, i::Integer) where T - buf = Array{T}(undef) - copyto!(buf, 1, xs, i, 1) - buf[] + +## unified memory indexing + +# TODO: needs to think about coherency -- otherwise this might crash since it doesn't sync +# also, this optim would be relevant for CuArray<->Array memcpy as well. + +function GPUArrays._getindex(xs::CuArray{T}, i::Integer) where T + buf = buffer(xs) + if isa(buf, Mem.UnifiedBuffer) + ptr = convert(Ptr{T}, buffer(xs)) + unsafe_load(ptr, i) + else + val = Array{T}(undef) + copyto!(val, 1, xs, i, 1) + val[] + end end -function _setindex!(xs::CuArray{T}, v::T, i::Integer) where T - copyto!(xs, i, T[v], 1, 1) +function GPUArrays._setindex!(xs::CuArray{T}, v::T, i::Integer) where T + buf = buffer(xs) + if isa(buf, Mem.UnifiedBuffer) + ptr = convert(Ptr{T}, buffer(xs)) + unsafe_store!(ptr, v, i) + else + copyto!(xs, i, T[v], 1, 1) + end end @@ -19,7 +37,7 @@ function Base.getindex(xs::CuArray{T}, bools::CuArray{Bool}) where {T} bools = reshape(bools, prod(size(bools))) indices = cumsum(bools) # unique indices for elements that are true - n = _getindex(indices, length(indices)) # number that are true + n = GPUArrays._getindex(indices, length(indices)) # number that are true ys = CuArray{T}(undef, n) if n > 0 diff --git a/src/memory.jl b/src/memory.jl index ef08f708..9066b666 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -50,7 +50,7 @@ function actual_alloc(bytes) # try the actual allocation try alloc_stats.actual_time += Base.@elapsed begin - @timeit alloc_to "alloc" buf = Mem.alloc(Mem.Device, bytes) + @timeit alloc_to "alloc" buf = Mem.alloc(Mem.Unified, bytes) end @assert sizeof(buf) == bytes alloc_stats.actual_nalloc += 1 diff --git a/src/solver/CUSOLVER.jl b/src/solver/CUSOLVER.jl index 8a974de0..88914783 100644 --- a/src/solver/CUSOLVER.jl +++ b/src/solver/CUSOLVER.jl @@ -1,7 +1,8 @@ module CUSOLVER using ..CuArrays -using ..CuArrays: libcusolver, active_context, _getindex, unsafe_free! +using ..CuArrays: libcusolver, active_context, unsafe_free! +using GPUArrays: _getindex using ..CUBLAS: cublasFillMode_t, cublasOperation_t, cublasSideMode_t, cublasDiagType_t using ..CUSPARSE: cusparseMatDescr_t From 731b915d5063ecc18a5071858bf86468b9df1545 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 10 Oct 2019 13:27:53 +0200 Subject: [PATCH 2/5] Monitor memory coherency. --- src/indexing.jl | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/indexing.jl b/src/indexing.jl index 26d2ad0f..16c07067 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -3,12 +3,36 @@ import GPUArrays: allowscalar, @allowscalar ## unified memory indexing -# TODO: needs to think about coherency -- otherwise this might crash since it doesn't sync -# also, this optim would be relevant for CuArray<->Array memcpy as well. +const coherent = Ref(true) + +# toggle coherency based on API calls +function set_coherency(apicall) + # TODO: whitelist + coherent[] = false + return +end + +function force_coherency() + # TODO: not on newer hardware with certain flags + + if CUDAdrv.apicall_hook[] !== set_coherency + # we didn't have our API call hook in place, all bets are off + coherent[] = false + end + + if !coherent[] + CUDAdrv.synchronize() + coherent[] = true + elseif CUDAdrv.apicall_hook[] === nothing + # nobody else is hooking for CUDA API calls, so we can safely install ours + CUDAdrv.apicall_hook[] = set_coherency + end +end function GPUArrays._getindex(xs::CuArray{T}, i::Integer) where T buf = buffer(xs) if isa(buf, Mem.UnifiedBuffer) + force_coherency() ptr = convert(Ptr{T}, buffer(xs)) unsafe_load(ptr, i) else @@ -21,6 +45,7 @@ end function GPUArrays._setindex!(xs::CuArray{T}, v::T, i::Integer) where T buf = buffer(xs) if isa(buf, Mem.UnifiedBuffer) + force_coherency() ptr = convert(Ptr{T}, buffer(xs)) unsafe_store!(ptr, v, i) else From 7abd24ef35d68c2dafe0e49485feb45cce43287a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 10 Oct 2019 14:27:42 +0200 Subject: [PATCH 3/5] Update CUDAdrv. --- Manifest.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Manifest.toml b/Manifest.toml index 1c8ca3be..c0ec47f6 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -40,7 +40,9 @@ version = "1.2.0" [[CUDAdrv]] deps = ["CUDAapi", "Libdl", "Printf"] -git-tree-sha1 = "9ce99b5732c70e06ed97c042187baed876fb1698" +git-tree-sha1 = "f4420a71d8847fa13ad70d744fe5c3696b7efca0" +repo-rev = "master" +repo-url = "https://github.com/JuliaGPU/CUDAdrv.jl.git" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" version = "3.1.0" From c540950cccc0b08a83cf0d42e829234158af4691 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 10 Oct 2019 14:27:59 +0200 Subject: [PATCH 4/5] Inspect memory coherency device capabilities. --- src/CuArrays.jl | 8 ++++++-- src/indexing.jl | 42 +++++++++++++----------------------------- 2 files changed, 19 insertions(+), 31 deletions(-) diff --git a/src/CuArrays.jl b/src/CuArrays.jl index 377b17eb..684f9dc4 100644 --- a/src/CuArrays.jl +++ b/src/CuArrays.jl @@ -90,8 +90,8 @@ function __init__() # package integrations @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" include("forwarddiff.jl") - # update the active context when we switch devices - callback = (::CuDevice, ctx::CuContext) -> begin + callback = (dev::CuDevice, ctx::CuContext) -> begin + # update the active context active_context[] = ctx # wipe the active handles @@ -103,6 +103,10 @@ function __init__() CURAND._generator[] = nothing CUDNN._handle[] = C_NULL CUTENSOR._handle[] = C_NULL + + # update the coherent memory access indicator + coherent[] = CUDAdrv.version() >= v"9.0" && + attribute(dev, CUDAdrv.CONCURRENT_MANAGED_ACCESS) end push!(CUDAnative.device!_listeners, callback) diff --git a/src/indexing.jl b/src/indexing.jl index 16c07067..a044c77b 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -3,37 +3,21 @@ import GPUArrays: allowscalar, @allowscalar ## unified memory indexing -const coherent = Ref(true) - -# toggle coherency based on API calls -function set_coherency(apicall) - # TODO: whitelist - coherent[] = false - return -end - -function force_coherency() - # TODO: not on newer hardware with certain flags - - if CUDAdrv.apicall_hook[] !== set_coherency - # we didn't have our API call hook in place, all bets are off - coherent[] = false - end - - if !coherent[] - CUDAdrv.synchronize() - coherent[] = true - elseif CUDAdrv.apicall_hook[] === nothing - # nobody else is hooking for CUDA API calls, so we can safely install ours - CUDAdrv.apicall_hook[] = set_coherency - end -end +# > Simultaneous access to managed memory from the CPU and GPUs of compute capability lower +# > than 6.0 is not possible. This is because pre-Pascal GPUs lack hardware page faulting, +# > so coherence can’t be guaranteed. On these GPUs, an access from the CPU while a kernel +# > is running will cause a segmentation fault. +# +# > On Pascal and later GPUs, the CPU and the GPU can simultaneously access managed memory, +# > since they can both handle page faults; however, it is up to the application developer +# > to ensure there are no race conditions caused by simultaneous accesses. +const coherent = Ref(false) function GPUArrays._getindex(xs::CuArray{T}, i::Integer) where T buf = buffer(xs) if isa(buf, Mem.UnifiedBuffer) - force_coherency() - ptr = convert(Ptr{T}, buffer(xs)) + coherent[] || CUDAdrv.synchronize() + ptr = convert(Ptr{T}, buf) unsafe_load(ptr, i) else val = Array{T}(undef) @@ -45,8 +29,8 @@ end function GPUArrays._setindex!(xs::CuArray{T}, v::T, i::Integer) where T buf = buffer(xs) if isa(buf, Mem.UnifiedBuffer) - force_coherency() - ptr = convert(Ptr{T}, buffer(xs)) + coherent[] || CUDAdrv.synchronize() + ptr = convert(Ptr{T}, buf) unsafe_store!(ptr, v, i) else copyto!(xs, i, T[v], 1, 1) From 556ae8af10ecc50d9ab41d90fce66bb82ea18b22 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 10 Oct 2019 14:28:18 +0200 Subject: [PATCH 5/5] Synchronize before getindex to prevent race conditions. --- src/indexing.jl | 6 +++--- src/solver/CUSOLVER.jl | 2 +- src/solver/dense.jl | 40 ++++++++++++++++++++-------------------- src/solver/highlevel.jl | 2 +- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/indexing.jl b/src/indexing.jl index a044c77b..ab48c5cb 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -44,7 +44,7 @@ Base.getindex(xs::CuArray, bools::AbstractArray{Bool}) = getindex(xs, CuArray(bo function Base.getindex(xs::CuArray{T}, bools::CuArray{Bool}) where {T} bools = reshape(bools, prod(size(bools))) - indices = cumsum(bools) # unique indices for elements that are true + indices = @sync cumsum(bools) # unique indices for elements that are true n = GPUArrays._getindex(indices, length(indices)) # number that are true ys = CuArray{T}(undef, n) @@ -82,9 +82,9 @@ end ## findall function Base.findall(bools::CuArray{Bool}) - indices = cumsum(bools) + indices = @sync cumsum(bools) - n = _getindex(indices, length(indices)) + n = GPUArrays._getindex(indices, length(indices)) ys = CuArray{Int}(undef, n) if n > 0 diff --git a/src/solver/CUSOLVER.jl b/src/solver/CUSOLVER.jl index 88914783..787d2639 100644 --- a/src/solver/CUSOLVER.jl +++ b/src/solver/CUSOLVER.jl @@ -1,7 +1,7 @@ module CUSOLVER using ..CuArrays -using ..CuArrays: libcusolver, active_context, unsafe_free! +using ..CuArrays: libcusolver, active_context, unsafe_free!, @sync using GPUArrays: _getindex using ..CUBLAS: cublasFillMode_t, cublasOperation_t, cublasSideMode_t, cublasDiagType_t diff --git a/src/solver/dense.jl b/src/solver/dense.jl index 78938428..3f079ad6 100644 --- a/src/solver/dense.jl +++ b/src/solver/dense.jl @@ -34,7 +34,7 @@ for (bname, fname,elty) in ((:cusolverDnSpotrf_bufferSize, :cusolverDnSpotrf, :F buffer = CuArray{$elty}(undef, bufSize[]) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), cuuplo, n, A, lda, buffer, bufSize[], devinfo) + @sync $fname(dense_handle(), cuuplo, n, A, lda, buffer, bufSize[], devinfo) unsafe_free!(buffer) info = BlasInt(_getindex(devinfo, 1)) @@ -65,7 +65,7 @@ for (fname,elty) in ((:cusolverDnSpotrs, :Float32), ldb = max(1, stride(B, 2)) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), cuuplo, n, nrhs, A, lda, B, ldb, devinfo) + @sync $fname(dense_handle(), cuuplo, n, nrhs, A, lda, B, ldb, devinfo) info = _getindex(devinfo, 1) unsafe_free!(devinfo) @@ -91,7 +91,7 @@ for (bname, fname,elty) in ((:cusolverDnSgetrf_bufferSize, :cusolverDnSgetrf, :F buffer = CuArray{$elty}(undef, bufSize[]) devipiv = CuArray{Cint}(undef, min(m,n)) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), m, n, A, lda, buffer, devipiv, devinfo) + @sync $fname(dense_handle(), m, n, A, lda, buffer, devipiv, devinfo) unsafe_free!(buffer) info = _getindex(devinfo, 1) @@ -122,7 +122,7 @@ for (bname, fname,elty) in ((:cusolverDnSgeqrf_bufferSize, :cusolverDnSgeqrf, :F buffer = CuArray{$elty}(undef, bufSize[]) tau = CuArray{$elty}(undef, min(m, n)) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), m, n, A, lda, tau, buffer, bufSize[], devinfo) + @sync $fname(dense_handle(), m, n, A, lda, tau, buffer, bufSize[], devinfo) unsafe_free!(buffer) info = _getindex(devinfo, 1) @@ -153,7 +153,7 @@ for (bname, fname,elty) in ((:cusolverDnSsytrf_bufferSize, :cusolverDnSsytrf, :F buffer = CuArray{$elty}(undef, bufSize[]) devipiv = CuArray{Cint}(undef, n) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), cuuplo, n, A, lda, devipiv, buffer, bufSize[], devinfo) + @sync $fname(dense_handle(), cuuplo, n, A, lda, devipiv, buffer, bufSize[], devinfo) unsafe_free!(buffer) info = _getindex(devinfo, 1) @@ -192,7 +192,7 @@ for (fname,elty) in ((:cusolverDnSgetrs, :Float32), ldb = max(1, stride(B, 2)) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), cutrans, n, nrhs, A, lda, ipiv, B, ldb, devinfo) + @sync $fname(dense_handle(), cutrans, n, nrhs, A, lda, ipiv, B, ldb, devinfo) info = _getindex(devinfo, 1) unsafe_free!(devinfo) @@ -240,8 +240,8 @@ for (bname, fname, elty) in ((:cusolverDnSormqr_bufferSize, :cusolverDnSormqr, : buffer = CuArray{$elty}(undef, bufSize[]) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), cuside, cutrans, m, n, k, A, lda, tau, C, ldc, buffer, - bufSize[], devinfo) + @sync $fname(dense_handle(), cuside, cutrans, m, n, k, A, lda, tau, C, ldc, buffer, + bufSize[], devinfo) unsafe_free!(buffer) info = _getindex(devinfo, 1) @@ -272,7 +272,7 @@ for (bname, fname, elty) in ((:cusolverDnSorgqr_bufferSize, :cusolverDnSorgqr, : buffer = CuArray{$elty}(undef, bufSize[]) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), m, n, k, A, lda, tau, buffer, bufSize[], devinfo) + @sync $fname(dense_handle(), m, n, k, A, lda, tau, buffer, bufSize[], devinfo) unsafe_free!(buffer) info = _getindex(devinfo, 1) @@ -310,7 +310,7 @@ for (bname, fname, elty, relty) in ((:cusolverDnSgebrd_bufferSize, :cusolverDnSg E = CuArrays.zeros($relty, k) TAUQ = CuArray{$elty}(undef, k) TAUP = CuArray{$elty}(undef, k) - $fname(dense_handle(), m, n, A, lda, D, E, TAUQ, TAUP, buffer, bufSize[], devinfo) + @sync $fname(dense_handle(), m, n, A, lda, D, E, TAUQ, TAUP, buffer, bufSize[], devinfo) unsafe_free!(buffer) info = _getindex(devinfo, 1) @@ -364,8 +364,8 @@ for (bname, fname, elty, relty) in ((:cusolverDnSgesvd_bufferSize, :cusolverDnSg work = CuArray{$elty}(undef, lwork[]) rwork = CuArray{$relty}(undef, min(m, n) - 1) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), jobu, jobvt, m, n, A, lda, S, U, ldu, Vt, ldvt, - work, lwork[], rwork, devinfo) + @sync $fname(dense_handle(), jobu, jobvt, m, n, A, lda, S, U, ldu, Vt, ldvt, + work, lwork[], rwork, devinfo) unsafe_free!(work) unsafe_free!(rwork) @@ -423,8 +423,8 @@ for (bname, fname, elty, relty) in ((:cusolverDnSgesvdj_bufferSize, :cusolverDnS work = CuArray{$elty}(undef, lwork[]) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), cujobz, econ, m, n, A, lda, S, U, ldu, V, ldv, - work, lwork[], devinfo, params[]) + @sync $fname(dense_handle(), cujobz, econ, m, n, A, lda, S, U, ldu, V, ldv, + work, lwork[], devinfo, params[]) unsafe_free!(work) info = _getindex(devinfo, 1) @@ -459,8 +459,8 @@ for (jname, bname, fname, elty, relty) in ((:syevd!, :cusolverDnSsyevd_bufferSiz buffer = CuArray{$elty}(undef, bufSize[]) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), cujobz, cuuplo, n, A, lda, W, - buffer, bufSize[], devinfo) + @sync $fname(dense_handle(), cujobz, cuuplo, n, A, lda, W, + buffer, bufSize[], devinfo) unsafe_free!(buffer) info = _getindex(devinfo, 1) @@ -505,8 +505,8 @@ for (jname, bname, fname, elty, relty) in ((:sygvd!, :cusolverDnSsygvd_bufferSiz buffer = CuArray{$elty}(undef, bufSize[]) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), cuitype, cujobz, cuuplo, n, A, lda, B, ldb, W, - buffer, bufSize[], devinfo) + @sync $fname(dense_handle(), cuitype, cujobz, cuuplo, n, A, lda, B, ldb, W, + buffer, bufSize[], devinfo) unsafe_free!(buffer) info = _getindex(devinfo, 1) @@ -558,8 +558,8 @@ for (jname, bname, fname, elty, relty) in ((:sygvj!, :cusolverDnSsygvj_bufferSiz buffer = CuArray{$elty}(undef, bufSize[]) devinfo = CuArray{Cint}(undef, 1) - $fname(dense_handle(), cuitype, cujobz, cuuplo, n, A, lda, B, ldb, W, - buffer, bufSize[], devinfo, params[]) + @sync $fname(dense_handle(), cuitype, cujobz, cuuplo, n, A, lda, B, ldb, W, + buffer, bufSize[], devinfo, params[]) unsafe_free!(buffer) info = _getindex(devinfo, 1) diff --git a/src/solver/highlevel.jl b/src/solver/highlevel.jl index d54939bb..55218182 100644 --- a/src/solver/highlevel.jl +++ b/src/solver/highlevel.jl @@ -56,7 +56,7 @@ LinearAlgebra.lmul!(trA::Transpose{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) wher function Base.getindex(A::CuQRPackedQ{T, S}, i::Integer, j::Integer) where {T, S} x = CuArrays.zeros(T, size(A, 2)) x[j] = 1 - lmul!(A, x) + @sync lmul!(A, x) return _getindex(x, i) end