Merge #53

bors[bot] · leios · web-flow · commit 70cf00f2de60 · 2020-03-07T01:56:00.000Z
53: Implement async_copy! r=vchuravy a=vchuravy

bors r+

Co-authored-by: James Schloss <jrs.schloss@gmail.com>
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
@@ -3,6 +3,8 @@ module KernelAbstractions
 export @kernel
 export @Const, @localmem, @private, @uniform, @synchronize, @index, groupsize
 export Device, GPU, CPU, CUDA, Event
+export async_copy!
+
 
 using MacroTools
 using StaticArrays
@@ -61,10 +63,12 @@ macro Const end
 abstract type Event end
 import Base.wait
 
-# TODO
+"""
+    async_copy!(::Device, dest::AbstractArray, src::AbstractArray; dependencies = nothing)
+
+Asynchronously copy `src` into `dest` after `dependencies` finish. Returns a waitable event.
+"""
 function async_copy! end
-# function register end
-# function unregister end
 
 ###
 # Kernel language
diff --git a/src/backends/cpu.jl b/src/backends/cpu.jl
@@ -15,11 +15,30 @@ function wait(::CPU, ev::CPUEvent, progress=nothing)
     else
         while !Base.istaskdone(ev.task)
             progress()
-            yield() # yield to the scheduler
+        end
+    end
+end
+function __waitall(::CPU, dependencies, progress)
+    # Promote a lone event to a 1-tuple so every input shape shares one code path.
+    deps = dependencies isa Event ? (dependencies,) : dependencies
+    if deps !== nothing
+        events = collect(deps)
+        # CPU events without a task (`task === nothing`) are skipped entirely.
+        tasks   = [ev.task for ev in events if ev isa CPUEvent && ev.task !== nothing]
+        foreign = [ev for ev in events if !(ev isa CPUEvent)]
+        Base.sync_end(tasks)
+        for event in foreign
+            wait(CPU(), event, progress)
         end
     end
 end
 
+function async_copy!(::CPU, A, B; dependencies=nothing)
+    __waitall(CPU(), dependencies, yield)  # wait for every dependency before copying
+    copyto!(A, B)                          # eager host-side copy: B (src) -> A (dest)
+    return CPUEvent(nothing)               # copy already done, so the event has no task
+end
+
 function (obj::Kernel{CPU})(args...; ndrange=nothing, workgroupsize=nothing, dependencies=nothing)
     if ndrange isa Integer
         ndrange = (ndrange,)
@@ -47,20 +66,7 @@ end
 # Inference barrier
 function __run(obj, ndrange, iterspace, args, dependencies)
     return Threads.@spawn begin
-        if dependencies !== nothing
-            cpu_tasks = Core.Task[]
-            for event in dependencies
-                if event isa CPUEvent && event.task isa Core.Task
-                    push!(cpu_tasks, event.task)
-                end
-            end
-            !isempty(cpu_tasks) && Base.sync_end(cpu_tasks)
-            for event in dependencies
-                if !(event isa CPUEvent)
-                    wait(CPU(), event, ()->yield())
-                end
-            end
-        end
+        __waitall(CPU(), dependencies, yield)
         @sync begin
             # TODO: how do we use the information that the iteration space maps perfectly to
             #       the ndrange without incurring a 2x compilation overhead
diff --git a/src/backends/cuda.jl b/src/backends/cuda.jl
@@ -1,6 +1,6 @@
 import CUDAnative, CUDAdrv
 import CUDAnative: cufunction, DevicePtr
-import CUDAdrv: CuEvent, CuStream, CuDefaultStream
+import CUDAdrv: CuEvent, CuStream, CuDefaultStream, Mem
 
 const FREE_STREAMS = CuStream[]
 const STREAMS = CuStream[]
@@ -76,10 +76,66 @@ wait(::CUDA, ev::CudaEvent, progress=nothing) = __enqueue_wait(ev, CUDAdrv.CuDef
 # but which stream would we target?
 wait(::CUDA, ev::CPUEvent,  progress=nothing) = wait(CPU(), ev, progress)
 
-function __enqueue_wait(ev::CudaEvent, stream::CuStream)
-    CUDAdrv.wait(ev.event, stream)
+# Kept: `wait(::CUDA, ev::CudaEvent, ...)` defined above still calls this helper.
+__enqueue_wait(ev::CudaEvent, stream::CuStream) = CUDAdrv.wait(ev.event, stream)
+
+function __waitall(::CUDA, dependencies, progress, stream)
+    deps = dependencies isa Event ? (dependencies,) : dependencies
+    deps === nothing && return nothing
+    events = collect(deps)
+    # CUDA events are enqueued as waits on `stream` rather than waited on here.
+    for event in filter(d -> d isa CudaEvent, events)
+        __enqueue_wait(event, stream)
+    end
+    # Every other event goes through `wait`, which is handed `progress`.
+    for event in filter(d -> !(d isa CudaEvent), events)
+        wait(CUDA(), event, progress)
+    end
+end
+
+###
+# async_copy
+###
+# - IdDict does not free the memory
+# - WeakRef dict does not unique the key by objectid
+const __pinned_memory = Dict{UInt64, WeakRef}()
+
+function __pin!(a)
+    # use pointer instead of objectid?
+    oid = objectid(a)
+    ref = get(__pinned_memory, oid, nothing)
+    # Skip only if the entry still refers to `a` itself, not merely to a live object.
+    ref !== nothing && ref.value === a && return nothing
+    ad = Mem.register(Mem.Host, pointer(a), sizeof(a))
+    finalizer(_ -> Mem.unregister(ad), a)
+    __pinned_memory[oid] = WeakRef(a)
+    return nothing
+end
+
+function async_copy!(::CUDA, A, B; dependencies=nothing)
+    N = length(A)
+    # The raw pointer copy below is unchecked, so reject a too-short source up front.
+    length(B) >= N || throw(ArgumentError("source has $(length(B)) elements, need $N"))
+    # Host `Array`s are registered with CUDA (see `__pin!`) before the async copy.
+    A isa Array && __pin!(A)
+    B isa Array && __pin!(B)
+
+    stream = next_stream()
+    __waitall(CUDA(), dependencies, yield, stream)
+    event = CuEvent(CUDAdrv.EVENT_DISABLE_TIMING)
+    # NOTE(review): preserve ends once the copy is enqueued; callers keep A/B alive.
+    GC.@preserve A B begin
+        unsafe_copyto!(pointer(A), pointer(B), N, async=true, stream=stream)
+    end
+    CUDAdrv.record(event, stream)
+    return CudaEvent(event)
 end
 
+
+
+###
+# Kernel launch
+###
 function (obj::Kernel{CUDA})(args...; ndrange=nothing, dependencies=nothing, workgroupsize=nothing)
     if ndrange isa Integer
         ndrange = (ndrange,)
@@ -92,18 +148,7 @@ function (obj::Kernel{CUDA})(args...; ndrange=nothing, dependencies=nothing, wor
     end
 
     stream = next_stream()
-    if dependencies !== nothing
-        for event in dependencies
-            if event isa CudaEvent
-                __enqueue_wait(event, stream)
-            end
-        end
-        for event in dependencies
-            if !(event isa CudaEvent)
-                wait(CUDA(), event, ()->yield())
-            end
-        end
-    end
+    __waitall(CUDA(), dependencies, yield, stream)
 
     if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing
         # TODO: allow for NDRange{1, DynamicSize, DynamicSize}(nothing, nothing)
diff --git a/test/async_copy.jl b/test/async_copy.jl
@@ -0,0 +1,25 @@
+using KernelAbstractions, Test, CUDAapi
+if has_cuda_gpu()
+    using CuArrays, CUDAdrv
+    CuArrays.allowscalar(false)
+end
+
+function copy_test(backend, ArrayT, M)
+    # `A` and `B` are built through `ArrayT`; `host` is a plain staging `Array`.
+    A, B = ArrayT(rand(Float64, M)), ArrayT(rand(Float64, M))
+    host = Array{Float64}(undef, M)
+    # Chain B -> host -> A, the second copy depending on the first copy's event.
+    first_copy  = async_copy!(backend, host, B, dependencies=Event(CPU()))
+    second_copy = async_copy!(backend, A, host, dependencies=first_copy)
+    wait(second_copy)
+
+    @test host ≈ Array(A)
+    @test host ≈ Array(B)
+end
+
+M = 1024
+
+if has_cuda_gpu()
+    copy_test(CUDA(), CuArray, M)
+end
+copy_test(CPU(), Array, M)  # host arrays: CuArrays is only loaded when a GPU exists