Skip to content

Commit f81d8e1

Browse files
authored
Merge pull request #62 from JuliaGPU/vc/multievent
add multievents
2 parents 42cc70e + 6c32d0d commit f81d8e1

File tree

4 files changed

+55
-44
lines changed

4 files changed

+55
-44
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "KernelAbstractions"
22
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
33
authors = ["Valentin Churavy <v.churavy@gmail.com>"]
4-
version = "0.1.1"
4+
version = "0.1.2"
55

66
[deps]
77
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"

src/KernelAbstractions.jl

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ module KernelAbstractions
22

33
export @kernel
44
export @Const, @localmem, @private, @uniform, @synchronize, @index, groupsize, @print
5-
export Device, GPU, CPU, CUDA, Event
5+
export Device, GPU, CPU, CUDA, Event, MultiEvent
66
export async_copy!
77

88

@@ -64,6 +64,27 @@ macro Const end
6464
abstract type Event end
6565
import Base.wait
6666

67+
struct NoneEvent <: Event end
68+
69+
struct MultiEvent{T} <: Event
70+
events::T
71+
MultiEvent() = new{Tuple{}}(())
72+
function MultiEvent(events::Tuple{Vararg{<:Event}})
73+
evs = tuplejoin(map(flatten, events)...)
74+
new{typeof(evs)}(evs)
75+
end
76+
end
77+
MultiEvent(::Nothing) = MultiEvent()
78+
MultiEvent(ev::MultiEvent) = ev
79+
80+
@inline tuplejoin(x) = x
81+
@inline tuplejoin(x, y) = (x..., y...)
82+
@inline tuplejoin(x, y, z...) = (x..., tuplejoin(y, z...)...)
83+
84+
flatten(ev::MultiEvent) = tuplejoin(map(flatten, ev.events)...)
85+
flatten(ev::NoneEvent) = ()
86+
flatten(ev::Event) = (ev,)
87+
6788
"""
6889
async_copy!(::Device, dest::AbstractArray, src::AbstractArray; dependencies = nothing)
6990

src/backends/cpu.jl

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,24 @@
11
struct CPUEvent <: Event
2-
task::Union{Nothing, Core.Task}
2+
task::Core.Task
33
end
44

55
function Event(::CPU)
6-
return CPUEvent(nothing)
6+
return NoneEvent()
7+
end
8+
9+
wait(ev::Union{CPUEvent, NoneEvent, MultiEvent}, progress=nothing) = wait(CPU(), ev, progress)
10+
wait(::CPU, ev::NoneEvent, progress=nothing) = nothing
11+
12+
function wait(cpu::CPU, ev::MultiEvent, progress=nothing)
13+
dependencies = collect(ev.events)
14+
cpudeps = filter(d->d isa CPUEvent && d.task !== nothing, dependencies)
15+
otherdeps = filter(d->!(d isa CPUEvent), dependencies)
16+
Base.sync_end(map(e->e.task, cpudeps))
17+
for event in otherdeps
18+
wait(CPU(), event, progress)
19+
end
720
end
821

9-
wait(ev::CPUEvent, progress=nothing) = wait(CPU(), ev, progress)
1022
function wait(::CPU, ev::CPUEvent, progress=nothing)
1123
ev.task === nothing && return
1224

@@ -18,23 +30,9 @@ function wait(::CPU, ev::CPUEvent, progress=nothing)
1830
end
1931
end
2032
end
21-
function __waitall(::CPU, dependencies, progress)
22-
if dependencies isa Event
23-
dependencies = (dependencies,)
24-
end
25-
if dependencies !== nothing
26-
dependencies = collect(dependencies)
27-
cpudeps = filter(d->d isa CPUEvent && d.task !== nothing, dependencies)
28-
otherdeps = filter(d->!(d isa CPUEvent), dependencies)
29-
Base.sync_end(map(e->e.task, cpudeps))
30-
for event in otherdeps
31-
wait(CPU(), event, progress)
32-
end
33-
end
34-
end
3533

3634
function async_copy!(::CPU, A, B; dependencies=nothing)
37-
__waitall(CPU(), dependencies, yield)
35+
wait(CPU(), MultiEvent(dependencies), yield)
3836
copyto!(A, B)
3937
return CPUEvent(nothing)
4038
end
@@ -65,7 +63,7 @@ end
6563

6664
# Inference barriers
6765
function __run(obj, ndrange, iterspace, args, dependencies, ::Val{dynamic}) where dynamic
68-
__waitall(CPU(), dependencies, yield)
66+
wait(CPU(), MultiEvent(dependencies), yield)
6967
N = length(iterspace)
7068
Nthreads = Threads.nthreads()
7169
if Nthreads == 1

src/backends/cuda.jl

Lines changed: 15 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ function Event(::CUDA)
5656
end
5757

5858
wait(ev::CudaEvent, progress=nothing) = wait(CPU(), ev, progress)
59+
5960
function wait(::CPU, ev::CudaEvent, progress=nothing)
6061
if progress === nothing
6162
CUDAdrv.synchronize(ev.event)
@@ -68,30 +69,24 @@ function wait(::CPU, ev::CudaEvent, progress=nothing)
6869
end
6970

7071
# Use this to synchronize between computation using the CuDefaultStream
71-
function wait(::CUDA, ev::CudaEvent, progress=nothing)
72-
CUDAdrv.wait(ev.event, CUDAdrv.CuDefaultStream())
73-
end
72+
wait(::CUDA, ev::CudaEvent, progress=nothing, stream=CUDAdrv.CuDefaultStream()) = CUDAdrv.wait(ev.event, stream)
73+
wait(::CUDA, ev::NoneEvent, progress=nothing, stream=nothing) = nothing
7474

7575
# There is no efficient wait for CPU->GPU synchronization, so instead we
7676
# do a CPU wait, and therefore block anyone from submitting more work.
7777
# We maybe could do a spinning wait on the GPU and atomic flag to signal from the CPU,
7878
# but which stream would we target?
79-
wait(::CUDA, ev::CPUEvent, progress=nothing) = wait(CPU(), ev, progress)
80-
81-
function __waitall(::CUDA, dependencies, progress, stream)
82-
if dependencies isa Event
83-
dependencies = (dependencies,)
79+
wait(::CUDA, ev::CPUEvent, progress=nothing, stream=nothing) = wait(CPU(), ev, progress)
80+
81+
function wait(::CUDA, ev::MultiEvent, progress=nothing, stream=CUDAdrv.CuDefaultStream())
82+
dependencies = collect(ev.events)
83+
cudadeps = filter(d->d isa CudaEvent, dependencies)
84+
otherdeps = filter(d->!(d isa CudaEvent), dependencies)
85+
for event in cudadeps
86+
CUDAdrv.wait(event.event, stream)
8487
end
85-
if dependencies !== nothing
86-
dependencies = collect(dependencies)
87-
cudadeps = filter(d->d isa CudaEvent, dependencies)
88-
otherdeps = filter(d->!(d isa CudaEvent), dependencies)
89-
for event in cudadeps
90-
CUDAdrv.wait(event.event, stream)
91-
end
92-
for event in otherdeps
93-
wait(CUDA(), event, progress)
94-
end
88+
for event in otherdeps
89+
wait(CUDA(), event, progress)
9590
end
9691
end
9792

@@ -119,7 +114,7 @@ function async_copy!(::CUDA, A, B; dependencies=nothing)
119114
B isa Array && __pin!(B)
120115

121116
stream = next_stream()
122-
__waitall(CUDA(), dependencies, yield, stream)
117+
wait(CUDA(), MultiEvent(dependencies), yield, stream)
123118
event = CuEvent(CUDAdrv.EVENT_DISABLE_TIMING)
124119
GC.@preserve A B begin
125120
destptr = pointer(A)
@@ -145,12 +140,9 @@ function (obj::Kernel{CUDA})(args...; ndrange=nothing, dependencies=nothing, wor
145140
if workgroupsize isa Integer
146141
workgroupsize = (workgroupsize, )
147142
end
148-
if dependencies isa Event
149-
dependencies = (dependencies,)
150-
end
151143

152144
stream = next_stream()
153-
__waitall(CUDA(), dependencies, yield, stream)
145+
wait(CUDA(), MultiEvent(dependencies), yield, stream)
154146

155147
if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing
156148
# TODO: allow for NDRange{1, DynamicSize, DynamicSize}(nothing, nothing)

0 commit comments

Comments
 (0)