Skip to content

Commit 4cd4d14

Browse files
authored
Perform synchronization on a worker thread (#2025)
This significantly simplifies synchronization on Julia 1.9+. It also makes nonblocking synchronization configurable through a preference.
1 parent ce3acf1 commit 4cd4d14

File tree

11 files changed

+315
-133
lines changed

11 files changed

+315
-133
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,4 @@
66
lcov.info
77
build/
88
/lib/**/Manifest.toml
9-
/LocalPreferences.toml
109
/lib/**/LocalPreferences.toml

LocalPreferences.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[CUDA_Runtime_jll]
2+
# which CUDA runtime to use; can be set to a supported
3+
# version number or to "local" for a local installation.
4+
#version = "11.8"
5+
6+
[CUDA]
7+
# whether to use a nonblocking synchronization mechanism,
8+
# making it possible to use cooperative multitasking.
9+
#nonblocking_synchronization = true

lib/cudadrv/CUDAdrv.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,6 @@ include("graph.jl")
2929

3030
# global state (CUDA.jl's driver wrappers behave like CUDA's runtime library)
3131
include("state.jl")
32+
33+
# support for concurrent programming
34+
include("synchronization.jl")

lib/cudadrv/context.jl

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ associated with the current task.
299299
function synchronize(ctx::CuContext)
300300
push!(CuContext, ctx)
301301
try
302-
nonblocking_synchronize()
302+
device_synchronize()
303303
finally
304304
pop!(CuContext)
305305
end
@@ -316,21 +316,9 @@ associated with the current task.
316316
On the device, `device_synchronize` acts as a synchronization point for child grids in the
317317
context of dynamic parallelism.
318318
"""
319-
device_synchronize() = nonblocking_synchronize()
319+
device_synchronize()
320320
# XXX: can we put the device docstring in dynamic_parallelism.jl?
321321

322-
@inline function nonblocking_synchronize()
323-
# perform as much of the sync as possible without blocking in CUDA.
324-
# XXX: remove this using a yield callback, or by synchronizing on a dedicated thread?
325-
nonblocking_synchronize(legacy_stream())
326-
327-
# even though the GPU should be idle now, CUDA hooks work to the actual API call.
328-
# see NVIDIA bug #3383169 for more details.
329-
cuCtxSynchronize()
330-
331-
check_exceptions()
332-
end
333-
334322

335323
## cache config
336324

lib/cudadrv/events.jl

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -49,36 +49,7 @@ record(e::CuEvent, stream::CuStream=stream()) =
4949
5050
Waits for an event to complete.
5151
"""
52-
function synchronize(e::CuEvent)
53-
# perform as much of the sync as possible without blocking in CUDA.
54-
# XXX: remove this using a yield callback, or by synchronizing on a dedicated thread?
55-
nonblocking_synchronize(e)
56-
57-
# even though the GPU should be idle now, CUDA hooks work to the actual API call.
58-
# see NVIDIA bug #3383169 for more details.
59-
cuEventSynchronize(e)
60-
end
61-
62-
@inline function nonblocking_synchronize(e::CuEvent)
63-
# fast path
64-
isdone(e) && return
65-
66-
# spin (initially without yielding to minimize latency)
67-
spins = 0
68-
while spins < 256
69-
if spins < 32
70-
ccall(:jl_cpu_pause, Cvoid, ())
71-
# Temporary solution before we have gc transition support in codegen.
72-
ccall(:jl_gc_safepoint, Cvoid, ())
73-
else
74-
yield()
75-
end
76-
isdone(e) && return
77-
spins += 1
78-
end
79-
80-
return
81-
end
52+
synchronize(e::CuEvent)
8253

8354
"""
8455
isdone(e::CuEvent)

lib/cudadrv/state.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -444,7 +444,7 @@ an array or a dictionary, use additional locks.
444444
"""
445445
struct PerDevice{T}
446446
lock::ReentrantLock
447-
values::LazyInitialized{Vector{Union{Nothing,Tuple{CuContext,T}}}}
447+
values::LazyInitialized{Vector{Union{Nothing,Tuple{CuContext,T}}},Nothing}
448448
end
449449

450450
function PerDevice{T}() where {T}

lib/cudadrv/stream.jl

Lines changed: 1 addition & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -120,75 +120,7 @@ associated with the current Julia task.
120120
121121
See also: [`device_synchronize`](@ref)
122122
"""
123-
function synchronize(stream::CuStream=stream(); blocking=nothing)
124-
if blocking !== nothing
125-
Base.depwarn("the blocking keyword to synchronize() has been deprecated", :synchronize)
126-
end
127-
128-
# perform as much of the sync as possible without blocking in CUDA.
129-
# XXX: remove this using a yield callback, or by synchronizing on a dedicated stream?
130-
nonblocking_synchronize(stream)
131-
132-
# even though the GPU should be idle now, CUDA hooks work to the actual API call.
133-
# see NVIDIA bug #3383169 for more details.
134-
cuStreamSynchronize(stream)
135-
136-
check_exceptions()
137-
end
138-
139-
@inline function nonblocking_synchronize(stream::CuStream)
140-
# fast path
141-
isdone(stream) && return
142-
143-
# minimize latency of short operations by busy-waiting,
144-
# initially without even yielding to other tasks
145-
spins = 0
146-
while spins < 256
147-
if spins < 32
148-
ccall(:jl_cpu_pause, Cvoid, ())
149-
# Temporary solution before we have gc transition support in codegen.
150-
ccall(:jl_gc_safepoint, Cvoid, ())
151-
else
152-
yield()
153-
end
154-
isdone(stream) && return
155-
spins += 1
156-
end
157-
158-
# minimize CPU usage of long-running kernels by waiting for an event signalled by CUDA
159-
event = Base.Event()
160-
launch(; stream) do
161-
notify(event)
162-
end
163-
# if an error occurs, the callback may never fire, so use a timer to detect such cases
164-
dev = device()
165-
timer = Timer(0; interval=1)
166-
Base.@sync begin
167-
Threads.@spawn try
168-
device!(dev)
169-
while true
170-
try
171-
Base.wait(timer)
172-
catch err
173-
err isa EOFError && break
174-
rethrow()
175-
end
176-
if unsafe_cuStreamQuery(stream) != ERROR_NOT_READY
177-
break
178-
end
179-
end
180-
finally
181-
notify(event)
182-
end
183-
184-
Threads.@spawn begin
185-
Base.wait(event)
186-
close(timer)
187-
end
188-
end
189-
190-
return
191-
end
123+
synchronize(stream::CuStream=stream())
192124

193125
"""
194126
priority_range()

0 commit comments

Comments
 (0)