Skip to content

Commit ac1bc29

Browse files
authored
Busy-wait before doing nonblocking synchronization. (#2059)
This should improve the latency of short operations. [skip julia] [skip cuda]
1 parent faff26c commit ac1bc29

File tree

1 file changed

+60
-46
lines changed

1 file changed

+60
-46
lines changed

lib/cudadrv/synchronization.jl

Lines changed: 60 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,46 @@ Base.lock(c::BidirectionalChannel) = lock(c.cond_take)
6464
# unlocking the channel forwards to `unlock` on its `cond_take` field
Base.unlock(c::BidirectionalChannel) = unlock(c.cond_take)
6565

6666

67+
#
68+
# fast-path synchronization
69+
#
70+
71+
# before using a nonblocking mechanism, which has some overhead, use a busy-loop
72+
# that queries the state of the object to synchronize. this reduces latency,
73+
# especially for short operations. note that because it does not actually perform
74+
# the synchronization, when it returns true (indicating that the object is synchronized)
75+
# the actual synchronization API should still be called.
76+
77+
"""
    fast_synchronization(f, obj; max_spins=256, busy_spins=32) -> Bool

Busy-wait for `obj` by repeatedly polling the predicate `f(obj)`.

Return `true` as soon as `f(obj)` returns `true`. Return `false` if it still returns
`false` after the initial query plus `max_spins` further queries.

During the first `busy_spins` iterations the loop only pauses the CPU and runs a GC
safepoint; in the remaining iterations it calls `yield()` so other tasks can run.

This function only queries state through `f` and does not call any synchronization API
itself, so callers still have to do that when it returns `true`.

# Keywords
- `max_spins`: maximum number of polling iterations after the initial query.
- `busy_spins`: number of leading iterations that spin without yielding.
"""
function fast_synchronization(f, obj; max_spins::Integer=256, busy_spins::Integer=32)
    # fast path: nothing to wait for
    f(obj) && return true

    # minimize latency of short operations by busy-waiting,
    # initially without even yielding to other tasks
    for spin in 0:(max_spins - 1)
        if spin < busy_spins
            ccall(:jl_cpu_pause, Cvoid, ())
            # temporary solution before we have gc transition support in codegen.
            ccall(:jl_gc_safepoint, Cvoid, ())
        else
            yield()
        end
        f(obj) && return true
    end

    # still not done; the caller should fall back to a slower mechanism
    return false
end
98+
99+
67100
#
68101
# nonblocking sync
69102
#
70103

71104
@static if VERSION >= v"1.9.2"
72105

73-
# if we support foreign threads, perform the synchronization on a separate thread.
106+
# if we support foreign threads, perform the actual synchronization on a separate thread.
74107

75108
const MAX_SYNC_THREADS = 4
76109
const sync_channels = Array{BidirectionalChannel{Any}}(undef, MAX_SYNC_THREADS)
@@ -133,29 +166,37 @@ end
133166

134167
function device_synchronize()
    # call the blocking API directly when nonblocking synchronization is disabled, or
    # when busy-waiting on the legacy stream already reported completion; otherwise
    # hand the context over to the nonblocking mechanism.
    if !use_nonblocking_synchronization || fast_synchronization(isdone, legacy_stream())
        cuCtxSynchronize()
    else
        nonblocking_synchronize(context())
    end

    return check_exceptions()
end
142180

143181
function synchronize(stream::CuStream=stream())
    # call the blocking API directly when nonblocking synchronization is disabled, or
    # when busy-waiting already reported the stream as done; otherwise use the
    # nonblocking mechanism for the stream.
    if !use_nonblocking_synchronization || fast_synchronization(isdone, stream)
        cuStreamSynchronize(stream)
    else
        nonblocking_synchronize(stream)
    end

    return check_exceptions()
end
154194

155195
function synchronize(event::CuEvent)
156196
if use_nonblocking_synchronization
157-
if !isdone(event)
158-
# slow path
197+
if fast_synchronization(isdone, event)
198+
cuEventSynchronize(event)
199+
else
159200
nonblocking_synchronize(event)
160201
end
161202
else
@@ -171,32 +212,16 @@ else
171212
# requiring us to perform the actual API call again after nonblocking synchronization.
172213

173214
function nonblocking_synchronize(stream::CuStream)
174-
# fast path
175-
isdone(stream) && return
176-
177-
# minimize latency of short operations by busy-waiting,
178-
# initially without even yielding to other tasks
179-
spins = 0
180-
while spins < 256
181-
if spins < 32
182-
ccall(:jl_cpu_pause, Cvoid, ())
183-
# Temporary solution before we have gc transition support in codegen.
184-
ccall(:jl_gc_safepoint, Cvoid, ())
185-
else
186-
yield()
187-
end
188-
isdone(stream) && return
189-
spins += 1
190-
end
191-
192-
# minimize CPU usage of long-running kernels by waiting for an event signalled by CUDA
215+
# wait for an event signalled by CUDA
193216
event = Base.Event()
194217
launch(; stream) do
195218
notify(event)
196219
end
220+
197221
# if an error occurs, the callback may never fire, so use a timer to detect such cases
198222
dev = device()
199223
timer = Timer(0; interval=1)
224+
200225
Base.@sync begin
201226
Threads.@spawn try
202227
device!(dev)
@@ -226,7 +251,10 @@ end
226251

227252
function device_synchronize()
228253
if use_nonblocking_synchronization
229-
nonblocking_synchronize(legacy_stream())
254+
stream = legacy_stream()
255+
if !fast_synchronization(isdone, stream)
256+
nonblocking_synchronize(stream)
257+
end
230258
end
231259
cuCtxSynchronize()
232260

@@ -235,34 +263,20 @@ end
235263

236264
function synchronize(stream::CuStream=stream())
    # only fall back to the nonblocking wait when it is enabled and busy-waiting
    # did not already observe the stream as done
    if use_nonblocking_synchronization && !fast_synchronization(isdone, stream)
        nonblocking_synchronize(stream)
    end

    # the actual API call is always performed in this code path
    cuStreamSynchronize(stream)

    return check_exceptions()
end
244274

245-
function synchronize(e::CuEvent)
275+
function synchronize(event::CuEvent)
    # optionally busy-wait first; the outcome is deliberately ignored because the
    # actual API call below is performed either way
    use_nonblocking_synchronization && fast_synchronization(isdone, event)

    return cuEventSynchronize(event)
end
267281

268282
end

0 commit comments

Comments
 (0)