@@ -64,13 +64,46 @@ Base.lock(c::BidirectionalChannel) = lock(c.cond_take)
64
64
# Unlock the channel by unlocking its `cond_take` field.
function Base.unlock(c::BidirectionalChannel)
    return unlock(c.cond_take)
end
65
65
66
66
67
+ #
68
+ # fast-path synchronization
69
+ #
70
+
71
+ # before using a nonblocking mechanism, which has some overhead, use a busy-loop
72
+ # that queries the state of the object to synchronize. this reduces latency,
73
+ # especially for short operations. note that because it does not actually perform
74
+ # the synchronization, when it returns true (indicating that the object is synchronized)
75
+ # the actual synchronization API should still be called.
76
+
77
# Poll `f(obj)` for a bounded number of attempts, returning `true` as soon as it does and
# `false` when every attempt failed. Nothing but `f` is invoked on `obj`, so callers are
# still expected to perform the real synchronization afterwards.
function fast_synchronization(f, obj)
    # already done: no need to wait at all
    if f(obj)
        return true
    end

    # busy-wait to keep the latency of short operations low
    for attempt in 0:255
        if attempt >= 32
            # later attempts give other tasks a chance to run
            yield()
        else
            # the first attempts only pause the CPU, without yielding
            ccall(:jl_cpu_pause, Cvoid, ())
            # temporary solution before we have gc transition support in codegen.
            ccall(:jl_gc_safepoint, Cvoid, ())
        end

        if f(obj)
            return true
        end
    end

    return false
end
98
+
99
+
67
100
#
68
101
# nonblocking sync
69
102
#
70
103
71
104
@static if VERSION >= v " 1.9.2"
72
105
73
- # if we support foreign threads, perform the synchronization on a separate thread.
106
+ # if we support foreign threads, perform the actual synchronization on a separate thread.
74
107
75
108
const MAX_SYNC_THREADS = 4
76
109
const sync_channels = Array {BidirectionalChannel{Any}} (undef, MAX_SYNC_THREADS)
@@ -133,29 +166,37 @@ end
133
166
134
167
# Synchronize via `cuCtxSynchronize`, or via `nonblocking_synchronize(context())` when
# nonblocking synchronization is enabled and polling the legacy stream did not succeed.
function device_synchronize()
    # the polling only happens when nonblocking synchronization is enabled
    blocking = !use_nonblocking_synchronization ||
               fast_synchronization(isdone, legacy_stream())

    if blocking
        cuCtxSynchronize()
    else
        nonblocking_synchronize(context())
    end

    return check_exceptions()
end
142
180
143
181
# Synchronize `stream` via `cuStreamSynchronize`, or via `nonblocking_synchronize` when
# nonblocking synchronization is enabled and polling `isdone(stream)` did not succeed.
function synchronize(stream::CuStream=stream())
    # the polling only happens when nonblocking synchronization is enabled
    blocking = !use_nonblocking_synchronization ||
               fast_synchronization(isdone, stream)

    if blocking
        cuStreamSynchronize(stream)
    else
        nonblocking_synchronize(stream)
    end

    return check_exceptions()
end
154
194
155
195
function synchronize (event:: CuEvent )
156
196
if use_nonblocking_synchronization
157
- if ! isdone (event)
158
- # slow path
197
+ if fast_synchronization (isdone, event)
198
+ cuEventSynchronize (event)
199
+ else
159
200
nonblocking_synchronize (event)
160
201
end
161
202
else
@@ -171,32 +212,16 @@ else
171
212
# requiring us to perform the actual API call again after nonblocking synchronization.
172
213
173
214
function nonblocking_synchronize (stream:: CuStream )
174
- # fast path
175
- isdone (stream) && return
176
-
177
- # minimize latency of short operations by busy-waiting,
178
- # initially without even yielding to other tasks
179
- spins = 0
180
- while spins < 256
181
- if spins < 32
182
- ccall (:jl_cpu_pause , Cvoid, ())
183
- # Temporary solution before we have gc transition support in codegen.
184
- ccall (:jl_gc_safepoint , Cvoid, ())
185
- else
186
- yield ()
187
- end
188
- isdone (stream) && return
189
- spins += 1
190
- end
191
-
192
- # minimize CPU usage of long-running kernels by waiting for an event signalled by CUDA
215
+ # wait for an event signalled by CUDA
193
216
event = Base. Event ()
194
217
launch (; stream) do
195
218
notify (event)
196
219
end
220
+
197
221
# if an error occurs, the callback may never fire, so use a timer to detect such cases
198
222
dev = device ()
199
223
timer = Timer (0 ; interval= 1 )
224
+
200
225
Base. @sync begin
201
226
Threads. @spawn try
202
227
device! (dev)
226
251
227
252
function device_synchronize ()
228
253
if use_nonblocking_synchronization
229
- nonblocking_synchronize (legacy_stream ())
254
+ stream = legacy_stream ()
255
+ if ! fast_synchronization (isdone, stream)
256
+ nonblocking_synchronize (stream)
257
+ end
230
258
end
231
259
cuCtxSynchronize ()
232
260
@@ -235,34 +263,20 @@ end
235
263
236
264
# Synchronize `stream`: when nonblocking synchronization is enabled, first poll
# `isdone(stream)` and fall back to `nonblocking_synchronize` if that did not succeed;
# `cuStreamSynchronize` is always called afterwards.
function synchronize(stream::CuStream=stream())
    if use_nonblocking_synchronization && !fast_synchronization(isdone, stream)
        nonblocking_synchronize(stream)
    end

    # performed regardless of which path was taken above
    cuStreamSynchronize(stream)

    return check_exceptions()
end
244
274
245
# Synchronize `event`: when nonblocking synchronization is enabled, first poll
# `isdone(event)`; `cuEventSynchronize` is always called afterwards.
function synchronize(event::CuEvent)
    # the outcome of the polling is not needed, as the API call below happens either way
    use_nonblocking_synchronization && fast_synchronization(isdone, event)

    return cuEventSynchronize(event)
end
267
281
268
282
end
0 commit comments