@@ -76,13 +76,52 @@ wait(ev::CudaEvent, progress=yield) = wait(CPU(), ev, progress)
76
76
"""
    wait(::CPU, ev::CudaEvent, progress=nothing)

Block the calling task until `isdone(ev)` holds, then return `nothing`.

The wait proceeds in three stages:

1. Return immediately if `ev` is already done.
2. Busy-wait for a bounded number of iterations, first pausing the CPU and
   then yielding to other tasks, re-checking `ev` after every iteration.
3. Make a stream obtained from `next_stream()` wait on `ev`, enqueue a host
   callback on that stream that notifies a `Base.Event`, and block on that
   event. A one-second periodic timer polls the CUDA event in parallel so the
   wait also ends if the callback never fires.

`progress` is accepted for signature compatibility with the other `wait`
methods; this method does not use it.
"""
function wait(::CPU, ev::CudaEvent, progress=nothing)
    isdone(ev) && return nothing

    # minimize latency of short operations by busy-waiting,
    # initially without even yielding to other tasks
    spins = 0
    while spins < 256
        if spins < 32
            ccall(:jl_cpu_pause, Cvoid, ())
            # Temporary solution before we have gc transition support in codegen.
            ccall(:jl_gc_safepoint, Cvoid, ())
        else
            yield()
        end
        isdone(ev) && return nothing
        spins += 1
    end

    # slow path: have a stream wait on `ev` and signal a host-side event
    # from a callback enqueued behind that wait
    event = Base.Event()
    stream = next_stream()
    wait(CUDADevice(), ev, nothing, stream)
    CUDA.launch(; stream) do
        notify(event)
    end

    dev = CUDA.device()
    # if an error occurs, the callback may never fire, so use a timer to detect such cases
    timer = Timer(0; interval=1)
    Base.@sync begin
        # poller: wakes on every timer tick and stops once the CUDA event is
        # no longer reported as not-ready, or once the timer has been closed
        Threads.@spawn try
            CUDA.device!(dev)
            while true
                try
                    Base.wait(timer)
                catch err
                    # waiting on a closed timer throws EOFError: the other
                    # task has already observed `event` and shut the timer down
                    err isa EOFError && break
                    rethrow()
                end
                if CUDA.unsafe_cuEventQuery(ev.event) != CUDA.ERROR_NOT_READY
                    break
                end
            end
        finally
            # always release the waiter below, even if polling failed
            notify(event)
        end

        # waiter: once `event` is signalled (by the callback or by the
        # poller), close the timer so the poller terminates as well
        Threads.@spawn begin
            Base.wait(event)
            close(timer)
        end
    end

    # `@sync` evaluates to the value of its block, i.e. the last spawned Task;
    # return `nothing` explicitly so every path yields the same result.
    return nothing
end
87
126
88
127
# Use this to synchronize between computation using the task local stream
226
265
227
266
import CUDA: @device_override
228
267
229
- import KernelAbstractions: CompilerMetadata, CompilerPass, DynamicCheck, LinearIndices
268
+ import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices
230
269
import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
231
270
import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
232
271
0 commit comments