Skip to content

Commit a137cf4

Browse files
authored
Use Timer to avoid hangs due to error on the device (#291)
1 parent f5ceb33 commit a137cf4

File tree

2 files changed

+43
-4
lines changed

2 files changed

+43
-4
lines changed

lib/CUDAKernels/src/CUDAKernels.jl

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,52 @@ wait(ev::CudaEvent, progress=yield) = wait(CPU(), ev, progress)
# Block the calling CPU task until the CUDA event `ev` is done.
#
# `progress` is accepted for interface compatibility but is not referenced
# anywhere in the body shown here.
#
# Strategy, in order:
#   1. return immediately if `isdone(ev)`;
#   2. re-check `isdone(ev)` up to 256 times (busy-wait, then `yield()`);
#   3. otherwise block on a host-side `Base.Event` that is notified either by
#      the `CUDA.launch` callback or by a timer-driven poll of the event status.
# Steps 1 and 2 return `nothing`.
#
# NOTE(review): this text is a rendered diff. Gutter lines ending in `-` appear
# to precede lines removed by the commit and `+` lines added by it -- confirm
# against the real source file before relying on the exact statement list.
7676
function wait(::CPU, ev::CudaEvent, progress=nothing)
7777
isdone(ev) && return nothing
7878

79-
event = Base.Threads.Event()
79+
# minimize latency of short operations by busy-waiting,
80+
# initially without even yielding to other tasks
81+
spins = 0
82+
while spins < 256
# first 32 iterations: pause the CPU and hit a GC safepoint, no task switch
83+
if spins < 32
84+
ccall(:jl_cpu_pause, Cvoid, ())
85+
# Temporary solution before we have gc transition support in codegen.
86+
ccall(:jl_gc_safepoint, Cvoid, ())
# iterations 32..255: let other tasks run between checks
87+
else
88+
yield()
89+
end
90+
isdone(ev) && return
91+
spins += 1
92+
end
# still not done after spinning: fall back to blocking on a host-side event
93+
94+
event = Base.Event()
8095
stream = next_stream()
# presumably makes `stream` wait on `ev` so the launch below runs after it --
# TODO confirm against the CUDADevice `wait` method (not visible here)
8196
wait(CUDADevice(), ev, nothing, stream)
# the do-block is the callback mentioned below; it notifies `event`
8297
CUDA.launch(;stream) do
8398
notify(event)
8499
end
85-
wait(event)
# remember the current device so the spawned task can select it via `CUDA.device!(dev)`
100+
dev = CUDA.device()
101+
# if an error occurs, the callback may never fire, so use a timer to detect such cases
# zero initial delay, then repeats with a 1 second interval
102+
timer = Timer(0; interval=1)
103+
Base.@sync begin
# watchdog task: on each timer tick, query the event status directly
104+
Threads.@spawn try
105+
CUDA.device!(dev)
106+
while true
107+
try
108+
Base.wait(timer)
# EOFError is treated as "timer was closed" (see `close(timer)` in the second
# task) and ends the loop; any other exception propagates
109+
catch err
110+
err isa EOFError && break
111+
rethrow()
112+
end
# stop polling on any status other than ERROR_NOT_READY
# (presumably success or an error code -- confirm against the CUDA driver API)
113+
if CUDA.unsafe_cuEventQuery(ev.event) != CUDA.ERROR_NOT_READY
114+
break
115+
end
116+
end
# always notify, so the task waiting on `event` below is released even if
# this task exits through an exception
117+
finally
118+
notify(event)
119+
end
# second task: once `event` is notified (by the callback or by the watchdog),
# close the timer so the watchdog's `Base.wait(timer)` stops blocking
120+
Threads.@spawn begin
121+
Base.wait(event)
122+
close(timer)
123+
end
124+
end
86125
end
87126

88127
# Use this to synchronize between computation using the task local stream
@@ -226,7 +265,7 @@ end
226265

227266
import CUDA: @device_override
228267

229-
import KernelAbstractions: CompilerMetadata, CompilerPass, DynamicCheck, LinearIndices
268+
import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices
230269
import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
231270
import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
232271

lib/ROCKernels/src/ROCKernels.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ end
211211

212212
import AMDGPU: @device_override
213213

214-
import KernelAbstractions: CompilerMetadata, CompilerPass, DynamicCheck, LinearIndices
214+
import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices
215215
import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
216216
import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
217217

0 commit comments

Comments
 (0)