Skip to content

Commit d95ba8e

Browse files
authored
Re-introduce the 'blocking' kwarg to at-sync. (#2060)
This can be used to force blocking, but low-latency, synchronization, e.g., when benchmarking code that uses a single task.
1 parent 0cb5659 commit d95ba8e

File tree

8 files changed

+55
-28
lines changed

8 files changed

+55
-28
lines changed

lib/cudadrv/synchronization.jl

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ Base.unlock(c::BidirectionalChannel) = unlock(c.cond_take)
7474
# the synchronization, when it returns true (indicating that the object is synchronized)
7575
# the actual synchronization API should be called again.
7676

77-
function fast_synchronization(f, obj)
77+
function spinning_synchronization(f, obj)
7878
# fast path
7979
f(obj) && return true
8080

@@ -164,9 +164,9 @@ function nonblocking_synchronize(val)
164164
return
165165
end
166166

167-
function device_synchronize()
168-
if use_nonblocking_synchronization
169-
if fast_synchronization(isdone, legacy_stream())
167+
function device_synchronize(; blocking::Bool=false, spin::Bool=true)
168+
if use_nonblocking_synchronization && !blocking
169+
if spin && spinning_synchronization(isdone, legacy_stream())
170170
cuCtxSynchronize()
171171
else
172172
nonblocking_synchronize(context())
@@ -178,9 +178,9 @@ function device_synchronize()
178178
check_exceptions()
179179
end
180180

181-
function synchronize(stream::CuStream=stream())
182-
if use_nonblocking_synchronization
183-
if fast_synchronization(isdone, stream)
181+
function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true)
182+
if use_nonblocking_synchronization && !blocking
183+
if spin && spinning_synchronization(isdone, stream)
184184
cuStreamSynchronize(stream)
185185
else
186186
nonblocking_synchronize(stream)
@@ -192,9 +192,9 @@ function synchronize(stream::CuStream=stream())
192192
check_exceptions()
193193
end
194194

195-
function synchronize(event::CuEvent)
196-
if use_nonblocking_synchronization
197-
if fast_synchronization(isdone, event)
195+
function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
196+
if use_nonblocking_synchronization && !blocking
197+
if spin && spinning_synchronization(isdone, event)
198198
cuEventSynchronize(event)
199199
else
200200
nonblocking_synchronize(event)
@@ -249,10 +249,10 @@ function nonblocking_synchronize(stream::CuStream)
249249
return
250250
end
251251

252-
function device_synchronize()
253-
if use_nonblocking_synchronization
252+
function device_synchronize(; blocking::Bool=false, spin::Bool=true)
253+
if use_nonblocking_synchronization && !blocking
254254
stream = legacy_stream()
255-
if !fast_synchronization(isdone, stream)
255+
if !spin || !spinning_synchronization(isdone, stream)
256256
nonblocking_synchronize(stream)
257257
end
258258
end
@@ -261,9 +261,9 @@ function device_synchronize()
261261
check_exceptions()
262262
end
263263

264-
function synchronize(stream::CuStream=stream())
265-
if use_nonblocking_synchronization
266-
if !fast_synchronization(isdone, stream)
264+
function synchronize(stream::CuStream=stream(); blocking::Bool=false, spin::Bool=true)
265+
if use_nonblocking_synchronization && !blocking
266+
if !spin || !spinning_synchronization(isdone, stream)
267267
nonblocking_synchronize(stream)
268268
end
269269
end
@@ -272,9 +272,9 @@ function synchronize(stream::CuStream=stream())
272272
check_exceptions()
273273
end
274274

275-
function synchronize(event::CuEvent)
276-
if use_nonblocking_synchronization
277-
fast_synchronization(isdone, event)
275+
function synchronize(event::CuEvent; blocking::Bool=false, spin::Bool=true)
276+
if use_nonblocking_synchronization && !blocking
277+
spin && spinning_synchronization(isdone, event)
278278
end
279279
cuEventSynchronize(event)
280280
end

perf/byval.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,11 @@ function main()
5959
y1 = [similar(x1[1]) for i = 1:num_z_slices]
6060

6161
# reference down to bones add on GPU
62-
results["reference"] = @benchmark CUDA.@sync add!($y1[1], $x1[1], $x2[1])
62+
results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
6363

6464
# adding arrays in an array
6565
for slices = 1:num_z_slices
66-
results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
66+
results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
6767
end
6868

6969
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them

perf/cuda.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
group = addgroup!(SUITE, "cuda")
2+
3+
let group = addgroup!(group, "synchronization")
4+
let group = addgroup!(group, "stream")
5+
group["blocking"] = @benchmarkable synchronize(blocking=true)
6+
group["auto"] = @benchmarkable synchronize()
7+
group["nonblocking"] = @benchmarkable synchronize(spin=false)
8+
end
9+
let group = addgroup!(group, "context")
10+
group["blocking"] = @benchmarkable device_synchronize(blocking=true)
11+
group["auto"] = @benchmarkable device_synchronize()
12+
group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
13+
end
14+
end

perf/cudadevrt.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ function main()
2626
x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
2727
y1 = similar(x1)
2828

29-
results = @benchmark CUDA.@sync add!($y1, $x1, $x2)
29+
results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)
3030

3131
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
3232
CUDA.unsafe_free!(x1)

perf/runbenchmarks.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ end
1717
# convenience macro to create a benchmark that requires synchronizing the GPU
1818
macro async_benchmarkable(ex...)
1919
quote
20-
@benchmarkable CUDA.@sync $(ex...)
20+
@benchmarkable CUDA.@sync blocking=true $(ex...)
2121
end
2222
end
2323

@@ -30,6 +30,7 @@ SUITE = BenchmarkGroup()
3030

3131
# NOTE: don't use spaces in benchmark names (tobami/codespeed#256)
3232

33+
include("cuda.jl")
3334
include("kernel.jl")
3435
include("array.jl")
3536

perf/volumerhs.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,8 @@ function main()
255255
$(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory,
256256
$(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory"""
257257
results = @benchmark begin
258-
CUDA.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
259-
threads=$threads, blocks=$nelem)
258+
CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
259+
threads=$threads, blocks=$nelem)
260260
end
261261

262262
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them

src/utilities.jl

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
11
"""
2-
@sync ex
2+
@sync [blocking=false] ex
33
44
Run expression `ex` and synchronize the GPU afterwards.
55
6+
The `blocking` keyword argument determines how synchronization is performed. By default,
7+
non-blocking synchronization will be used, which gives other Julia tasks a chance to run
8+
while waiting for the GPU to finish. This may increase latency, so for short operations,
9+
or when benchmarking code that does not use multiple tasks, it may be beneficial to use
10+
blocking synchronization instead by setting `blocking=true`. Blocking synchronization
11+
can also be enabled globally by changing the `nonblocking_synchronization` preference.
12+
613
See also: [`synchronize`](@ref).
714
"""
815
macro sync(ex...)
@@ -11,19 +18,22 @@ macro sync(ex...)
1118
kwargs = ex[1:end-1]
1219

1320
# decode keyword arguments
21+
blocking = false
1422
for kwarg in kwargs
1523
Meta.isexpr(kwarg, :(=)) || error("Invalid keyword argument $kwarg")
1624
key, val = kwarg.args
1725
if key == :blocking
18-
Base.depwarn("the blocking keyword to @sync has been deprecated", :sync)
26+
isa(val, Bool) ||
27+
error("Invalid value for keyword argument $kwarg; expected Bool, got $(val)")
28+
blocking = val
1929
else
2030
error("Unknown keyword argument $kwarg")
2131
end
2232
end
2333

2434
quote
2535
local ret = $(esc(code))
26-
synchronize()
36+
synchronize(; blocking=$blocking)
2737
ret
2838
end
2939
end

test/core/utils.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ end
2929
end
3030
@test t >= 0
3131
@test ret == 42
32+
33+
CUDA.@sync blocking=true identity(nothing)
3234
end
3335

3436
@testset "versioninfo" begin

0 commit comments

Comments
 (0)