Commit 95fd5b2

Merge pull request #1905 from CliMA/ck/benchmark_script
Add index-swapping benchmark script
2 parents 3611a0f + 55f9e63

1 file changed: +213 −0 lines changed

benchmarks/scripts/index_swapping.jl

Lines changed: 213 additions & 0 deletions
@@ -0,0 +1,213 @@
#=
julia --project=.buildkite
julia -g2 --check-bounds=yes --project=.buildkite
using Revise; include(joinpath("benchmarks", "scripts", "index_swapping.jl"))

# Info:
This script compares the performance of our universal index support
(for mixed DataLayout operations) against specialized index support
for uniform DataLayout operations.

In particular,
 - `at_dot_call!` is a reference for the "speed of light" we could
   achieve on the hardware, as memory coalescence comes for free on
   vectors (as opposed to arrays).
 - `custom_kernel_bc!(; swap = 0)` mimics our specialized operations
 - `custom_kernel_bc!(; swap = 1)` mimics our generalized pointwise operations
 - `custom_kernel_bc!(; swap = 2)` mimics our generalized stencil operations

# Benchmark results:

Clima A100
```
at_dot_call!($X_vector, $Y_vector):
    6 milliseconds, 19 microseconds
custom_kernel_bc!($X_array, $Y_array, $uss, swap = 0):
    6 milliseconds, 329 microseconds
custom_kernel_bc!($X_array, $Y_array, $uss, swap = 1):
    14 milliseconds, 232 microseconds
custom_kernel_bc!($X_array, $Y_array, $uss, swap = 2):
    15 milliseconds, 960 microseconds
```
=#
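
# --- Editor's illustrative sketch (hypothetical helper; not part of the
# benchmark and not called by it). It shows, on the CPU, the index swap that
# `swap = 1` performs on the GPU: the linear thread index is decoded in the
# universal (i, j, f, v, h) order, then swapped into the VIJFH memory order
# (v, i, j, f, h) before being used to read/write.
function demo_index_swap(Nv, Nij, Nh, tidx)
    n = (Nij, Nij, 1, Nv, Nh)            # universal (IJFVH) decode order
    GCI = CartesianIndices(map(Base.OneTo, n))[tidx]
    (i, j, _, v, h) = GCI.I
    return CartesianIndex(v, i, j, 1, h) # VIJFH memory order
end
# e.g. demo_index_swap(63, 4, 5400, 2) == CartesianIndex(1, 2, 1, 1, 1)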

#! format: off
import CUDA
using BenchmarkTools, Dates
using LazyBroadcast: @lazy
ArrayType = CUDA.CuArray;
# ArrayType = identity;

if ArrayType === identity
    macro pretty_belapsed(expr)
        return quote
            println($(string(expr)), ":")
            print(" ")
            print_time_and_units(BenchmarkTools.@belapsed(esc($expr)))
        end
    end
else
    macro pretty_belapsed(expr)
        return quote
            println($(string(expr)), ":")
            print(" ")
            print_time_and_units(
                BenchmarkTools.@belapsed(CUDA.@sync((esc($expr))))
            )
        end
    end
    macro pretty_elapsed(expr)
        return quote
            println($(string(expr)), ":")
            print(" ")
            print_time_and_units(
                BenchmarkTools.@elapsed(CUDA.@sync((esc($expr))))
            )
        end
    end
end
print_time_and_units(x) = println(time_and_units_str(x))
time_and_units_str(x::Real) =
    trunc_time(string(compound_period(x, Dates.Second)))
function compound_period(x::Real, ::Type{T}) where {T <: Dates.Period}
    nf = Dates.value(convert(Dates.Nanosecond, T(1)))
    ns = Dates.Nanosecond(ceil(x * nf))
    return Dates.canonicalize(Dates.CompoundPeriod(ns))
end
trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s
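# Editor's example of the formatting helpers above (value chosen to match the
# A100 result quoted in the header):
# julia> time_and_units_str(6.019e-3)
# "6 milliseconds, 19 microseconds"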
foo(x1, x2, x3) = x1
function at_dot_call!(X, Y)
    (; x1, x2, x3) = X
    (; y1) = Y
    for i in 1:100 # reduce variance / impact of launch latency
        @. y1 = foo(x1, x2, x3) # 3 reads, 1 write
    end
    return nothing
end;

struct UniversalSizesStatic{Nv, Nij, Nh} end

get_Nv(::UniversalSizesStatic{Nv}) where {Nv} = Nv
get_Nij(::UniversalSizesStatic{Nv, Nij}) where {Nv, Nij} = Nij
get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh
get_N(us::UniversalSizesStatic{Nv, Nij}) where {Nv, Nij} = prod((Nv, Nij, Nij, 1, get_Nh(us)))
UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}()
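# Editor's note: the sizes are type parameters, so `get_N` and friends fold to
# compile-time constants inside the kernels. For example (hypothetical REPL):
# julia> get_N(UniversalSizesStatic(63, 4, 5400))
# 5443200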
using Test

function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap = 0, printtb = false)
    (; x1, x2, x3) = X
    (; y1) = Y
    bc = @lazy @. y1 = foo(x1, x2, x3)
    @assert !(y1 isa Array)
    f = if swap == 0
        custom_kernel_knl_bc_no_swap!
    elseif swap == 1
        custom_kernel_knl_bc_swap!
    elseif swap == 2
        custom_kernel_knl_bc_2swap!
    else
        error("oops")
    end
    kernel =
        CUDA.@cuda always_inline = true launch = false f(
            y1,
            bc,
            us,
        )
    N = get_N(us)
    config = CUDA.launch_configuration(kernel.fun)
    threads = min(N, config.threads)
    blocks = cld(N, threads)
    printtb && @show blocks, threads
    for i in 1:100 # reduce variance / impact of launch latency
        kernel(y1, bc, us; threads, blocks)
    end
    return nothing
end;
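# Editor's worked example of the launch configuration above: for the arrays
# built below, N = 63 * 4 * 4 * 1 * 5400 = 5_443_200. If the occupancy API
# were to report config.threads = 1024 (an assumed value; the real one depends
# on the kernel and device), then threads = 1024 and
# blocks = cld(5_443_200, 1024) = 5316.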

# Mimics how indexing works in generalized pointwise kernels
function custom_kernel_knl_bc_swap!(y1, bc, us)
    @inbounds begin
        tidx = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        if tidx ≤ get_N(us)
            n = (get_Nij(us), get_Nij(us), 1, get_Nv(us), get_Nh(us))
            GCI = CartesianIndices(map(x -> Base.OneTo(x), n))[tidx]
            # Perform index swap (as in `getindex(::AbstractData, ::CartesianIndex)`)
            i, j, _, v, h = GCI.I
            CI = CartesianIndex(v, i, j, 1, h)
            y1[CI] = bc[CI]
        end
    end
    return nothing
end

# Mimics how indexing works in specialized kernels
function custom_kernel_knl_bc_no_swap!(y1, bc, us)
    @inbounds begin
        tidx = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        if tidx ≤ get_N(us)
            n = (get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us))
            CI = CartesianIndices(map(x -> Base.OneTo(x), n))[tidx]
            y1[CI] = bc[CI] # requires special broadcasted index support
        end
    end
    return nothing
end
# Mimics how indexing works in generalized stencil kernels
function custom_kernel_knl_bc_2swap!(y1, bc, us)
    @inbounds begin
        tidx = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        if tidx ≤ get_N(us)
            # We start with a VIJFH-specific CartesianIndex
            n = (get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us))
            CIK = CartesianIndices(map(x -> Base.OneTo(x), n))[tidx] # data-specific in kernel

            # Swap in `getidx`
            (v, i, j, _, h) = CIK.I
            GCI = CartesianIndex(i, j, 1, v, h)

            # Swap again (in `getindex(::AbstractData, ::CartesianIndex)`)
            (i, j, _, v, h) = GCI.I
            CI = CartesianIndex(v, i, j, 1, h)
            y1[CI] = bc[CI]
        end
    end
    return nothing
end
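# Editor's note: the two swaps above compose to the identity (VIJFH -> IJFVH
# -> VIJFH), so `swap = 2` touches exactly the same elements as `swap = 0`;
# the measured slowdown is purely the extra index reconstruction.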

import Random
function test_custom_kernel_bc!(X_array, Y_array, uss; swap)
    Random.seed!(1234)
    X_array.x1 .= typeof(X_array.x1)(rand(eltype(X_array.x1), size(X_array.x1)))
    Y_array_cp = deepcopy(Y_array)
    custom_kernel_bc!(X_array, Y_array_cp, uss; swap = 0)
    custom_kernel_bc!(X_array, Y_array, uss; swap)
    @test all(Y_array_cp.y1 .== Y_array.y1)
end

FT = Float32;
arr(T) = T(zeros(FT, 63, 4, 4, 1, 5400)) # use FT so the arrays are Float32
X_array = (; x1 = arr(ArrayType), x2 = arr(ArrayType), x3 = arr(ArrayType));
Y_array = (; y1 = arr(ArrayType),);
to_vec(ξ) = (; zip(propertynames(ξ), map(θ -> vec(θ), values(ξ)))...);
X_vector = to_vec(X_array);
Y_vector = to_vec(Y_array);
N = length(X_vector.x1)
(Nv, Nij, _, _, Nh) = size(Y_array.y1);
uss = UniversalSizesStatic(Nv, Nij, Nh);
at_dot_call!(X_vector, Y_vector)
custom_kernel_bc!(X_array, Y_array, uss; swap = 0)
custom_kernel_bc!(X_array, Y_array, uss; swap = 1)
custom_kernel_bc!(X_array, Y_array, uss; swap = 2)
test_custom_kernel_bc!(X_array, Y_array, uss; swap = 1)
test_custom_kernel_bc!(X_array, Y_array, uss; swap = 2)

@pretty_belapsed at_dot_call!($X_vector, $Y_vector)
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss, swap = 0)
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss, swap = 1)
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss, swap = 2)

#! format: on
