Skip to content

Commit 7ab0acf

Browse files
Merge pull request #1926 from CliMA/ck/offset_benchmark
Add offset benchmark
2 parents c64bb71 + 5b98d9d commit 7ab0acf

File tree

2 files changed

+319
-0
lines changed

2 files changed

+319
-0
lines changed

.buildkite/pipeline.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1242,6 +1242,16 @@ steps:
12421242
agents:
12431243
slurm_gpus: 1
12441244

1245+
# Buildkite step: prints CUDA version info, then runs
# benchmarks/scripts/benchmark_offset.jl with the .buildkite project.
# Sets CLIMACOMMS_DEVICE to "CUDA" and requests an agent with `slurm_gpus: 1`.
- label: "Perf: benchmark scripts benchmark_offset"
1246+
key: benchmark_offset
1247+
command:
1248+
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
1249+
- "julia --color=yes --project=.buildkite benchmarks/scripts/benchmark_offset.jl"
1250+
env:
1251+
CLIMACOMMS_DEVICE: "CUDA"
1252+
agents:
1253+
slurm_gpus: 1
1254+
12451255
- group: "Perf: Operators"
12461256
steps:
12471257

Lines changed: 309 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,309 @@
1+
#=
2+
julia --project=.buildkite
3+
using Revise; include(joinpath("benchmarks", "scripts", "benchmark_offset.jl"))
4+
5+
# Info
6+
7+
- This benchmark demos the performance for different offset styles:
8+
- Array of structs with Cartesian offsets
9+
- Array of structs with Linear offsets
10+
- Struct of arrays with no offsets (indexed linearly and with Cartesian indices)
11+
12+
# Benchmark results:
13+
14+
Clima A100:
15+
```
16+
[ Info: ArrayType = CuArray
17+
Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
18+
┌────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
19+
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
20+
├────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
21+
│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 834 nanoseconds │ 57.7908 │ 1178.35 │ 4 │ 100 │
22+
│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 153 nanoseconds │ 68.4046 │ 1394.77 │ 4 │ 100 │
23+
│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 576 nanoseconds │ 70.3113 │ 1433.65 │ 4 │ 100 │
24+
│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 185 nanoseconds │ 59.2089 │ 1207.27 │ 4 │ 100 │
25+
└────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
26+
27+
[ Info: ArrayType = CuArray
28+
Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
29+
┌────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
30+
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
31+
├────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
32+
│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 967 nanoseconds │ 57.6793 │ 1176.08 │ 4 │ 100 │
33+
│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 82 nanoseconds │ 68.489 │ 1396.49 │ 4 │ 100 │
34+
│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 597 nanoseconds │ 70.2858 │ 1433.13 │ 4 │ 100 │
35+
│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 288 nanoseconds │ 59.1188 │ 1205.43 │ 4 │ 100 │
36+
└────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
37+
```
38+
=#
39+
40+
#! format: off
41+
module BenchmarkOffset
42+
43+
include("benchmark_utils.jl")
44+
45+
"""
    add3(x1, x2, x3)

Return the sum of the three arguments. This is the pointwise operation applied
by every benchmark variant in this module.
"""
function add3(x1, x2, x3)
    return x1 + x2 + x3
end
46+
47+
function aos_cart_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
48+
if Y isa Array
49+
e = Inf
50+
CI = CartesianIndices((get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us)))
51+
for t in 1:n_trials
52+
et = Base.@elapsed begin
53+
for i in 1:nreps
54+
@inbounds @simd for I in 1:get_N(us)
55+
CI1 = CI[I]
56+
CI2 = CI1 + CartesianIndex((0, 0, 0, 1, 0))
57+
CI3 = CI1 + CartesianIndex((0, 0, 0, 2, 0))
58+
Y[CI1] = add3(X[CI1], X[CI2], X[CI3])
59+
end
60+
end
61+
end
62+
e = min(e, et)
63+
end
64+
else
65+
e = Inf
66+
kernel = CUDA.@cuda always_inline = true launch = false aos_cart_offset_kernel!(X,Y,us)
67+
config = CUDA.launch_configuration(kernel.fun)
68+
threads = min(get_N(us), config.threads)
69+
blocks = cld(get_N(us), threads)
70+
for t in 1:n_trials
71+
et = CUDA.@elapsed begin
72+
for i in 1:nreps # reduce variance / impact of launch latency
73+
kernel(X,Y,us; threads, blocks)
74+
end
75+
end
76+
e = min(e, et)
77+
end
78+
end
79+
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
80+
return nothing
81+
end;
82+
"""
    aos_cart_offset_kernel!(X, Y, us)

CUDA kernel for the array-of-structs / Cartesian-offset variant.

Each thread derives one global 1-based index `I` from its block and thread ids.
Threads with `I <= get_N(us)` convert `I` to a Cartesian index over
`(get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us))` and store
`add3` of the three entries of `X` at offsets 0, 1 and 2 along dimension 4
into `Y`. Returns `nothing`.
"""
function aos_cart_offset_kernel!(X, Y, us)
    @inbounds begin
        I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        # The launcher rounds the block count up with `cld`, so the last block can
        # contain threads past the end of the data; those must do nothing.
        # ASCII `<=` is used so the operator cannot be lost to encoding issues.
        if I <= get_N(us)
            n = (get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us))
            CI1 = CartesianIndices(map(x -> Base.OneTo(x), n))[I]
            CI2 = CI1 + CartesianIndex((0, 0, 0, 1, 0))
            CI3 = CI1 + CartesianIndex((0, 0, 0, 2, 0))
            Y[CI1] = add3(X[CI1], X[CI2], X[CI3])
        end
    end
    return nothing
end;
95+
96+
"""
    aos_lin_offset!(X, Y, us; nreps = 1, bm = nothing, n_trials = 30)

Benchmark the array-of-structs layout addressed with linear offsets.

For every linear index `I` in `1:get_N(us)` the Cartesian index and its two
neighbours along dimension 4 are converted to linear indices: through
`LinearIndices` of extent 1 in dimension 4 for `Y`, and of extent 3 for `X`.
Then `Y[IY1] = add3(X[IX1], X[IX2], X[IX3])` is evaluated.

When `Y isa Array` the work runs as a host loop timed with `Base.@elapsed`;
otherwise `aos_lin_offset_kernel!` is compiled and launched through CUDA and
timed with `CUDA.@elapsed`. Each of the `n_trials` trials times `nreps`
back-to-back evaluations; the smallest trial time is handed to `push_info`.

Returns `nothing`.
"""
function aos_lin_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
    best = Inf
    if !(Y isa Array)
        # Compile without launching so the launch configuration can be queried first.
        kernel = CUDA.@cuda always_inline = true launch = false aos_lin_offset_kernel!(X, Y, us)
        max_threads = CUDA.launch_configuration(kernel.fun).threads
        nthreads = min(get_N(us), max_threads)
        nblocks = cld(get_N(us), nthreads)
        for _ in 1:n_trials
            elapsed = CUDA.@elapsed begin
                for _ in 1:nreps
                    kernel(X, Y, us; threads = nthreads, blocks = nblocks)
                end
            end
            best = min(best, elapsed)
        end
    else
        for _ in 1:n_trials
            elapsed = Base.@elapsed begin
                for _ in 1:nreps
                    @inbounds @simd for I in 1:get_N(us)
                        # Index helpers are built inside the loop body, as in
                        # `aos_lin_offset_kernel!`.
                        dims_y = (get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us))
                        dims_x = (get_Nv(us), get_Nij(us), get_Nij(us), 3, get_Nh(us))
                        cart = CartesianIndices(dims_y)
                        lin_y = LinearIndices(dims_y)
                        lin_x = LinearIndices(dims_x)
                        p1 = cart[I]
                        p2 = p1 + CartesianIndex((0, 0, 0, 1, 0))
                        p3 = p1 + CartesianIndex((0, 0, 0, 2, 0))
                        iy = lin_y[p1]
                        ix1 = lin_x[p1]
                        ix2 = lin_x[p2]
                        ix3 = lin_x[p3]
                        Y[iy] = add3(X[ix1], X[ix2], X[ix3])
                    end
                end
            end
            best = min(best, elapsed)
        end
    end
    # NOTE(review): `@caller_name` comes from benchmark_utils.jl; it is kept directly
    # in this function body in case it depends on call-stack depth — confirm.
    push_info(bm; e = best, nreps, caller = @caller_name(@__FILE__), n_reads_writes = 4)
    return nothing
end;
137+
"""
    aos_lin_offset_kernel!(X, Y, us)

CUDA kernel for the array-of-structs / linear-offset variant.

Each thread derives one global 1-based index `I` from its block and thread ids.
Threads with `I <= get_N(us)` map `I` to a Cartesian index, shift it by 1 and 2
along dimension 4, translate the three positions into linear indices of `X`
(extent 3 in dimension 4) and the unshifted one into a linear index of `Y`
(extent 1 in dimension 4), then store `add3` of the three `X` entries into `Y`.
Returns `nothing`.
"""
function aos_lin_offset_kernel!(X, Y, us)
    @inbounds begin
        I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        # The launcher rounds the block count up with `cld`, so the last block can
        # contain threads past the end of the data; those must do nothing.
        # ASCII `<=` is used so the operator cannot be lost to encoding issues.
        if I <= get_N(us)
            CI = CartesianIndices((get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us)))
            LI1 = LinearIndices((get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us)))
            LI3 = LinearIndices((get_Nv(us), get_Nij(us), get_Nij(us), 3, get_Nh(us)))
            CI1 = CI[I]
            CI2 = CI1 + CartesianIndex((0, 0, 0, 1, 0))
            CI3 = CI1 + CartesianIndex((0, 0, 0, 2, 0))
            IY1 = LI1[CI1]
            IX1 = LI3[CI1]
            IX2 = LI3[CI2]
            IX3 = LI3[CI3]
            Y[IY1] = add3(X[IX1], X[IX2], X[IX3])
        end
    end
    return nothing
end;
156+
157+
"""
    soa_cart_index!(X, Y, us; nreps = 1, bm = nothing, n_trials = 30)

Benchmark the struct-of-arrays layout indexed with Cartesian indices.

`X` is destructured into three arrays and `Y` into one. For every linear index
`I` in `1:get_N(us)`, the Cartesian index `CI[I]` over
`(get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us))` is used to evaluate
`y1[CI[I]] = add3(x1[CI[I]], x2[CI[I]], x3[CI[I]])`.

When `first(Y) isa Array` the work runs as a host loop timed with
`Base.@elapsed`; otherwise `soa_cart_index_kernel!` is compiled and launched
through CUDA and timed with `CUDA.@elapsed`. Each of the `n_trials` trials
times `nreps` back-to-back evaluations; the smallest trial time is handed to
`push_info`.

Returns `nothing`.
"""
function soa_cart_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
    best = Inf
    if !(first(Y) isa Array)
        # Compile without launching so the launch configuration can be queried first.
        kernel = CUDA.@cuda always_inline = true launch = false soa_cart_index_kernel!(X, Y, us)
        max_threads = CUDA.launch_configuration(kernel.fun).threads
        nthreads = min(get_N(us), max_threads)
        nblocks = cld(get_N(us), nthreads)
        for _ in 1:n_trials
            elapsed = CUDA.@elapsed begin
                for _ in 1:nreps # reduce variance / impact of launch latency
                    kernel(X, Y, us; threads = nthreads, blocks = nblocks)
                end
            end
            best = min(best, elapsed)
        end
    else
        cart = CartesianIndices((get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us)))
        for _ in 1:n_trials
            elapsed = Base.@elapsed begin
                for _ in 1:nreps
                    (dst,) = Y
                    (a, b, c) = X
                    @inbounds @simd for I in 1:get_N(us)
                        dst[cart[I]] = add3(a[cart[I]], b[cart[I]], c[cart[I]])
                    end
                end
            end
            best = min(best, elapsed)
        end
    end
    # NOTE(review): `@caller_name` comes from benchmark_utils.jl; it is kept directly
    # in this function body in case it depends on call-stack depth — confirm.
    push_info(bm; e = best, nreps, caller = @caller_name(@__FILE__), n_reads_writes = 4)
    return nothing
end;
190+
"""
    soa_cart_index_kernel!(X, Y, us)

CUDA kernel for the struct-of-arrays / Cartesian-index variant.

Each thread derives one global 1-based index `I` from its block and thread ids.
Threads with `I <= get_N(us)` look up the Cartesian index `CI[I]` over
`(get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us))` and store `add3` of the
three arrays in `X` at that position into the single array in `Y`.
Returns `nothing`.
"""
function soa_cart_index_kernel!(X, Y, us)
    @inbounds begin
        I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        # The launcher rounds the block count up with `cld`, so the last block can
        # contain threads past the end of the data; those must do nothing.
        # ASCII `<=` is used so the operator cannot be lost to encoding issues.
        if I <= get_N(us)
            CI = CartesianIndices((get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us)))
            (y1,) = Y
            (x1, x2, x3) = X
            y1[CI[I]] = add3(x1[CI[I]], x2[CI[I]], x3[CI[I]])
        end
    end
    return nothing
end;
202+
203+
"""
    soa_linear_index!(X, Y, us; nreps = 1, bm = nothing, n_trials = 30)

Benchmark the struct-of-arrays layout indexed with plain linear indices.

`X` is destructured into three arrays and `Y` into one; for every `I` in
`1:get_N(us)` the update `y1[I] = add3(x1[I], x2[I], x3[I])` is evaluated.

When `first(Y) isa Array` the work runs as a host loop timed with
`Base.@elapsed`; otherwise `soa_linear_index_kernel!` is compiled and launched
through CUDA and timed with `CUDA.@elapsed`. Each of the `n_trials` trials
times `nreps` back-to-back evaluations; the smallest trial time is handed to
`push_info`.

Returns `nothing`.
"""
function soa_linear_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
    best = Inf
    if !(first(Y) isa Array)
        # Compile without launching so the launch configuration can be queried first.
        kernel = CUDA.@cuda always_inline = true launch = false soa_linear_index_kernel!(X, Y, us)
        max_threads = CUDA.launch_configuration(kernel.fun).threads
        nthreads = min(get_N(us), max_threads)
        nblocks = cld(get_N(us), nthreads)
        for _ in 1:n_trials
            elapsed = CUDA.@elapsed begin
                for _ in 1:nreps # reduce variance / impact of launch latency
                    kernel(X, Y, us; threads = nthreads, blocks = nblocks)
                end
            end
            best = min(best, elapsed)
        end
    else
        for _ in 1:n_trials
            elapsed = Base.@elapsed begin
                for _ in 1:nreps
                    (dst,) = Y
                    (a, b, c) = X
                    @inbounds @simd for I in 1:get_N(us)
                        dst[I] = add3(a[I], b[I], c[I])
                    end
                end
            end
            best = min(best, elapsed)
        end
    end
    # NOTE(review): `@caller_name` comes from benchmark_utils.jl; it is kept directly
    # in this function body in case it depends on call-stack depth — confirm.
    push_info(bm; e = best, nreps, caller = @caller_name(@__FILE__), n_reads_writes = 4)
    return nothing
end;
235+
"""
    soa_linear_index_kernel!(X, Y, us)

CUDA kernel for the struct-of-arrays / linear-index variant.

Each thread derives one global 1-based index `I` from its block and thread ids.
Threads with `I <= get_N(us)` store `add3` of the three arrays in `X` at
position `I` into the single array in `Y`. Returns `nothing`.
"""
function soa_linear_index_kernel!(X, Y, us)
    @inbounds begin
        I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        # The launcher rounds the block count up with `cld`, so the last block can
        # contain threads past the end of the data; those must do nothing.
        # ASCII `<=` is used so the operator cannot be lost to encoding issues.
        if I <= get_N(us)
            (y1,) = Y
            (x1, x2, x3) = X
            y1[I] = add3(x1[I], x2[I], x3[I])
        end
    end
    return nothing
end;
246+
247+
end # module
248+
249+
import .BenchmarkOffset as BO
250+
251+
"""
    fill_with_rand!(arr)

Overwrite `arr` in place with values drawn by `rand(eltype(arr), size(arr))`.
The samples are first converted to `typeof(arr)` and then broadcast-assigned
into `arr`. Returns `arr`.
"""
function fill_with_rand!(arr)
    samples = rand(eltype(arr), size(arr))
    arr .= typeof(arr)(samples)
    return arr
end
257+
258+
using CUDA
259+
using Test
260+
# Checks that all offset/index variants produce the same `Y`, and that it matches a
# reference computed independently of the kernels, then benchmarks each variant.
@testset "Offset benchmark" begin
    # size(problem_size, 4) == 1 to avoid double counting reads/writes
    bm = BO.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32)
    ArrayType = CUDA.CuArray;
    # ArrayType = Base.identity;
    arr(float_type, problem_size, T) = T(zeros(float_type, problem_size...))

    s = (63,4,4,3,5400);  # array-of-structs input: three fields along dimension 4
    sY = (63,4,4,1,5400); # array-of-structs output: one field along dimension 4
    st = (63,4,4,5400);   # one struct-of-arrays component
    us = BO.UniversalSizesStatic(s[1], s[2], s[end]);

    X_aos = arr(bm.float_type, s, ArrayType);
    Y_aos = arr(bm.float_type, sY, ArrayType);
    X_aos_ref = arr(bm.float_type, s, ArrayType);
    Y_aos_ref = arr(bm.float_type, sY, ArrayType);
    X_soa = ntuple(_ -> arr(bm.float_type, st, ArrayType), 3);
    Y_soa = ntuple(_ -> arr(bm.float_type, st, ArrayType), 1);
    fill_with_rand!(X_aos)
    fill_with_rand!(Y_aos)
    X_aos_ref .= X_aos
    Y_aos_ref .= Y_aos
    for i in 1:3; X_soa[i] .= X_aos[:,:,:,i,:]; end
    for i in 1:1; Y_soa[i] .= Y_aos[:,:,:,i,:]; end
    @info "ArrayType = $ArrayType"

    # Reference result from a plain broadcast of the same `add3`, independent of
    # the kernels under test. Without it the checks below only compare the
    # implementations to each other, so kernels that write nothing would pass.
    Y_expected = BO.add3.(X_aos[:,:,:,1:1,:], X_aos[:,:,:,2:2,:], X_aos[:,:,:,3:3,:])

    BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; n_trials = 1, nreps = 1)
    BO.aos_lin_offset!(X_aos, Y_aos, us; n_trials = 1, nreps = 1)
    BO.soa_linear_index!(X_soa, Y_soa, us; n_trials = 1, nreps = 1)

    @test all(X_aos .== X_aos_ref)
    @test all(Y_aos .== Y_aos_ref)
    @test all(Y_aos .== Y_expected)
    for i in 1:3; @test all(X_soa[i] .== X_aos_ref[:,:,:,i,:]); end
    for i in 1:1; @test all(Y_soa[i] .== Y_aos_ref[:,:,:,i,:]); end

    # Clear the output first so the checks below exercise `soa_cart_index!` itself
    # rather than the values already written by `soa_linear_index!`.
    for i in 1:1; Y_soa[i] .= 0; end
    BO.soa_cart_index!(X_soa, Y_soa, us; n_trials = 1, nreps = 1)

    for i in 1:3; @test all(X_soa[i] .== X_aos_ref[:,:,:,i,:]); end
    for i in 1:1; @test all(Y_soa[i] .== Y_aos_ref[:,:,:,i,:]); end

    # These four call lines appear verbatim in the `funcs` column of the results
    # table recorded at the top of this file; presumably `@caller_name` reads them
    # from the source, so keep each call unchanged and on one line — TODO confirm.
    BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100)
    BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100)
    BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100)
    BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100)

    BO.tabulate_benchmark(bm)
end
308+
309+
# #! format: on

0 commit comments

Comments
 (0)