Skip to content

Commit 59121aa

Browse files
Merge pull request #1836 from CliMA/ck/misc
Update and add cuda stencil benchmarks
2 parents f675217 + 1a6e2ef commit 59121aa

File tree

6 files changed

+201
-70
lines changed

6 files changed

+201
-70
lines changed

.buildkite/pipeline.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1180,15 +1180,15 @@ steps:
11801180
key: "perf_cpu_implicit_stencil"
11811181
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/finitedifference/opt_implicit_stencils.jl"
11821182

1183-
- label: "Perf: FD operator benchmarks"
1183+
- label: "Perf: FD operator stencil benchmarks"
11841184
key: "perf_fd_ops"
1185-
command: "julia --color=yes --project=.buildkite test/Operators/finitedifference/benchmark_column.jl"
1185+
command: "julia --color=yes --project=.buildkite test/Operators/finitedifference/benchmark_stencils.jl"
11861186

1187-
- label: "Perf: GPU FD operator benchmarks"
1187+
- label: "Perf: GPU FD operator stencil benchmarks"
11881188
key: "gpu_perf_fd_ops"
11891189
command:
11901190
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
1191-
- "julia --color=yes --project=.buildkite test/Operators/finitedifference/benchmark_column.jl"
1191+
- "julia --color=yes --project=.buildkite test/Operators/finitedifference/benchmark_stencils.jl"
11921192
env:
11931193
CLIMACOMMS_DEVICE: "CUDA"
11941194
agents:

test/Operators/finitedifference/benchmark_column.jl renamed to test/Operators/finitedifference/benchmark_stencils.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#=
22
julia --project
3-
using Revise; include(joinpath("test", "Operators", "finitedifference", "benchmark_column.jl"))
3+
using Revise; include(joinpath("test", "Operators", "finitedifference", "benchmark_stencils.jl"))
44
=#
5-
include("benchmark_column_utils.jl")
5+
include("benchmark_stencils_utils.jl")
66

77
@testset "Benchmark operators" begin
88
benchmark_operators(Float64; z_elems = 63, helem = 30, Nq = 4)
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import CUDA
2+
#####
3+
##### CPU column
4+
#####
5+
6+
#= e.g., any 2nd order interpolation / derivative operator =#
7+
function column_op_2mul_1add!(x, y, D, U)
8+
y1 = @view y[1:(end - 1)]
9+
y2 = @view y[2:end]
10+
@inbounds for i in eachindex(x)
11+
x[i] = D[i] * y1[i] + U[i] * y2[i]
12+
end
13+
return nothing
14+
end
15+
16+
#= e.g., div(grad(scalar)), div(interp(vec)) =#
17+
function column_op_3mul_2add!(x, y, L, D, U)
18+
y1 = @view y[1:(end - 1)]
19+
y2 = @view y[2:(end - 1)]
20+
y3 = @view y[2:end]
21+
@inbounds for i in eachindex(x)
22+
i == 1 && continue
23+
i == length(x) && continue
24+
x[i] = L[i] * y1[i] + D[i] * y2[i] + U[i] * y3[i]
25+
end
26+
return nothing
27+
end
28+
29+
#= e.g., curlC2F =#
30+
function column_curl_like!(curluₕ, uₕ_x, uₕ_y, D, U)
31+
@inbounds for i in eachindex(curluₕ)
32+
curluₕ[i] = D[i] * uₕ_x[i] + U[i] * uₕ_y[i]
33+
end
34+
return nothing
35+
end
36+
37+
#####
38+
##### CUDA column
39+
#####
40+
41+
# TODO: expand on this
42+
43+
function column_op_2mul_1add_cuda!(x, y, D, U)
44+
kernel =
45+
CUDA.@cuda always_inline = true launch = false op_2mul_1add_cuda_kernel!(
46+
x,
47+
y,
48+
D,
49+
U,
50+
Val(length(x)),
51+
)
52+
config = CUDA.launch_configuration(kernel.fun)
53+
nitems = length(x)
54+
threads = min(nitems, config.threads)
55+
blocks = cld(nitems, threads)
56+
kernel(x, y, D, U; threads, blocks) # This knows to use always_inline from above.
57+
return nothing
58+
end
59+
60+
function op_2mul_1add_cuda_kernel!(x, y, D, U, ::Val{N}) where {N}
61+
@inbounds begin
62+
i = thread_index()
63+
if i ≤ N
64+
x[i] = D[i] * y[i] + U[i] * y[i + 1]
65+
end
66+
end
67+
return nothing
68+
end
69+
70+
71+
#####
72+
##### CPU sphere
73+
#####
74+
75+
# TODO
76+
77+
#####
78+
##### CUDA sphere
79+
#####
80+
81+
# TODO: move to CUDA utils
82+
thread_index() =
83+
(CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
84+
Base.@propagate_inbounds kernel_indexes(tidx, Nv, Nij, Nh) =
85+
CartesianIndices(map(x -> Base.OneTo(x), (Nv, Nij, Nij, 1, Nh)))[tidx]
86+
valid_range(tidx, n) = 1 ≤ tidx ≤ n
87+
88+
89+
#= e.g., any 2nd order interpolation / derivative operator =#
90+
function sphere_op_2mul_1add_cuda!(x, y, D, U)
91+
Nv = size(x, 1)
92+
Nij = size(x, 2)
93+
Nh = size(x, 5)
94+
N = length(x)
95+
kernel =
96+
CUDA.@cuda always_inline = true launch = false sphere_op_2mul_1add_cuda_kernel!(
97+
x,
98+
y,
99+
D,
100+
U,
101+
Val(Nv),
102+
Val(Nij),
103+
Val(Nh),
104+
Val(N),
105+
)
106+
config = CUDA.launch_configuration(kernel.fun)
107+
threads = min(N, config.threads)
108+
blocks = cld(N, threads)
109+
kernel(x, y, D, U; threads, blocks)
110+
return nothing
111+
end
112+
113+
function sphere_op_2mul_1add_cuda_kernel!(
114+
x,
115+
y,
116+
D,
117+
U,
118+
::Val{Nv},
119+
::Val{Nij},
120+
::Val{Nh},
121+
::Val{N},
122+
) where {Nv, Nij, Nh, N}
123+
@inbounds begin
124+
tidx = thread_index()
125+
if valid_range(tidx, N)
126+
I = kernel_indexes(tidx, Nv, Nij, Nh)
127+
x[I] = D[I] * y[I] + U[I] * y[I + CartesianIndex(1, 0, 0, 0, 0)]
128+
end
129+
end
130+
return nothing
131+
end

test/Operators/finitedifference/benchmark_column_utils.jl renamed to test/Operators/finitedifference/benchmark_stencils_utils.jl

Lines changed: 63 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -33,41 +33,6 @@ field_vars(::Type{FT}) where {FT} = (;
3333
ᶠw = Geometry.Covariant3Vector(FT(0)),
3434
)
3535

36-
#####
37-
##### Second order interpolation / derivatives
38-
#####
39-
40-
#= e.g., any 2nd order interpolation / derivative operator =#
41-
function op_2mul_1add!(x, y, D, U)
42-
y1 = @view y[1:(end - 1)]
43-
y2 = @view y[2:end]
44-
@inbounds for i in eachindex(x)
45-
x[i] = D[i] * y1[i] + U[i] * y2[i]
46-
end
47-
return nothing
48-
end
49-
50-
#= e.g., div(grad(scalar)), div(interp(vec)) =#
51-
function op_3mul_2add!(x, y, L, D, U)
52-
y1 = @view y[1:(end - 1)]
53-
y2 = @view y[2:(end - 1)]
54-
y3 = @view y[2:end]
55-
@inbounds for i in eachindex(x)
56-
i==1 && continue
57-
i==length(x) && continue
58-
x[i] = L[i] * y1[i] + D[i] * y2[i] + U[i] * y3[i]
59-
end
60-
return nothing
61-
end
62-
63-
#= e.g., curlC2F =#
64-
function curl_like!(curluₕ, uₕ_x, uₕ_y, D, U)
65-
@inbounds for i in eachindex(curluₕ)
66-
curluₕ[i] = D[i] * uₕ_x[i] + U[i] * uₕ_y[i]
67-
end
68-
return nothing
69-
end
70-
7136
function set_value_bcs(c)
7237
FT = Spaces.undertype(axes(c))
7338
return (;bottom = Operators.SetValue(FT(0)),
@@ -211,7 +176,8 @@ bc_name(bcs::Tuple) = (:none,)
211176
bc_name_base(bcs::@NamedTuple{}) = (:none,)
212177
bc_name(bcs::@NamedTuple{}) = (:none,)
213178

214-
include("benchmark_column_kernels.jl")
179+
include("benchmark_stencils_array_kernels.jl")
180+
include("benchmark_stencils_climacore_kernels.jl")
215181

216182
uses_bycolumn(::typeof(op_broadcast_example0!)) = true
217183
uses_bycolumn(::typeof(op_broadcast_example1!)) = true
@@ -268,54 +234,88 @@ function benchmark_func!(t_min, trials, fun, c, f, verbose = false)
268234
end
269235
end
270236

271-
function benchmark_arrays(z_elems, ::Type{FT}) where {FT}
272-
L = zeros(FT, z_elems)
273-
D = zeros(FT, z_elems)
274-
U = zeros(FT, z_elems)
275-
xarr = rand(FT, z_elems)
276-
uₕ_x = rand(FT, z_elems)
277-
uₕ_y = rand(FT, z_elems)
278-
yarr = rand(FT, z_elems + 1)
279-
280-
println("\n############################ 2-point stencil")
281-
trial = BenchmarkTools.@benchmark op_2mul_1add!($xarr, $yarr, $D, $U)
282-
show(stdout, MIME("text/plain"), trial)
283-
println()
284-
println("\n############################ 3-point stencil")
285-
trial = BenchmarkTools.@benchmark op_3mul_2add!($xarr, $yarr, $L, $D, $U)
286-
show(stdout, MIME("text/plain"), trial)
287-
println()
288-
println("\n############################ curl-like stencil")
289-
trial = BenchmarkTools.@benchmark curl_like!($xarr, $uₕ_x, $uₕ_y, $D, $U)
290-
show(stdout, MIME("text/plain"), trial)
291-
println()
237+
function column_benchmark_arrays(device, z_elems, ::Type{FT}) where {FT}
238+
ArrayType = ClimaComms.array_type(device)
239+
L = ArrayType(zeros(FT, z_elems))
240+
D = ArrayType(zeros(FT, z_elems))
241+
U = ArrayType(zeros(FT, z_elems))
242+
xarr = ArrayType(rand(FT, z_elems))
243+
uₕ_x = ArrayType(rand(FT, z_elems))
244+
uₕ_y = ArrayType(rand(FT, z_elems))
245+
yarr = ArrayType(rand(FT, z_elems + 1))
246+
247+
if device isa ClimaComms.CUDADevice
248+
println("\n############################ column 2-point stencil")
249+
trial = BenchmarkTools.@benchmark ClimaComms.@cuda_sync $device column_op_2mul_1add_cuda!($xarr, $yarr, $D, $U)
250+
show(stdout, MIME("text/plain"), trial)
251+
println()
252+
else
253+
println("\n############################ column 2-point stencil")
254+
trial = BenchmarkTools.@benchmark column_op_2mul_1add!($xarr, $yarr, $D, $U)
255+
show(stdout, MIME("text/plain"), trial)
256+
println()
257+
println("\n############################ column 3-point stencil")
258+
trial = BenchmarkTools.@benchmark column_op_3mul_2add!($xarr, $yarr, $L, $D, $U)
259+
show(stdout, MIME("text/plain"), trial)
260+
println()
261+
println("\n############################ column curl-like stencil")
262+
trial = BenchmarkTools.@benchmark column_curl_like!($xarr, $uₕ_x, $uₕ_y, $D, $U)
263+
show(stdout, MIME("text/plain"), trial)
264+
println()
265+
end
266+
end
267+
268+
function sphere_benchmark_arrays(device, z_elems, helem, Nq, ::Type{FT}) where {FT}
269+
ArrayType = ClimaComms.array_type(device)
270+
# VIJFH
271+
Nh = helem * helem * 6
272+
cdims = (z_elems , Nq, Nq, 1, Nh)
273+
fdims = (z_elems+1, Nq, Nq, 1, Nh)
274+
L = ArrayType(zeros(FT, cdims...))
275+
D = ArrayType(zeros(FT, cdims...))
276+
U = ArrayType(zeros(FT, cdims...))
277+
xarr = ArrayType(rand(FT, cdims...))
278+
uₕ_x = ArrayType(rand(FT, cdims...))
279+
uₕ_y = ArrayType(rand(FT, cdims...))
280+
yarr = ArrayType(rand(FT, fdims...))
281+
282+
if device isa ClimaComms.CUDADevice
283+
println("\n############################ sphere 2-point stencil")
284+
trial = BenchmarkTools.@benchmark ClimaComms.@cuda_sync $device sphere_op_2mul_1add_cuda!($xarr, $yarr, $D, $U)
285+
show(stdout, MIME("text/plain"), trial)
286+
println()
287+
else
288+
@info "Sphere CPU kernels have not been added yet."
289+
end
292290
end
293291

294292
function benchmark_operators(::Type{FT}; z_elems, helem, Nq) where {FT}
295-
@show ClimaComms.device()
293+
device = ClimaComms.device()
294+
@show device
296295
trials = OrderedCollections.OrderedDict()
297296
t_min = OrderedCollections.OrderedDict()
298-
benchmark_arrays(z_elems, FT)
297+
column_benchmark_arrays(device, z_elems, FT)
298+
sphere_benchmark_arrays(device, z_elems, helem, Nq, FT)
299299

300300
cspace = TU.ColumnCenterFiniteDifferenceSpace(FT; zelem=z_elems)
301301
fspace = Spaces.FaceFiniteDifferenceSpace(cspace)
302302
cfield = fill(field_vars(FT), cspace)
303303
ffield = fill(field_vars(FT), fspace)
304-
benchmark_operators_base(trials, t_min, cfield, ffield)
304+
benchmark_operators_base(trials, t_min, cfield, ffield, "column")
305305

306306
cspace = TU.CenterExtrudedFiniteDifferenceSpace(FT; zelem=z_elems, helem, Nq)
307307
fspace = Spaces.FaceExtrudedFiniteDifferenceSpace(cspace)
308308
cfield = fill(field_vars(FT), cspace)
309309
ffield = fill(field_vars(FT), fspace)
310-
benchmark_operators_base(trials, t_min, cfield, ffield)
310+
benchmark_operators_base(trials, t_min, cfield, ffield, "sphere")
311311

312312
# Tests are removed since they're flakey. And maintaining
313313
# them before they're converged is a bit of work..
314314
test_results(t_min)
315315
return (; trials, t_min)
316316
end
317317

318-
function benchmark_operators_base(trials, t_min, cfield, ffield)
318+
function benchmark_operators_base(trials, t_min, cfield, ffield, name)
319319
ops = [
320320
#### Core discrete operators
321321
op_GradientF2C!,
@@ -346,7 +346,7 @@ function benchmark_operators_base(trials, t_min, cfield, ffield)
346346
op_divgrad_uₕ!,
347347
]
348348

349-
@info "Benchmarking operators, this may take a minute or two..."
349+
@info "Benchmarking $name operators, this may take a minute or two..."
350350
for op in ops
351351
if uses_bycolumn(op) && axes(cfield) isa Spaces.FiniteDifferenceSpace
352352
continue

test/Operators/finitedifference/column_benchmark_profile.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#= This script is helpful for targeting specific kernels =#
2-
include("benchmark_column_utils.jl")
2+
include("benchmark_stencils_utils.jl")
33

44
function apply_kernel!(cfield, ffield)
55
# op_DivergenceF2C!(cfield, ffield)

0 commit comments

Comments
 (0)