Skip to content

Commit 111dc75

Browse files
Merge pull request #1917 from CliMA/ck/thermo_bench_bw
Add thermo benchmark bandwidth script
2 parents c272d15 + 0e183fb commit 111dc75

File tree

4 files changed

+225
-11
lines changed

4 files changed

+225
-11
lines changed

.buildkite/pipeline.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,6 +1232,16 @@ steps:
12321232
agents:
12331233
slurm_gpus: 1
12341234

1235+
- label: "Perf: benchmark scripts thermo_bench_bw"
1236+
key: thermo_bench_bw
1237+
command:
1238+
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
1239+
- "julia --color=yes --project=.buildkite benchmarks/scripts/thermo_bench_bw.jl"
1240+
env:
1241+
CLIMACOMMS_DEVICE: "CUDA"
1242+
agents:
1243+
slurm_gpus: 1
1244+
12351245
- group: "Perf: Operators"
12361246
steps:
12371247

benchmarks/scripts/benchmark_utils.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ end
1717
Base.@kwdef mutable struct Benchmark
1818
problem_size::Tuple
1919
float_type::Type
20-
device_bandwidth_GBs::Int = 2_039
20+
device_bandwidth_GBs::Int = 2_039 # (A100 SXM4 80GB)
2121
data::Vector = []
2222
end
2323

benchmarks/scripts/thermo_bench_bw.jl

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
#=
2+
julia --project=.buildkite
3+
using Revise; include(joinpath("benchmarks", "scripts", "thermo_bench_bw.jl"))
4+
5+
# Info
6+
7+
- This is a benchmark for ClimaCore pointwise kernels, with
8+
multiple field variables. We locally define a toy version
9+
of Thermodynamics, and avoid all flops and only measure
10+
the bandwidth performance achieved on the hardware.
11+
12+
# Benchmark results:
13+
14+
Clima A100:
15+
```
16+
[ Info: device = ClimaComms.CUDADevice()
17+
Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
18+
┌────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
19+
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
20+
├────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
21+
│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 797 microseconds, 92 nanoseconds │ 12.4764 │ 254.394 │ 10 │ 100 │
22+
│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 131 microseconds, 851 nanoseconds │ 75.4252 │ 1537.92 │ 10 │ 100 │
23+
│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 797 microseconds, 164 nanoseconds │ 12.4753 │ 254.371 │ 10 │ 100 │
24+
│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 131 microseconds, 943 nanoseconds │ 75.3725 │ 1536.84 │ 10 │ 100 │
25+
└────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
26+
27+
[ Info: device = ClimaComms.CUDADevice()
28+
Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039
29+
┌────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
30+
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
31+
├────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
32+
│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 1 millisecond, 45 microseconds │ 19.0163 │ 387.743 │ 10 │ 100 │
33+
│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 258 microseconds, 120 nanoseconds │ 77.0559 │ 1571.17 │ 10 │ 100 │
34+
│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 1 millisecond, 46 microseconds │ 19.0147 │ 387.709 │ 10 │ 100 │
35+
│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 257 microseconds, 915 nanoseconds │ 77.1171 │ 1572.42 │ 10 │ 100 │
36+
└────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
37+
```
38+
=#
39+
40+
#! format: off
41+
module ThermoBenchBandwidth
42+
43+
include("benchmark_utils.jl")
44+
45+
import ClimaCore
46+
import CUDA
47+
using ClimaComms
48+
using Test
49+
using StaticArrays, IntervalSets, LinearAlgebra
50+
using JET
51+
52+
import ClimaCore: Spaces, Fields
53+
import ClimaCore.Domains: Geometry
54+
55+
struct PhaseEquil{FT}
56+
ρ::FT
57+
p::FT
58+
e_int::FT
59+
q_tot::FT
60+
T::FT
61+
end
62+
63+
@inline Base.zero(::Type{PhaseEquil{FT}}) where {FT} =
64+
PhaseEquil{FT}(0, 0, 0, 0, 0)
65+
66+
function thermo_func_bc!(x, us; nreps = 1, bm=nothing, n_trials = 30)
67+
e = Inf
68+
for t in 1:n_trials
69+
et = CUDA.@elapsed begin
70+
for _ in 1:nreps
71+
(; ts, ρ,p,e_int,q_tot,T) = x
72+
@. ts = PhaseEquil(ρ,p,e_int,q_tot,T) # 5 reads, 5 writes, 0 flops
73+
end
74+
end
75+
e = min(e, et)
76+
end
77+
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=10)
78+
return nothing
79+
end
80+
81+
function thermo_func_sol!(x, us::UniversalSizesStatic; nreps = 1, bm=nothing, n_trials = 30)
82+
e = Inf
83+
for t in 1:n_trials
84+
et = CUDA.@elapsed begin
85+
(; ts, ρ,p,e_int,q_tot,T) = x
86+
kernel = CUDA.@cuda always_inline = true launch = false thermo_func_sol_kernel!(ts,ρ,p,e_int,q_tot,T,us)
87+
N = get_N(us)
88+
config = CUDA.launch_configuration(kernel.fun)
89+
threads = min(N, config.threads)
90+
blocks = cld(N, threads)
91+
for _ in 1:nreps
92+
kernel(ts,ρ,p,e_int,q_tot,T,us; threads, blocks)
93+
end
94+
end
95+
e = min(e, et)
96+
end
97+
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=10)
98+
return nothing
99+
end
100+
101+
# Mimics how indexing works in generalized pointwise kernels
102+
function thermo_func_sol_kernel!(ts, ρ,p,e_int,q_tot,T, us)
103+
@inbounds begin
104+
FT = eltype(ts.ρ)
105+
I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
106+
if I ≤ get_N(us)
107+
# 5 reads, 5 writes, 0 flops
108+
ts_i = PhaseEquil(ρ[I],p[I],e_int[I],q_tot[I],T[I])
109+
ts.ρ[I] = ts_i.ρ
110+
ts.p[I] = ts_i.p
111+
ts.T[I] = ts_i.T
112+
ts.e_int[I] = ts_i.e_int
113+
ts.q_tot[I] = ts_i.q_tot
114+
end
115+
end
116+
return nothing
117+
end
118+
119+
end # module
120+
121+
import .ThermoBenchBandwidth as TBB
122+
123+
import CUDA
124+
using ClimaComms
125+
using ClimaCore
126+
import ClimaCore: Spaces, Fields
127+
import ClimaCore.Domains: Geometry
128+
129+
ENV["CLIMACOMMS_DEVICE"] = "CUDA";
130+
ClimaComms.@import_required_backends
131+
using BenchmarkTools
132+
@isdefined(TU) || include(
133+
joinpath(pkgdir(ClimaCore), "test", "TestUtilities", "TestUtilities.jl"),
134+
);
135+
import .TestUtilities as TU;
136+
137+
using Test
138+
@testset "Thermo state" begin
139+
FT = Float32
140+
bm = TBB.Benchmark(;problem_size=(63,4,4,1,5400), float_type=FT)
141+
device = ClimaComms.device()
142+
context = ClimaComms.context(device)
143+
cspace = TU.CenterExtrudedFiniteDifferenceSpace(
144+
FT;
145+
zelem = 63,
146+
context,
147+
helem = 30,
148+
Nq = 4,
149+
)
150+
fspace = Spaces.FaceExtrudedFiniteDifferenceSpace(cspace)
151+
@info "device = $device"
152+
# TODO: fill with non-trivial values (e.g., use Thermodynamics TestedProfiles) to verify correctness.
153+
nt_core = (; ρ = FT(1), p = FT(2),e_int = FT(3),q_tot = FT(4),T = FT(5))
154+
nt_ts = (;
155+
ρ = FT(0),
156+
p = FT(0),
157+
e_int = FT(0),
158+
q_tot = FT(0),
159+
T = FT(0),
160+
)
161+
x = fill((; ts = zero(TBB.PhaseEquil{FT}), nt_core...), cspace)
162+
xv = fill((; ts = nt_ts, nt_core...), cspace)
163+
(_, Nij, _, Nv, Nh) = size(Fields.field_values(x.ts))
164+
us = TBB.UniversalSizesStatic(Nv, Nij, Nh)
165+
function to_vec(ξ)
166+
pns = propertynames(ξ)
167+
dl_vals = map(pns) do pn
168+
val = getproperty(ξ, pn)
169+
pn == :ts ? to_vec(val) :
170+
CUDA.CuArray(collect(vec(parent(Fields.field_values(val)))))
171+
end
172+
return (; zip(propertynames(ξ), dl_vals)...)
173+
end
174+
x_vec = to_vec(xv)
175+
176+
TBB.thermo_func_bc!(x, us; nreps=1, n_trials = 1)
177+
TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)
178+
179+
rc = Fields.rcompare(x_vec, to_vec(x))
180+
rc || Fields.rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
181+
@test rc # test correctness
182+
183+
TBB.thermo_func_bc!(x, us; nreps=100, bm)
184+
TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)
185+
186+
TBB.thermo_func_bc!(x, us; nreps=100, bm)
187+
TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)
188+
189+
TBB.tabulate_benchmark(bm)
190+
191+
end
192+
#! format: on

src/Fields/fieldvector.jl

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -396,20 +396,31 @@ function __rprint_diff(io::IO, xi, yi; pc, xname, yname) # assume we can compute
396396
end
397397

398398
"""
399-
rprint_diff(io::IO, ::T, ::T) where {T <: FieldVector}
400-
rprint_diff(::T, ::T) where {T <: FieldVector}
399+
rprint_diff(io::IO, ::T, ::T) where {T <: Union{FieldVector, NamedTuple}}
400+
rprint_diff(::T, ::T) where {T <: Union{FieldVector, NamedTuple}}
401401
402-
Recursively print differences in given `FieldVector`.
402+
Recursively print differences in given `Union{FieldVector, NamedTuple}`.
403403
"""
404-
_rprint_diff(io::IO, x::T, y::T, xname, yname) where {T <: FieldVector} =
404+
_rprint_diff(
405+
io::IO,
406+
x::T,
407+
y::T,
408+
xname,
409+
yname,
410+
) where {T <: Union{FieldVector, NamedTuple}} =
405411
__rprint_diff(io, x, y; pc = (), xname, yname)
406-
_rprint_diff(x::T, y::T, xname, yname) where {T <: FieldVector} =
412+
_rprint_diff(
413+
x::T,
414+
y::T,
415+
xname,
416+
yname,
417+
) where {T <: Union{FieldVector, NamedTuple}} =
407418
_rprint_diff(stdout, x, y, xname, yname)
408419

409420
"""
410-
@rprint_diff(::T, ::T) where {T <: FieldVector}
421+
@rprint_diff(::T, ::T) where {T <: Union{FieldVector, NamedTuple}}
411422
412-
Recursively print differences in given `FieldVector`.
423+
Recursively print differences in given `Union{FieldVector, NamedTuple}`.
413424
"""
414425
macro rprint_diff(x, y)
415426
return :(_rprint_diff(
@@ -429,17 +440,18 @@ _rcompare(pass, x::T, y::T) where {T <: DataLayouts.AbstractData} =
429440
pass && (parent(x) == parent(y))
430441
_rcompare(pass, x::T, y::T) where {T} = pass && (x == y)
431442

432-
function _rcompare(pass, x::T, y::T) where {T <: FieldVector}
443+
function _rcompare(pass, x::T, y::T) where {T <: Union{FieldVector, NamedTuple}}
433444
for pn in propertynames(x)
434445
pass &= _rcompare(pass, getproperty(x, pn), getproperty(y, pn))
435446
end
436447
return pass
437448
end
438449

439450
"""
440-
rcompare(x::T, y::T) where {T <: FieldVector}
451+
rcompare(x::T, y::T) where {T <: Union{FieldVector, NamedTuple}}
441452
442453
Recursively compare given `FieldVector`s or `NamedTuple`s via `==`.
443454
Returns `true` if `x == y` recursively.
444455
"""
445-
rcompare(x::T, y::T) where {T <: FieldVector} = _rcompare(true, x, y)
456+
rcompare(x::T, y::T) where {T <: Union{FieldVector, NamedTuple}} =
457+
_rcompare(true, x, y)

0 commit comments

Comments
 (0)