|
| 1 | +#= |
| 2 | +julia --project=.buildkite |
| 3 | +using Revise; include(joinpath("benchmarks", "scripts", "thermo_bench_bw.jl")) |
| 4 | +
|
| 5 | +# Info |
| 6 | +
|
| 7 | + - This is a benchmark for ClimaCore pointwise kernels, with |
| 8 | + multiple field variables. We locally define a toy version |
| 9 | + of Thermodynamics, and avoid all flops and only measure |
| 10 | + the bandwidth performance achieved on the hardware. |
| 11 | +
|
| 12 | +# Benchmark results: |
| 13 | +
|
| 14 | +Clima A100: |
| 15 | +``` |
| 16 | +[ Info: device = ClimaComms.CUDADevice() |
| 17 | +Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039 |
| 18 | +┌────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ |
| 19 | +│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ |
| 20 | +├────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ |
| 21 | +│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 797 microseconds, 92 nanoseconds │ 12.4764 │ 254.394 │ 10 │ 100 │ |
| 22 | +│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 131 microseconds, 851 nanoseconds │ 75.4252 │ 1537.92 │ 10 │ 100 │ |
| 23 | +│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 797 microseconds, 164 nanoseconds │ 12.4753 │ 254.371 │ 10 │ 100 │ |
| 24 | +│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 131 microseconds, 943 nanoseconds │ 75.3725 │ 1536.84 │ 10 │ 100 │ |
| 25 | +└────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ |
| 26 | +
|
| 27 | +[ Info: device = ClimaComms.CUDADevice() |
| 28 | +Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039 |
| 29 | +┌────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ |
| 30 | +│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ |
| 31 | +├────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ |
| 32 | +│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 1 millisecond, 45 microseconds │ 19.0163 │ 387.743 │ 10 │ 100 │ |
| 33 | +│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 258 microseconds, 120 nanoseconds │ 77.0559 │ 1571.17 │ 10 │ 100 │ |
| 34 | +│ TBB.thermo_func_bc!(x, us; nreps=100, bm) │ 1 millisecond, 46 microseconds │ 19.0147 │ 387.709 │ 10 │ 100 │ |
| 35 | +│ TBB.thermo_func_sol!(x_vec, us; nreps=100, bm) │ 257 microseconds, 915 nanoseconds │ 77.1171 │ 1572.42 │ 10 │ 100 │ |
| 36 | +└────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ |
| 37 | +``` |
| 38 | +=# |
| 39 | + |
| 40 | +#! format: off |
| 41 | +module ThermoBenchBandwidth |
| 42 | + |
| 43 | +include("benchmark_utils.jl") |
| 44 | + |
| 45 | +import ClimaCore |
| 46 | +import CUDA |
| 47 | +using ClimaComms |
| 48 | +using Test |
| 49 | +using StaticArrays, IntervalSets, LinearAlgebra |
| 50 | +using JET |
| 51 | + |
| 52 | +import ClimaCore: Spaces, Fields |
| 53 | +import ClimaCore.Domains: Geometry |
| 54 | + |
"""
    PhaseEquil{FT}

Toy thermodynamic state made of five scalars of type `FT`. Constructing one
only moves data around; no floating-point arithmetic is involved.
"""
struct PhaseEquil{FT}
    ρ::FT
    p::FT
    e_int::FT
    q_tot::FT
    T::FT
end

# All-zero state: the integer literal is converted to `FT` once and reused
# for every field.
@inline function Base.zero(::Type{PhaseEquil{FT}}) where {FT}
    z = convert(FT, 0)
    return PhaseEquil{FT}(z, z, z, z, z)
end
| 65 | + |
"""
    thermo_func_bc!(x, us; nreps = 1, bm = nothing, n_trials = 30)

Time the fused broadcast `@. ts = PhaseEquil(ρ, p, e_int, q_tot, T)` over the
properties of `x`.

Each trial times `nreps` back-to-back broadcasts with `CUDA.@elapsed`. The
fastest of the `n_trials` trials is forwarded to `push_info` along with `nreps`
and the per-point read/write count (10).

# Arguments
- `x`: object exposing the properties `ts`, `ρ`, `p`, `e_int`, `q_tot`, `T`;
  `ts` is overwritten in place by the broadcast.
- `us`: not used by this method.

# Keywords
- `nreps`: number of broadcasts inside each timed trial.
- `bm`: forwarded as the first argument of `push_info`.
- `n_trials`: number of timed trials; the minimum elapsed time is kept.

Returns `nothing`.
"""
function thermo_func_bc!(x, us; nreps = 1, bm = nothing, n_trials = 30)
    # Loop-invariant: unpack once, outside the timed region, so only the
    # broadcasts themselves are measured.
    (; ts, ρ, p, e_int, q_tot, T) = x
    e = Inf
    for _ in 1:n_trials
        et = CUDA.@elapsed begin
            for _ in 1:nreps
                @. ts = PhaseEquil(ρ, p, e_int, q_tot, T) # 5 reads, 5 writes, 0 flops
            end
        end
        # Keep the best (least disturbed) trial.
        e = min(e, et)
    end
    push_info(bm; e, nreps, caller = @caller_name(@__FILE__), n_reads_writes = 10)
    return nothing
end
| 80 | + |
"""
    thermo_func_sol!(x, us::UniversalSizesStatic; nreps = 1, bm = nothing, n_trials = 30)

Time the hand-written kernel `thermo_func_sol_kernel!` on the arrays held in
`x`.

The kernel is compiled (without launching) and its launch configuration is
computed once, before any timing. Each trial then times `nreps` back-to-back
launches with `CUDA.@elapsed`. The fastest of the `n_trials` trials is
forwarded to `push_info` along with `nreps` and the per-point read/write
count (10).

# Arguments
- `x`: object exposing the properties `ts`, `ρ`, `p`, `e_int`, `q_tot`, `T`,
  which are passed straight to the kernel.
- `us`: size descriptor; `get_N(us)` gives the number of points to process.

# Keywords
- `nreps`: number of kernel launches inside each timed trial.
- `bm`: forwarded as the first argument of `push_info`.
- `n_trials`: number of timed trials; the minimum elapsed time is kept.

Returns `nothing`.
"""
function thermo_func_sol!(x, us::UniversalSizesStatic; nreps = 1, bm = nothing, n_trials = 30)
    (; ts, ρ, p, e_int, q_tot, T) = x

    # One-time host-side setup, kept out of the timed region so that kernel
    # lookup/compilation and the launch-configuration query are not charged to
    # the measured bandwidth.
    kernel = CUDA.@cuda always_inline = true launch = false thermo_func_sol_kernel!(
        ts, ρ, p, e_int, q_tot, T, us,
    )
    N = get_N(us)
    config = CUDA.launch_configuration(kernel.fun)
    threads = min(N, config.threads)
    blocks = cld(N, threads) # enough blocks to cover all N points

    e = Inf
    for _ in 1:n_trials
        et = CUDA.@elapsed begin
            for _ in 1:nreps
                kernel(ts, ρ, p, e_int, q_tot, T, us; threads, blocks)
            end
        end
        # Keep the best (least disturbed) trial.
        e = min(e, et)
    end
    push_info(bm; e, nreps, caller = @caller_name(@__FILE__), n_reads_writes = 10)
    return nothing
end
| 100 | + |
"""
    thermo_func_sol_kernel!(ts, ρ, p, e_int, q_tot, T, us)

CUDA kernel body. Each thread derives one linear index `I` from its block and
thread indices, builds a `PhaseEquil` from the five inputs at `I`, and stores
its fields into the matching arrays of `ts`. Threads with `I > get_N(us)` do
nothing.

Mimics how indexing works in generalized pointwise kernels.

Indexing runs under `@inbounds`.
NOTE(review): assumes every array has at least `get_N(us)` elements; confirm
against the caller.
"""
function thermo_func_sol_kernel!(ts, ρ, p, e_int, q_tot, T, us)
    @inbounds begin
        # Linear 1-based index of this thread across the whole launch.
        I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        if I ≤ get_N(us)
            # 5 reads, 5 writes, 0 flops
            ts_i = PhaseEquil(ρ[I], p[I], e_int[I], q_tot[I], T[I])
            ts.ρ[I] = ts_i.ρ
            ts.p[I] = ts_i.p
            ts.T[I] = ts_i.T
            ts.e_int[I] = ts_i.e_int
            ts.q_tot[I] = ts_i.q_tot
        end
    end
    return nothing
end
| 118 | + |
| 119 | +end # module |
| 120 | + |
| 121 | +import .ThermoBenchBandwidth as TBB |
| 122 | + |
| 123 | +import CUDA |
| 124 | +using ClimaComms |
| 125 | +using ClimaCore |
| 126 | +import ClimaCore: Spaces, Fields |
| 127 | +import ClimaCore.Domains: Geometry |
| 128 | + |
| 129 | +ENV["CLIMACOMMS_DEVICE"] = "CUDA"; |
| 130 | +ClimaComms.@import_required_backends |
| 131 | +using BenchmarkTools |
| 132 | +@isdefined(TU) || include( |
| 133 | + joinpath(pkgdir(ClimaCore), "test", "TestUtilities", "TestUtilities.jl"), |
| 134 | +); |
| 135 | +import .TestUtilities as TU; |
| 136 | + |
| 137 | +using Test |
# Correctness check followed by the bandwidth benchmark for both implementations
# (ClimaCore broadcast vs. hand-written kernel).
@testset "Thermo state" begin
    FT = Float32
    # NOTE(review): problem_size is hard-coded and presumably has to be kept in
    # sync with zelem/helem/Nq below; confirm.
    bm = TBB.Benchmark(; problem_size = (63, 4, 4, 1, 5400), float_type = FT)
    device = ClimaComms.device()
    context = ClimaComms.context(device)
    cspace = TU.CenterExtrudedFiniteDifferenceSpace(
        FT;
        zelem = 63,
        context,
        helem = 30,
        Nq = 4,
    )
    @info "device = $device"
    # TODO: fill with non-trivial values (e.g., use Thermodynamics TestedProfiles) to verify correctness.
    nt_core = (; ρ = FT(1), p = FT(2), e_int = FT(3), q_tot = FT(4), T = FT(5))
    nt_ts = (;
        ρ = FT(0),
        p = FT(0),
        e_int = FT(0),
        q_tot = FT(0),
        T = FT(0),
    )
    # `x` stores `ts` as a PhaseEquil struct (input of the broadcast version);
    # `xv` stores `ts` as a plain NamedTuple (source of the flat-array version).
    x = fill((; ts = zero(TBB.PhaseEquil{FT}), nt_core...), cspace)
    xv = fill((; ts = nt_ts, nt_core...), cspace)
    (_, Nij, _, Nv, Nh) = size(Fields.field_values(x.ts))
    us = TBB.UniversalSizesStatic(Nv, Nij, Nh)

    # Turn every property of `ξ` into a flat CuArray, recursing into `:ts`, and
    # return the result as a NamedTuple with the same property names.
    function to_vec(ξ)
        pns = propertynames(ξ)
        dl_vals = map(pns) do pn
            val = getproperty(ξ, pn)
            pn == :ts ? to_vec(val) :
            CUDA.CuArray(collect(vec(parent(Fields.field_values(val)))))
        end
        return (; zip(pns, dl_vals)...)
    end
    x_vec = to_vec(xv)

    # Single un-recorded run of each implementation (no `bm`), so their outputs
    # can be compared below.
    TBB.thermo_func_bc!(x, us; nreps = 1, n_trials = 1)
    TBB.thermo_func_sol!(x_vec, us; nreps = 1, n_trials = 1)

    rc = Fields.rcompare(x_vec, to_vec(x))
    rc || Fields.rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
    @test rc # test correctness

    # Each implementation is benchmarked twice; both rounds are recorded in `bm`.
    TBB.thermo_func_bc!(x, us; nreps = 100, bm)
    TBB.thermo_func_sol!(x_vec, us; nreps = 100, bm)

    TBB.thermo_func_bc!(x, us; nreps = 100, bm)
    TBB.thermo_func_sol!(x_vec, us; nreps = 100, bm)

    TBB.tabulate_benchmark(bm)

end
| 192 | +#! format: on |
0 commit comments