Skip to content

Commit 256319a

Browse files
Refactor and update benchmarks
1 parent 9dd63b9 commit 256319a

File tree

7 files changed

+221
-131
lines changed

7 files changed

+221
-131
lines changed

benchmarks/scripts/benchmark_offset.jl

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,32 +14,33 @@ using Revise; include(joinpath("benchmarks", "scripts", "benchmark_offset.jl"))
1414
Clima A100:
1515
```
1616
[ Info: ArrayType = CuArray
17-
Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
18-
┌────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
19-
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps
20-
├────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
21-
BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 834 nanoseconds │ 57.7908 │ 1178.35 │ 4 │ 100
22-
BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 153 nanoseconds │ 68.4046 │ 1394.77 │ 4 │ 100
23-
BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 576 nanoseconds │ 70.3113 │ 1433.65 │ 4 │ 100
24-
BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 185 nanoseconds │ 59.2089 │ 1207.27 │ 4 │ 100
25-
└────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
17+
Problem size: (63, 4, 4, 1, 5400), N reads-writes: 4, N-reps: 100, Float_type = Float32, Device_bandwidth_GBs=2039
18+
┌────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┐
19+
│ funcs │ time per call │ bw % │ achieved bw │
20+
├────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┤
21+
│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 84 microseconds, 726 nanoseconds │ 46.9507 │ 957.324
22+
│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 102 nanoseconds │ 68.4649 │ 1396.0
23+
│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 331 nanoseconds │ 70.618 │ 1439.9
24+
│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 390 nanoseconds │ 59.029 │ 1203.6
25+
└────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┘
2626
2727
[ Info: ArrayType = CuArray
28-
Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
29-
┌────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
30-
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps
31-
├────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
32-
BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 967 nanoseconds │ 57.6793 │ 1176.08 │ 4 │ 100
33-
BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 82 nanoseconds │ 68.489 │ 1396.49 │ 4 │ 100
34-
BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 597 nanoseconds │ 70.2858 │ 1433.13 │ 4 │ 100
35-
BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 288 nanoseconds │ 59.1188 │ 1205.43 │ 4 │ 100
36-
└────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
28+
Problem size: (63, 4, 4, 1, 5400), N reads-writes: 4, N-reps: 100, Float_type = Float64, Device_bandwidth_GBs=2039
29+
┌────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┐
30+
│ funcs │ time per call │ bw % │ achieved bw │
31+
├────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┤
32+
│ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 107 microseconds, 387 nanoseconds │ 74.086 │ 1510.61
33+
│ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 105 microseconds, 42 nanoseconds │ 75.7399 │ 1544.34
34+
│ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 102 microseconds, 636 nanoseconds │ 77.5157 │ 1580.54
35+
│ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 106 microseconds, 896 nanoseconds │ 74.4266 │ 1517.56
36+
└────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┘
3737
```
3838
=#
3939

4040
#! format: off
4141
module BenchmarkOffset
4242

43+
import CUDA
4344
include("benchmark_utils.jl")
4445

4546
add3(x1, x2, x3) = x1 + x2 + x3
@@ -76,7 +77,7 @@ function aos_cart_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
7677
e = min(e, et)
7778
end
7879
end
79-
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
80+
push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
8081
return nothing
8182
end;
8283
function aos_cart_offset_kernel!(X, Y, us)
@@ -131,7 +132,7 @@ function aos_lin_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
131132
e = min(e, et)
132133
end
133134
end
134-
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
135+
push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
135136
return nothing
136137
end;
137138
function aos_lin_offset_kernel!(X, Y, us)
@@ -184,7 +185,7 @@ function soa_cart_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
184185
e = min(e, et)
185186
end
186187
end
187-
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
188+
push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
188189
return nothing
189190
end;
190191
function soa_cart_index_kernel!(X, Y, us)
@@ -229,7 +230,7 @@ function soa_linear_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
229230
e = min(e, et)
230231
end
231232
end
232-
push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=4)
233+
push_info(bm; kernel_time_s=e/nreps, nreps, caller = @caller_name(@__FILE__),problem_size=size(us),n_reads_writes=4)
233234
return nothing
234235
end;
235236
function soa_linear_index_kernel!(X, Y, us)
@@ -258,9 +259,10 @@ end
258259
using CUDA
259260
using Test
260261
@testset "Offset benchmark" begin
261-
bm = BO.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32) # size(problem_size, 4) == 1 to avoid double counting reads/writes
262262
ArrayType = CUDA.CuArray;
263263
# ArrayType = Base.identity;
264+
device_name = CUDA.name(CUDA.device())
265+
bm = BO.Benchmark(;problem_size=(63,4,4,1,5400), device_name, float_type=Float32) # size(problem_size, 4) == 1 to avoid double counting reads/writes
264266
arr(float_type, problem_size, T) = T(zeros(float_type, problem_size...))
265267

266268
FT = Float64;

benchmarks/scripts/benchmark_utils.jl

Lines changed: 97 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
import CUDA
1+
# import CUDA
2+
import ClimaComms
23
using BenchmarkTools, Dates
34
using LazyBroadcast: @lazy
45

@@ -14,21 +15,40 @@ macro caller_name(f)
1415
end
1516
end
1617

18+
"""
    device_info(device_name::String)

Return a named tuple `(; device_bandwidth_GBs, exists, name)` describing
`device_name`.

When `ClimaComms.device()` is a `ClimaComms.CUDADevice` and `device_name` is
one of the entries in the internal spec table, `exists` is `true` and
`device_bandwidth_GBs` is the tabulated value. Otherwise `exists` is `false`
and `device_bandwidth_GBs` is the placeholder `1`. `name` is always the
`device_name` that was passed in.

Call with `device_info(CUDA.name(CUDA.device()))`
"""
function device_info(device_name)
    # Table of devices whose bandwidth has been looked up.
    known_specs = Dict(
        "NVIDIA A100-SXM4-80GB" => (; device_bandwidth_GBs = 2_039),
        "Tesla P100-PCIE-16GB" => (; device_bandwidth_GBs = 732),
    )
    on_cuda = ClimaComms.device() isa ClimaComms.CUDADevice
    # Only consult the table when running on a CUDA device.
    spec = on_cuda ? get(known_specs, device_name, nothing) : nothing
    if isnothing(spec)
        return (; device_bandwidth_GBs = 1, exists = false, name = device_name)
    end
    return (;
        device_bandwidth_GBs = spec.device_bandwidth_GBs,
        exists = true,
        name = device_name,
    )
end
36+
1737
Base.@kwdef mutable struct Benchmark
18-
problem_size::Tuple
38+
problem_size = nothing
1939
float_type::Type
20-
device_bandwidth_GBs::Int = 2_039 # (A100 SXM4 80GB)
2140
data::Vector = []
41+
unfound_device::Bool = false
42+
unfound_device_name::String = ""
43+
device_name::String = ""
2244
end
2345

24-
function perf_stats(; bm::Benchmark, kernel_time_s, n_reads_writes)
25-
N = prod(bm.problem_size)
26-
GB = N * n_reads_writes * sizeof(bm.float_type) / 1024^3
27-
achieved_bandwidth_GBs = GB / kernel_time_s
28-
bandwidth_efficiency =
29-
achieved_bandwidth_GBs / bm.device_bandwidth_GBs * 100
30-
return (; N, GB, achieved_bandwidth_GBs, bandwidth_efficiency)
31-
end;
46+
"""
    print_unfound_devices(bm::Benchmark)

Print a reminder to add the device's specs when `bm.unfound_device` is set,
naming the device via `bm.unfound_device_name`. Do nothing when the flag is
`false`. Always returns `nothing`.
"""
function print_unfound_devices(bm::Benchmark)
    bm.unfound_device || return nothing
    # The spec table lives in `device_info`, so point the reader there.
    println("\nUnfound device: $(bm.unfound_device_name). Please")
    println("look up specs and add to device_info() in")
    println("$(@__FILE__).\n")
    return nothing
end
3252

3353
time_and_units_str(x::Real) =
3454
trunc_time(string(compound_period(x, Dates.Second)))
@@ -51,46 +71,98 @@ get_Nh(us::UniversalSizesCC) = us.Nh
5171
get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh
5272
get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} =
5373
prod((Nv, Nij, Nij, 1, get_Nh(us)))
74+
"""
    Base.size(us::AbstractUniversalSizes)

Return the tuple `(Nv, Nij, Nij, 1, Nh)`, where `Nv` and `Nij` are the type
parameters of `us` and `Nh` is `get_Nh(us)`.
"""
function Base.size(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij}
    Nh = get_Nh(us)
    return (Nv, Nij, Nij, 1, Nh)
end
5476
UniversalSizesCC(Nv, Nij, Nh) = UniversalSizesCC{Nv, Nij}(Nh)
5577
UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}()
5678

5779
import PrettyTables
5880
function tabulate_benchmark(bm)
59-
funcs = map(x -> x.caller, bm.data)
81+
funcs = map(x -> strip(x.caller), bm.data)
6082
timings = map(x -> time_and_units_str(x.kernel_time_s), bm.data)
6183
n_reads_writes = map(x -> x.n_reads_writes, bm.data)
6284
nreps = map(x -> x.nreps, bm.data)
85+
dinfo = device_info(bm.device_name)
6386
achieved_bandwidth_GBs = map(x -> x.achieved_bandwidth_GBs, bm.data)
64-
bandwidth_efficiency = map(x -> x.bandwidth_efficiency, bm.data)
87+
bandwidth_efficiency = if dinfo.exists
88+
map(x -> x / dinfo.device_bandwidth_GBs * 100, achieved_bandwidth_GBs)
89+
else
90+
()
91+
end
92+
problem_size = map(x -> x.problem_size, bm.data)
93+
# if we specify the problem size up front, then make
94+
# sure that there is no variation when collecting:
95+
if !isnothing(bm.problem_size)
96+
@assert all(prod.(problem_size) .== prod(bm.problem_size))
97+
end
98+
N = map(x -> prod(x), problem_size)
99+
no_bw_efficiency = length(bandwidth_efficiency) == 0
65100
header = [
66101
"funcs",
67102
"time per call",
68-
"bw %",
103+
(no_bw_efficiency ? () : ("bw %",))...,
69104
"achieved bw",
70-
"n-reads/writes",
71-
"n-reps",
105+
(allequal(n_reads_writes) ? () : ("N reads-writes",))...,
106+
(allequal(N) ? () : ("problem size",))...,
107+
(allequal(nreps) ? () : ("n-reps",))...,
72108
]
73-
data = hcat(
109+
args = (
74110
funcs,
75111
timings,
76-
bandwidth_efficiency,
112+
(no_bw_efficiency ? () : (bandwidth_efficiency,))...,
77113
achieved_bandwidth_GBs,
78-
n_reads_writes,
79-
nreps,
114+
(allequal(n_reads_writes) ? () : (n_reads_writes,))...,
115+
(allequal(N) ? () : (problem_size,))...,
116+
(allequal(nreps) ? () : (nreps,))...,
117+
)
118+
data = hcat(args...)
119+
n_reads_writes_str =
120+
allequal(n_reads_writes) ? "N reads-writes: $(n_reads_writes[1]), " : ""
121+
problem_size_str = allequal(N) ? "Problem size: $(problem_size[1]), " : ""
122+
nreps_str = allequal(nreps) ? "N-reps: $(nreps[1]), " : ""
123+
device_bandwidth_GBs_str =
124+
dinfo.exists ? "Device_bandwidth_GBs=$(dinfo.device_bandwidth_GBs)" : ""
125+
print_unfound_devices(bm)
126+
title = strip(
127+
"$problem_size_str$n_reads_writes_str$nreps_str Float_type = $(bm.float_type), $device_bandwidth_GBs_str",
80128
)
81-
title = "Problem size: $(bm.problem_size), float_type = $(bm.float_type), device_bandwidth_GBs=$(bm.device_bandwidth_GBs)"
82129
PrettyTables.pretty_table(data; title, header, alignment = :l, crop = :none)
83130
end
84131

85-
push_info(bm::Nothing; e, nreps, caller, n_reads_writes) = nothing
86-
function push_info(bm; e, nreps, caller, n_reads_writes)
87-
kernel_time_s = e / nreps
132+
push_info(
133+
bm::Nothing;
134+
kernel_time_s,
135+
nreps,
136+
caller,
137+
n_reads_writes,
138+
problem_size,
139+
) = nothing
140+
function push_info(
141+
bm;
142+
kernel_time_s,
143+
nreps,
144+
caller,
145+
n_reads_writes,
146+
problem_size,
147+
)
148+
N = prod(problem_size)
149+
GB = N * n_reads_writes * sizeof(bm.float_type) / 1024^3
150+
achieved_bandwidth_GBs = GB / kernel_time_s
151+
dinfo = device_info(bm.device_name)
152+
if !dinfo.exists
153+
bm.unfound_device = true
154+
bm.unfound_device_name = dinfo.name
155+
end
156+
88157
nt = (;
89158
caller,
90159
kernel_time_s,
91160
n_reads_writes,
92161
nreps,
93-
perf_stats(; bm, kernel_time_s, n_reads_writes)...,
162+
problem_size,
163+
N,
164+
GB,
165+
achieved_bandwidth_GBs,
94166
)
95167
push!(bm.data, nt)
96168
end

0 commit comments

Comments
 (0)