Skip to content

Commit bc1a12e

Browse files
Fix and update benchmark script
1 parent e7b2c9b commit bc1a12e

File tree

1 file changed

+91
-33
lines changed

1 file changed

+91
-33
lines changed

benchmarks/scripts/linear_vs_cartesian_indexing.jl

Lines changed: 91 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ julia --project=.buildkite
33
using Revise; include(joinpath("benchmarks", "scripts", "linear_vs_cartesian_indexing.jl"))
44
55
# Info:
6+
This script compares two things:
7+
- linear vs cartesian indexing
8+
- impact of static vs dynamic NDRanges (https://juliagpu.github.io/KernelAbstractions.jl/dev/examples/memcopy_static/)
9+
610
Linear indexing, when possible, has performance advantages
711
over using Cartesian indexing. Julia Base's Broadcast only
812
supports Cartesian indexing as it provides more general support
@@ -13,6 +17,9 @@ This script (re-)defines some broadcast machinery and tests
1317
the performance of vector vs array operations in a broadcast
1418
setting where linear indexing is allowed.
1519
20+
# Summary:
21+
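- Pointwise (`use_pw = true`) indexing brings the array broadcasts on par with the vector (linear indexing) timings listed below; with static sizes (`uss`), Cartesian indexing on the A100 is also close to the linear-indexing timings.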
22+
1623
# References:
1724
- https://github.com/CliMA/ClimaCore.jl/issues/1889
1825
- https://github.com/JuliaLang/julia/issues/28126
@@ -23,27 +30,43 @@ setting where linear indexing is allowed.
2330
Local Apple M1 Mac (CPU):
2431
```
2532
at_dot_call!($X_array, $Y_array):
26-
146 milliseconds, 558 microseconds
33+
143 milliseconds, 774 microseconds
2734
at_dot_call!($X_vector, $Y_vector):
28-
65 milliseconds, 531 microseconds
29-
custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1))); printtb = false):
30-
66 milliseconds, 735 microseconds
31-
custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = false):
32-
145 milliseconds, 957 microseconds
33-
custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = true):
34-
66 milliseconds, 320 microseconds
35+
65 milliseconds, 567 microseconds
36+
custom_kernel_bc!($X_vector, $Y_vector, $us; printtb = false):
37+
66 milliseconds, 870 microseconds
38+
custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = false):
39+
143 milliseconds, 643 microseconds
40+
custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = true):
41+
65 milliseconds, 778 microseconds
42+
custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb = false):
43+
65 milliseconds, 765 microseconds
44+
custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = false):
45+
144 milliseconds, 271 microseconds
46+
custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = true):
47+
66 milliseconds, 376 microseconds
3548
```
3649
3750
Clima A100 (GPU):
3851
```
52+
at_dot_call!($X_array, $Y_array):
53+
6 milliseconds, 775 microseconds
3954
at_dot_call!($X_vector, $Y_vector):
40-
2 milliseconds, 848 microseconds
41-
custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1))); printtb = false):
42-
2 milliseconds, 537 microseconds
43-
custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = false):
44-
8 milliseconds, 804 microseconds
45-
custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = true):
46-
2 milliseconds, 545 microseconds
55+
2 milliseconds, 834 microseconds
56+
custom_sol_kernel!($X_vector, $Y_vector, $(Val(N))):
57+
2 milliseconds, 547 microseconds
58+
custom_kernel_bc!($X_vector, $Y_vector, $us; printtb = false):
59+
2 milliseconds, 561 microseconds
60+
custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = false):
61+
4 milliseconds, 160 microseconds
62+
custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = true):
63+
2 milliseconds, 584 microseconds
64+
custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb = false):
65+
2 milliseconds, 540 microseconds
66+
custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = false):
67+
2 milliseconds, 715 microseconds
68+
custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = true):
69+
2 milliseconds, 547 microseconds
4770
```
4871
=#
4972

@@ -239,7 +262,7 @@ function at_dot_call!(X, Y)
239262
return nothing
240263
end;
241264

242-
function custom_kernel!(X, Y, ::Val{N}) where {N}
265+
function custom_sol_kernel!(X, Y, ::Val{N}) where {N}
243266
(; x1, x2, x3) = X
244267
(; y1) = Y
245268
kernel = CUDA.@cuda always_inline = true launch = false custom_kernel_knl!(
@@ -267,7 +290,27 @@ function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N}
267290
return nothing
268291
end;
269292

270-
function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
293+
"""
    AbstractUniversalSizes{Nv, Nij}

Supertype for size descriptors that carry `Nv` and `Nij` as type parameters.
Concrete subtypes differ in how `Nh` is stored; read it through `get_Nh`.
"""
abstract type AbstractUniversalSizes{Nv, Nij} end

"""
    UniversalSizesCC{Nv, Nij}

Size descriptor with `Nv` and `Nij` in the type domain and `Nh` stored as a
runtime `Int` field.
"""
struct UniversalSizesCC{Nv, Nij} <: AbstractUniversalSizes{Nv, Nij}
    Nh::Int
end

"""
    UniversalSizesStatic{Nv, Nij, Nh}

Size descriptor in which `Nv`, `Nij` and `Nh` are all type parameters, so the
struct itself has no fields.
"""
struct UniversalSizesStatic{Nv, Nij, Nh} <: AbstractUniversalSizes{Nv, Nij} end

# Positional convenience constructors: both take the same `(Nv, Nij, Nh)`
# triple so the two descriptor kinds can be built interchangeably.
function UniversalSizesCC(Nv, Nij, Nh)
    return UniversalSizesCC{Nv, Nij}(Nh)
end

function UniversalSizesStatic(Nv, Nij, Nh)
    return UniversalSizesStatic{Nv, Nij, Nh}()
end

"""
    get_Nv(us)

Return the `Nv` type parameter of `us`.
"""
function get_Nv(::AbstractUniversalSizes{Nv}) where {Nv}
    return Nv
end

"""
    get_Nij(us)

Return the `Nij` type parameter of `us`.
"""
function get_Nij(::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij}
    return Nij
end

"""
    get_Nh(us)

Return `Nh`: read from the field for `UniversalSizesCC`, and from the type
parameter for `UniversalSizesStatic`.
"""
function get_Nh(us::UniversalSizesCC)
    return us.Nh
end

function get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh}
    return Nh
end

"""
    get_N(us)

Return the total number of points, i.e. the product of the extents
`(Nv, Nij, Nij, 1, Nh)`.
"""
function get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij}
    extents = (Nv, Nij, Nij, 1, get_Nh(us))
    return prod(extents)
end
306+
using Test
307+
# Sanity check: descriptors built from the same (Nv, Nij, Nh) triple must agree
# through every accessor, whether `Nh` is a runtime field or a type parameter.
us_tup = (1, 2, 3)
for getter in (get_Nv, get_Nij, get_Nh, get_N)
    @test getter(UniversalSizesCC(us_tup...)) == getter(UniversalSizesStatic(us_tup...))
end
312+
313+
function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=true, use_pw=true)
271314
(; x1, x2, x3) = X
272315
(; y1) = Y
273316
bc_base = @lazy @. y1 = myadd(x1, x2, x3)
@@ -281,7 +324,7 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
281324
end
282325
else
283326
for i in 1:100 # reduce variance / impact of launch latency
284-
@inbounds @simd for j in 1:N
327+
@inbounds @simd for j in 1:get_N(us)
285328
y1[j] = bc[j]
286329
end
287330
end
@@ -291,28 +334,28 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
291334
CUDA.@cuda always_inline = true launch = false custom_kernel_knl_bc!(
292335
y1,
293336
bc,
294-
Val(N),
337+
us,
295338
)
296339
config = CUDA.launch_configuration(kernel.fun)
297340
threads = min(N, config.threads)
298341
blocks = cld(N, threads)
299342
printtb && @show blocks, threads
300343
for i in 1:100 # reduce variance / impact of launch latency
301-
kernel(y1, bc, Val(N); threads, blocks)
344+
kernel(y1, bc,us; threads, blocks)
302345
end
303346
end
304347
return nothing
305348
end;
306349
# Select the index used to address element number `I`.
#
# Fallback method: `I` is returned unchanged, so it is used as a linear index.
# `bc` and `n` are accepted only so both methods share one call signature.
@inline function get_cart_lin_index(bc, n, I)
    return I
end

# `Base.Broadcast.Broadcasted` method: convert the linear position `I` into the
# matching `CartesianIndex` of a box whose axes are `1:n[1]`, `1:n[2]`, ...
@inline function get_cart_lin_index(bc::Base.Broadcast.Broadcasted, n, I)
    one_based_axes = map(Base.OneTo, n)
    return CartesianIndices(one_based_axes)[I]
end
309-
function custom_kernel_knl_bc!(y1, bc, ::Val{N}) where {N}
352+
function custom_kernel_knl_bc!(y1, bc, us)
310353
@inbounds begin
311354
I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
312-
n = size(y1)
313-
if 1 ≤ I ≤ N
314-
ind = get_cart_lin_index(bc, n, I)
315-
y1[ind] = bc[ind]
355+
if 1 ≤ I ≤ get_N(us)
356+
n = (get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us))
357+
ci = get_cart_lin_index(bc, n, I)
358+
y1[ci] = bc[ci]
316359
end
317360
end
318361
return nothing
@@ -327,16 +370,31 @@ X_vector = to_vec(X_array);
327370
Y_vector = to_vec(Y_array);
328371
at_dot_call!(X_array, Y_array)
329372
at_dot_call!(X_vector, Y_vector)
330-
# custom_kernel!(X_vector, Y_vector, Val(length(X_vector.x1)))
331-
custom_kernel_bc!(X_vector, Y_vector, Val(length(X_vector.x1)))
332-
custom_kernel_bc!(X_array, Y_array, Val(length(X_vector.x1)); use_pw=false)
333-
custom_kernel_bc!(X_array, Y_array, Val(length(X_vector.x1)); use_pw=true)
373+
N = length(X_vector.x1)
374+
(Nv, Nij, _, Nf, Nh) = size(Y_array.y1);
375+
us = UniversalSizesCC(Nv, Nij, Nh);
376+
uss = UniversalSizesStatic(Nv, Nij, Nh);
377+
@test get_N(us) == N
378+
@test get_N(uss) == N
379+
iscpu = ArrayType === identity
380+
iscpu || custom_sol_kernel!(X_vector, Y_vector, Val(N))
381+
custom_kernel_bc!(X_vector, Y_vector, us)
382+
custom_kernel_bc!(X_array, Y_array, us; use_pw=false)
383+
custom_kernel_bc!(X_array, Y_array, us; use_pw=true)
384+
385+
custom_kernel_bc!(X_vector, Y_vector, uss)
386+
custom_kernel_bc!(X_array, Y_array, uss; use_pw=false)
387+
custom_kernel_bc!(X_array, Y_array, uss; use_pw=true)
334388

335389
@pretty_belapsed at_dot_call!($X_array, $Y_array) # slow
336390
@pretty_belapsed at_dot_call!($X_vector, $Y_vector) # fast
337-
# @pretty_belapsed custom_kernel!($X_vector, $Y_vector, $(Val(length(X_vector.x1))))
338-
@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1)));printtb=false)
339-
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1)));printtb=false, use_pw=false)
340-
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1)));printtb=false, use_pw=true)
391+
iscpu || @pretty_belapsed custom_sol_kernel!($X_vector, $Y_vector, $(Val(N)))
392+
@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $us; printtb=false)
393+
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $us; printtb=false, use_pw=false)
394+
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $us; printtb=false, use_pw=true)
395+
396+
@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb=false)
397+
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss; printtb=false, use_pw=false)
398+
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss; printtb=false, use_pw=true)
341399

342400
#! format: on

0 commit comments

Comments
 (0)