@@ -3,6 +3,10 @@ julia --project=.buildkite
3
3
using Revise; include(joinpath("benchmarks", "scripts", "linear_vs_cartesian_indexing.jl"))
4
4
5
5
# Info:
6
+ This script compares two things:
7
+ - linear vs cartesian indexing
8
+ - impact of static vs dynamic NDRanges (https://juliagpu.github.io/KernelAbstractions.jl/dev/examples/memcopy_static/)
9
+
6
10
Linear indexing, when possible, has performance advantages
7
11
over using Cartesian indexing. Julia Base's Broadcast only
8
12
supports Cartesian indexing as it provides more general support
@@ -13,6 +17,9 @@ This script (re-)defines some broadcast machinery and tests
13
17
the performance of vector vs array operations in a broadcast
14
18
setting where linear indexing is allowed.
15
19
20
+ # Summary:
21
+ - Pointwise (`use_pw = true`): in the timings below, the array case runs about as fast as the vector case.
22
+
16
23
# References:
17
24
- https://github.com/CliMA/ClimaCore.jl/issues/1889
18
25
- https://github.com/JuliaLang/julia/issues/28126
@@ -23,27 +30,43 @@ setting where linear indexing is allowed.
23
30
Local Apple M1 Mac (CPU):
24
31
```
25
32
at_dot_call!($X_array, $Y_array):
26
- 146 milliseconds, 558 microseconds
33
+ 143 milliseconds, 774 microseconds
27
34
at_dot_call!($X_vector, $Y_vector):
28
- 65 milliseconds, 531 microseconds
29
- custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1))); printtb = false):
30
- 66 milliseconds, 735 microseconds
31
- custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = false):
32
- 145 milliseconds, 957 microseconds
33
- custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = true):
34
- 66 milliseconds, 320 microseconds
35
+ 65 milliseconds, 567 microseconds
36
+ custom_kernel_bc!($X_vector, $Y_vector, $us; printtb = false):
37
+ 66 milliseconds, 870 microseconds
38
+ custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = false):
39
+ 143 milliseconds, 643 microseconds
40
+ custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = true):
41
+ 65 milliseconds, 778 microseconds
42
+ custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb = false):
43
+ 65 milliseconds, 765 microseconds
44
+ custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = false):
45
+ 144 milliseconds, 271 microseconds
46
+ custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = true):
47
+ 66 milliseconds, 376 microseconds
35
48
```
36
49
37
50
Clima A100 (GPU):
38
51
```
52
+ at_dot_call!($X_array, $Y_array):
53
+ 6 milliseconds, 775 microseconds
39
54
at_dot_call!($X_vector, $Y_vector):
40
- 2 milliseconds, 848 microseconds
41
- custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1))); printtb = false):
42
- 2 milliseconds, 537 microseconds
43
- custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = false):
44
- 8 milliseconds, 804 microseconds
45
- custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = true):
46
- 2 milliseconds, 545 microseconds
55
+ 2 milliseconds, 834 microseconds
56
+ custom_sol_kernel!($X_vector, $Y_vector, $(Val(N))):
57
+ 2 milliseconds, 547 microseconds
58
+ custom_kernel_bc!($X_vector, $Y_vector, $us; printtb = false):
59
+ 2 milliseconds, 561 microseconds
60
+ custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = false):
61
+ 4 milliseconds, 160 microseconds
62
+ custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = true):
63
+ 2 milliseconds, 584 microseconds
64
+ custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb = false):
65
+ 2 milliseconds, 540 microseconds
66
+ custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = false):
67
+ 2 milliseconds, 715 microseconds
68
+ custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = true):
69
+ 2 milliseconds, 547 microseconds
47
70
```
48
71
=#
49
72
@@ -239,7 +262,7 @@ function at_dot_call!(X, Y)
239
262
return nothing
240
263
end ;
241
264
242
- function custom_kernel ! (X, Y, :: Val{N} ) where {N}
265
+ function custom_sol_kernel ! (X, Y, :: Val{N} ) where {N}
243
266
(; x1, x2, x3) = X
244
267
(; y1) = Y
245
268
kernel = CUDA. @cuda always_inline = true launch = false custom_kernel_knl! (
@@ -267,7 +290,27 @@ function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N}
267
290
return nothing
268
291
end ;
269
292
270
- function custom_kernel_bc! (X, Y, :: Val{N} ; printtb= true , use_pw= true ) where {N}
293
"""
    AbstractUniversalSizes{Nv, Nij}

Supertype of the size descriptors used by the benchmark kernels. `Nv` and
`Nij` are always carried as type parameters.
"""
abstract type AbstractUniversalSizes{Nv, Nij} end

"""
    UniversalSizesCC{Nv, Nij}

Size descriptor with `Nv` and `Nij` as type parameters and `Nh` stored in a
runtime `Int` field.
"""
struct UniversalSizesCC{Nv, Nij} <: AbstractUniversalSizes{Nv, Nij}
    Nh::Int
end

"""
    UniversalSizesStatic{Nv, Nij, Nh}

Field-less size descriptor: `Nv`, `Nij` and `Nh` are all type parameters.
"""
struct UniversalSizesStatic{Nv, Nij, Nh} <: AbstractUniversalSizes{Nv, Nij} end

# Positional constructors: lift the given sizes into the type parameters
# (and, for `UniversalSizesCC`, store `Nh` in the field).
function UniversalSizesCC(Nv, Nij, Nh)
    return UniversalSizesCC{Nv, Nij}(Nh)
end

function UniversalSizesStatic(Nv, Nij, Nh)
    return UniversalSizesStatic{Nv, Nij, Nh}()
end

# Accessors. `Nv` and `Nij` are always read back from the type parameters.
function get_Nv(::AbstractUniversalSizes{V}) where {V}
    return V
end

function get_Nij(::AbstractUniversalSizes{V, IJ}) where {V, IJ}
    return IJ
end

# `Nh` comes from the field for the dynamic variant and from the type
# parameter for the static variant.
function get_Nh(sizes::UniversalSizesCC)
    return sizes.Nh
end

function get_Nh(::UniversalSizesStatic{V, IJ, H}) where {V, IJ, H}
    return H
end

# Total number of points: Nv * Nij * Nij * 1 * Nh.
# NOTE(review): the literal 1 presumably stands for the field dimension (Nf) — confirm.
function get_N(us::AbstractUniversalSizes{V, IJ}) where {V, IJ}
    dims = (V, IJ, IJ, 1, get_Nh(us))
    return prod(dims)
end
306
using Test

# Smoke test: the dynamic and static size descriptors must report identical
# sizes when both are built from the same (Nv, Nij, Nh) triple.
us_tup = (1, 2, 3)
let dynamic_sizes = UniversalSizesCC(us_tup...),
    static_sizes = UniversalSizesStatic(us_tup...)

    @test get_Nv(dynamic_sizes) == get_Nv(static_sizes)
    @test get_Nij(dynamic_sizes) == get_Nij(static_sizes)
    @test get_Nh(dynamic_sizes) == get_Nh(static_sizes)
    @test get_N(dynamic_sizes) == get_N(static_sizes)
end
312
+
313
+ function custom_kernel_bc! (X, Y, us:: AbstractUniversalSizes ; printtb= true , use_pw= true )
271
314
(; x1, x2, x3) = X
272
315
(; y1) = Y
273
316
bc_base = @lazy @. y1 = myadd (x1, x2, x3)
@@ -281,7 +324,7 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
281
324
end
282
325
else
283
326
for i in 1 : 100 # reduce variance / impact of launch latency
284
- @inbounds @simd for j in 1 : N
327
+ @inbounds @simd for j in 1 : get_N (us)
285
328
y1[j] = bc[j]
286
329
end
287
330
end
@@ -291,28 +334,28 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
291
334
CUDA. @cuda always_inline = true launch = false custom_kernel_knl_bc! (
292
335
y1,
293
336
bc,
294
- Val (N) ,
337
+ us ,
295
338
)
296
339
config = CUDA. launch_configuration (kernel. fun)
297
340
threads = min (N, config. threads)
298
341
blocks = cld (N, threads)
299
342
printtb && @show blocks, threads
300
343
for i in 1 : 100 # reduce variance / impact of launch latency
301
- kernel (y1, bc, Val (N) ; threads, blocks)
344
+ kernel (y1, bc,us ; threads, blocks)
302
345
end
303
346
end
304
347
return nothing
305
348
end ;
306
349
@inline get_cart_lin_index (bc, n, I) = I
307
350
@inline get_cart_lin_index (bc:: Base.Broadcast.Broadcasted , n, I) =
308
351
CartesianIndices (map (x -> Base. OneTo (x), n))[I]
309
- function custom_kernel_knl_bc! (y1, bc, :: Val{N} ) where {N}
352
+ function custom_kernel_knl_bc! (y1, bc, us)
310
353
@inbounds begin
311
354
I = (CUDA. blockIdx (). x - Int32 (1 )) * CUDA. blockDim (). x + CUDA. threadIdx (). x
312
- n = size (y1 )
313
- if 1 ≤ I ≤ N
314
- ind = get_cart_lin_index (bc, n, I)
315
- y1[ind ] = bc[ind ]
355
+ if 1 ≤ I ≤ get_N (us )
356
+ n = ( get_Nv (us), get_Nij (us), get_Nij (us), 1 , get_Nh (us))
357
+ ci = get_cart_lin_index (bc, n, I)
358
+ y1[ci ] = bc[ci ]
316
359
end
317
360
end
318
361
return nothing
@@ -327,16 +370,31 @@ X_vector = to_vec(X_array);
327
370
Y_vector = to_vec (Y_array);
328
371
at_dot_call! (X_array, Y_array)
329
372
at_dot_call! (X_vector, Y_vector)
330
- # custom_kernel!(X_vector, Y_vector, Val(length(X_vector.x1)))
331
- custom_kernel_bc! (X_vector, Y_vector, Val (length (X_vector. x1)))
332
- custom_kernel_bc! (X_array, Y_array, Val (length (X_vector. x1)); use_pw= false )
333
- custom_kernel_bc! (X_array, Y_array, Val (length (X_vector. x1)); use_pw= true )
373
+ N = length (X_vector. x1)
374
+ (Nv, Nij, _, Nf, Nh) = size (Y_array. y1);
375
+ us = UniversalSizesCC (Nv, Nij, Nh);
376
+ uss = UniversalSizesStatic (Nv, Nij, Nh);
377
+ @test get_N (us) == N
378
+ @test get_N (uss) == N
379
+ iscpu = ArrayType === identity
380
+ iscpu || custom_sol_kernel! (X_vector, Y_vector, Val (N))
381
+ custom_kernel_bc! (X_vector, Y_vector, us)
382
+ custom_kernel_bc! (X_array, Y_array, us; use_pw= false )
383
+ custom_kernel_bc! (X_array, Y_array, us; use_pw= true )
384
+
385
+ custom_kernel_bc! (X_vector, Y_vector, uss)
386
+ custom_kernel_bc! (X_array, Y_array, uss; use_pw= false )
387
+ custom_kernel_bc! (X_array, Y_array, uss; use_pw= true )
334
388
335
389
@pretty_belapsed at_dot_call! ($ X_array, $ Y_array) # slow
336
390
@pretty_belapsed at_dot_call! ($ X_vector, $ Y_vector) # fast
337
- # @pretty_belapsed custom_kernel!($X_vector, $Y_vector, $(Val(length(X_vector.x1))))
338
- @pretty_belapsed custom_kernel_bc! ($ X_vector, $ Y_vector, $ (Val (length (X_vector. x1)));printtb= false )
339
- @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ (Val (length (X_vector. x1)));printtb= false , use_pw= false )
340
- @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ (Val (length (X_vector. x1)));printtb= false , use_pw= true )
391
+ iscpu || @pretty_belapsed custom_sol_kernel! ($ X_vector, $ Y_vector, $ (Val (N)))
392
+ @pretty_belapsed custom_kernel_bc! ($ X_vector, $ Y_vector, $ us; printtb= false )
393
+ @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ us; printtb= false , use_pw= false )
394
+ @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ us; printtb= false , use_pw= true )
395
+
396
+ @pretty_belapsed custom_kernel_bc! ($ X_vector, $ Y_vector, $ uss; printtb= false )
397
+ @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ uss; printtb= false , use_pw= false )
398
+ @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ uss; printtb= false , use_pw= true )
341
399
342
400
# ! format: on
0 commit comments