@@ -14,32 +14,33 @@ using Revise; include(joinpath("benchmarks", "scripts", "benchmark_offset.jl"))
14
14
Clima A100:
15
15
```
16
16
[ Info: ArrayType = CuArray
17
- Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs =2039
18
- ┌──────────────────────────────────────────────────────────────────── ┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬ ────────┐
19
- │ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
20
- ├──────────────────────────────────────────────────────────────────── ┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼ ────────┤
21
- │ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 834 nanoseconds │ 57.7908 │ 1178.35 │ 4 │ 100 │
22
- │ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 153 nanoseconds │ 68.4046 │ 1394.77 │ 4 │ 100 │
23
- │ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 576 nanoseconds │ 70.3113 │ 1433.65 │ 4 │ 100 │
24
- │ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 185 nanoseconds │ 59.2089 │ 1207.27 │ 4 │ 100 │
25
- └──────────────────────────────────────────────────────────────────── ┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴ ────────┘
17
+ Problem size: (63, 4, 4, 1, 5400), N reads-writes: 4, N-reps: 100, Float_type = Float32, Device_bandwidth_GBs =2039
18
+ ┌────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┐
19
+ │ funcs │ time per call │ bw % │ achieved bw │
20
+ ├────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┤
21
+ │ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 84 microseconds, 726 nanoseconds │ 46.9507 │ 957.324 │
22
+ │ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 102 nanoseconds │ 68.4649 │ 1396.0 │
23
+ │ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 331 nanoseconds │ 70.618 │ 1439.9 │
24
+ │ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 390 nanoseconds │ 59.029 │ 1203.6 │
25
+ └────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┘
26
26
27
27
[ Info: ArrayType = CuArray
28
- Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs =2039
29
- ┌──────────────────────────────────────────────────────────────────── ┬──────────────────────────────────┬───────── ┬───────────── ┬────────────────┬ ────────┐
30
- │ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
31
- ├──────────────────────────────────────────────────────────────────── ┼──────────────────────────────────┼───────── ┼───────────── ┼────────────────┼ ────────┤
32
- │ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 68 microseconds, 967 nanoseconds │ 57.6793 │ 1176.08 │ 4 │ 100 │
33
- │ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 58 microseconds, 82 nanoseconds │ 68.489 │ 1396.49 │ 4 │ 100 │
34
- │ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 597 nanoseconds │ 70.2858 │ 1433.13 │ 4 │ 100 │
35
- │ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 288 nanoseconds │ 59.1188 │ 1205.43 │ 4 │ 100 │
36
- └──────────────────────────────────────────────────────────────────── ┴──────────────────────────────────┴───────── ┴───────────── ┴────────────────┴ ────────┘
28
+ Problem size: (63, 4, 4, 1, 5400), N reads-writes: 4, N-reps: 100, Float_type = Float64, Device_bandwidth_GBs =2039
29
+ ┌────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┐
30
+ │ funcs │ time per call │ bw % │ achieved bw │
31
+ ├────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┤
32
+ │ BO.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 107 microseconds, 387 nanoseconds │ 74.086 │ 1510.61 │
33
+ │ BO.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 105 microseconds, 42 nanoseconds │ 75.7399 │ 1544.34 │
34
+ │ BO.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 102 microseconds, 636 nanoseconds │ 77.5157 │ 1580.54 │
35
+ │ BO.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 106 microseconds, 896 nanoseconds │ 74.4266 │ 1517.56 │
36
+ └────────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┘
37
37
```
38
38
=#
39
39
40
40
# ! format: off
41
41
module BenchmarkOffset
42
42
43
+ import CUDA
43
44
include (" benchmark_utils.jl" )
44
45
45
46
"""
    add3(x1, x2, x3)

Return the sum `x1 + x2 + x3`.
"""
add3(x1, x2, x3) = x1 + x2 + x3
@@ -76,7 +77,7 @@ function aos_cart_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
76
77
e = min (e, et)
77
78
end
78
79
end
79
- push_info (bm; e , nreps, caller = @caller_name (@__FILE__ ),n_reads_writes= 4 )
80
+ push_info (bm; kernel_time_s = e / nreps , nreps, caller = @caller_name (@__FILE__ ),problem_size = size (us ),n_reads_writes= 4 )
80
81
return nothing
81
82
end ;
82
83
function aos_cart_offset_kernel! (X, Y, us)
@@ -131,7 +132,7 @@ function aos_lin_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
131
132
e = min (e, et)
132
133
end
133
134
end
134
- push_info (bm; e , nreps, caller = @caller_name (@__FILE__ ),n_reads_writes= 4 )
135
+ push_info (bm; kernel_time_s = e / nreps , nreps, caller = @caller_name (@__FILE__ ),problem_size = size (us ),n_reads_writes= 4 )
135
136
return nothing
136
137
end ;
137
138
function aos_lin_offset_kernel! (X, Y, us)
@@ -184,7 +185,7 @@ function soa_cart_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
184
185
e = min (e, et)
185
186
end
186
187
end
187
- push_info (bm; e , nreps, caller = @caller_name (@__FILE__ ),n_reads_writes= 4 )
188
+ push_info (bm; kernel_time_s = e / nreps , nreps, caller = @caller_name (@__FILE__ ),problem_size = size (us ),n_reads_writes= 4 )
188
189
return nothing
189
190
end ;
190
191
function soa_cart_index_kernel! (X, Y, us)
@@ -229,7 +230,7 @@ function soa_linear_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
229
230
e = min (e, et)
230
231
end
231
232
end
232
- push_info (bm; e , nreps, caller = @caller_name (@__FILE__ ),n_reads_writes= 4 )
233
+ push_info (bm; kernel_time_s = e / nreps , nreps, caller = @caller_name (@__FILE__ ),problem_size = size (us ),n_reads_writes= 4 )
233
234
return nothing
234
235
end ;
235
236
function soa_linear_index_kernel! (X, Y, us)
258
259
using CUDA
259
260
using Test
260
261
@testset " Offset benchmark" begin
261
- bm = BO. Benchmark (;problem_size= (63 ,4 ,4 ,1 ,5400 ), float_type= Float32) # size(problem_size, 4) == 1 to avoid double counting reads/writes
262
262
ArrayType = CUDA. CuArray;
263
263
# ArrayType = Base.identity;
264
+ device_name = CUDA. name (CUDA. device ())
265
+ bm = BO. Benchmark (;problem_size= (63 ,4 ,4 ,1 ,5400 ), device_name, float_type= Float32) # size(problem_size, 4) == 1 to avoid double counting reads/writes
264
266
arr (float_type, problem_size, T) = T (zeros (float_type, problem_size... ))
265
267
266
268
FT = Float64;
0 commit comments