@@ -219,22 +219,28 @@ bcs_tested(c, ::typeof(op_divgrad_uₕ!)) =
219
219
(; inner = (;), outer = set_value_divgrad_uₕ_maybe_field_bcs (c)),
220
220
)
221
221
222
"""
    benchmark_func!(t_min, trials, fun, c, f, verbose = false; compile::Bool)

Run the operator `fun(c, f, bcs)` for every boundary-condition set returned by
`bcs_tested(c, fun)`.

Each boundary-condition set is identified by
`key = (nameof(typeof(axes(c))), fun, bc_name(bcs)...)`.

# Keywords
- `compile`: when `true`, `fun` is called once per boundary-condition set and
  nothing is timed. When `false`, the call is benchmarked with
  `BenchmarkTools.@benchmark` (wrapped in `ClimaComms.@cuda_sync`) and the trial
  is stored in `trials[key]`.

Whenever `trials` holds an entry for `key`, the minimum sample time (in
nanoseconds) is written to `t_min[key]`; the trial is printed when `verbose` is
`true`, otherwise a one-line summary is logged.
"""
function benchmark_func!(t_min, trials, fun, c, f, verbose = false; compile::Bool)
    device = ClimaComms.device(c)
    # The space name does not depend on the boundary conditions.
    h_space = nameof(typeof(axes(c)))
    for bcs in bcs_tested(c, fun)
        key = (h_space, fun, bc_name(bcs)...)
        if compile
            # Compile-only pass: run once, record no timing.
            fun(c, f, bcs)
        else
            verbose && @info "\n@benchmarking $key"
            trials[key] = BenchmarkTools.@benchmark ClimaComms.@cuda_sync $device $fun($c, $f, $bcs)
        end

        # Only report when a trial exists for this key.
        haskey(trials, key) || continue
        trial = trials[key]
        verbose && show(stdout, MIME("text/plain"), trial)

        t_min[key] = minimum(trial.times) # nano seconds
        t_pretty = BenchmarkTools.prettytime(t_min[key])
        verbose || @info "$t_pretty <=> t_min[$key]"
    end
    return nothing
end
236
242
237
- function column_benchmark_arrays (device, z_elems, :: Type{FT} ) where {FT}
243
+ function column_benchmark_arrays (device, z_elems, :: Type{FT} ; compile :: Bool ) where {FT}
238
244
ArrayType = ClimaComms. array_type (device)
239
245
L = ArrayType (zeros (FT, z_elems))
240
246
D = ArrayType (zeros (FT, z_elems))
@@ -243,6 +249,16 @@ function column_benchmark_arrays(device, z_elems, ::Type{FT}) where {FT}
243
249
uₕ_x = ArrayType (rand (FT, z_elems))
244
250
uₕ_y = ArrayType (rand (FT, z_elems))
245
251
yarr = ArrayType (rand (FT, z_elems + 1 ))
252
+ if compile
253
+ if device isa ClimaComms. CUDADevice
254
+ column_op_2mul_1add_cuda! (xarr, yarr, D, U)
255
+ else
256
+ column_op_2mul_1add! (xarr, yarr, D, U)
257
+ column_op_3mul_2add! (xarr, yarr, L, D, U)
258
+ column_curl_like! (xarr, uₕ_x, uₕ_y, D, U)
259
+ end
260
+ return nothing
261
+ end
246
262
247
263
if device isa ClimaComms. CUDADevice
248
264
println (" \n ############################ column 2-point stencil" )
@@ -265,7 +281,7 @@ function column_benchmark_arrays(device, z_elems, ::Type{FT}) where {FT}
265
281
end
266
282
end
267
283
268
- function sphere_benchmark_arrays (device, z_elems, helem, Nq, :: Type{FT} ) where {FT}
284
+ function sphere_benchmark_arrays (device, z_elems, helem, Nq, :: Type{FT} ; compile :: Bool ) where {FT}
269
285
ArrayType = ClimaComms. array_type (device)
270
286
# VIJFH
271
287
Nh = helem * helem * 6
@@ -280,42 +296,58 @@ function sphere_benchmark_arrays(device, z_elems, helem, Nq, ::Type{FT}) where {
280
296
yarr = ArrayType (rand (FT, fdims... ))
281
297
282
298
if device isa ClimaComms. CUDADevice
283
- println (" \n ############################ sphere 2-point stencil" )
284
- trial = BenchmarkTools. @benchmark ClimaComms. @cuda_sync $ device sphere_op_2mul_1add_cuda! ($ xarr, $ yarr, $ D, $ U)
285
- show (stdout , MIME (" text/plain" ), trial)
286
- println ()
299
+ if compile
300
+ sphere_op_2mul_1add_cuda! (xarr, yarr, D, U)
301
+ else
302
+ println (" \n ############################ sphere 2-point stencil" )
303
+ trial = BenchmarkTools. @benchmark ClimaComms. @cuda_sync $ device sphere_op_2mul_1add_cuda! ($ xarr, $ yarr, $ D, $ U)
304
+ show (stdout , MIME (" text/plain" ), trial)
305
+ println ()
306
+ end
287
307
else
288
308
@info " Sphere CPU kernels have not been added yet."
289
309
end
290
310
end
291
311
292
"""
    benchmark_operators_column(::Type{FT}; z_elems, helem, Nq, compile::Bool = false)

Benchmark the column array kernels and the finite-difference operators on a
column space with `z_elems` vertical elements and float type `FT`.

# Keywords
- `z_elems`: number of vertical elements, forwarded to
  `column_benchmark_arrays` and used as `zelem` of the column space.
- `helem`, `Nq`: accepted but not used by this function.
- `compile`: forwarded to `column_benchmark_arrays` and
  `benchmark_operators_base`. When `true`, the reference-time check
  `test_results_column` is skipped.

Returns the named tuple `(; trials, t_min)` of ordered dictionaries filled in by
`benchmark_operators_base`.
"""
function benchmark_operators_column(
    ::Type{FT};
    z_elems,
    helem,
    Nq,
    compile::Bool = false,
) where {FT}
    device = ClimaComms.device()
    @show device
    trials = OrderedCollections.OrderedDict()
    t_min = OrderedCollections.OrderedDict()
    column_benchmark_arrays(device, z_elems, FT; compile)

    cspace = TU.ColumnCenterFiniteDifferenceSpace(FT; zelem = z_elems)
    fspace = Spaces.FaceFiniteDifferenceSpace(cspace)
    cfield = fill(field_vars(FT), cspace)
    ffield = fill(field_vars(FT), fspace)
    benchmark_operators_base(trials, t_min, cfield, ffield, "column"; compile)

    # Compare against the reference times only when timings were requested;
    # a compile-only run was not asked to time anything.
    compile || test_results_column(t_min)
    return (; trials, t_min)
end
330
+
331
"""
    benchmark_operators_sphere(::Type{FT}; z_elems, helem, Nq, compile::Bool = false)

Benchmark the sphere array kernels and the finite-difference operators on an
extruded space built from `z_elems`, `helem` and `Nq` with float type `FT`.

# Keywords
- `z_elems`, `helem`, `Nq`: forwarded to `sphere_benchmark_arrays` and to
  `TU.CenterExtrudedFiniteDifferenceSpace` (as `zelem`, `helem`, `Nq`).
- `compile`: forwarded to `sphere_benchmark_arrays` and
  `benchmark_operators_base`. When `true`, the reference-time check
  `test_results_sphere` is skipped.

Returns the named tuple `(; trials, t_min)` of ordered dictionaries filled in by
`benchmark_operators_base`.
"""
function benchmark_operators_sphere(
    ::Type{FT};
    z_elems,
    helem,
    Nq,
    compile::Bool = false,
) where {FT}
    device = ClimaComms.device()
    @show device
    trials = OrderedCollections.OrderedDict()
    t_min = OrderedCollections.OrderedDict()
    sphere_benchmark_arrays(device, z_elems, helem, Nq, FT; compile)

    cspace = TU.CenterExtrudedFiniteDifferenceSpace(FT; zelem = z_elems, helem, Nq)
    fspace = Spaces.FaceExtrudedFiniteDifferenceSpace(cspace)
    cfield = fill(field_vars(FT), cspace)
    ffield = fill(field_vars(FT), fspace)
    benchmark_operators_base(trials, t_min, cfield, ffield, "sphere"; compile)

    # Compare against the reference times only when timings were requested;
    # a compile-only run was not asked to time anything.
    compile || test_results_sphere(t_min)
    return (; trials, t_min)
end
317
349
318
- function benchmark_operators_base (trials, t_min, cfield, ffield, name)
350
+ function benchmark_operators_base (trials, t_min, cfield, ffield, name; compile :: Bool )
319
351
ops = [
320
352
# ### Core discrete operators
321
353
op_GradientF2C!,
@@ -351,13 +383,13 @@ function benchmark_operators_base(trials, t_min, cfield, ffield, name)
351
383
if uses_bycolumn (op) && axes (cfield) isa Spaces. FiniteDifferenceSpace
352
384
continue
353
385
end
354
- benchmark_func! (t_min, trials, op, cfield, ffield, #= verbose = =# false )
386
+ benchmark_func! (t_min, trials, op, cfield, ffield, #= verbose = =# false ; compile )
355
387
end
356
388
357
389
return nothing
358
390
end
359
391
360
- function test_results (t_min)
392
+ function test_results_column (t_min)
361
393
# If these tests fail, just update the numbers (or the
362
394
# buffer) so long as it's not an egregious regression.
363
395
buffer = 2
@@ -393,7 +425,22 @@ function test_results(t_min)
393
425
[(:FiniteDifferenceSpace , op_div_interp_FF!, :none , :SetValue , :SetValue ), 686.581 * ns* buffer],
394
426
[(:FiniteDifferenceSpace , op_divgrad_uₕ!, :none , :SetValue , :Extrapolate ), 4.960 * μs* buffer],
395
427
[(:FiniteDifferenceSpace , op_divgrad_uₕ!, :none , :SetValue , :SetValue ), 5.047 * μs* buffer],
428
+ ]
429
+ for (params, ref_time) in results
430
+ if ! (t_min[params] ≤ ref_time)
431
+ @warn " Possible regression: $params , time=$(t_min[params]) , ref_time=$ref_time "
432
+ end
433
+ end
434
+ end
396
435
436
+ function test_results_sphere (t_min)
437
+ # If these tests fail, just update the numbers (or the
438
+ # buffer) so long as it's not an egregious regression.
439
+ buffer = 2
440
+ ns = 1
441
+ μs = 10 ^ 3
442
+ ms = 10 ^ 6
443
+ results = [
397
444
[(:ExtrudedFiniteDifferenceSpace , op_GradientF2C!, :none ), 1.746 * ms* buffer],
398
445
[(:ExtrudedFiniteDifferenceSpace , op_GradientF2C!, :SetValue , :SetValue ), 1.754 * ms* buffer],
399
446
[(:ExtrudedFiniteDifferenceSpace , op_GradientC2F!, :SetGradient , :SetGradient ), 1.899 * ms* buffer],
0 commit comments