From b16482476582122386faf1931c7622c1795c47cb Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Thu, 29 Aug 2024 08:26:41 -0700 Subject: [PATCH 1/5] Add a benchmark script for `IJFVH` datalayout. --- benchmarks/scripts/benchmark_IJFVH.jl | 180 ++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 benchmarks/scripts/benchmark_IJFVH.jl diff --git a/benchmarks/scripts/benchmark_IJFVH.jl b/benchmarks/scripts/benchmark_IJFVH.jl new file mode 100644 index 0000000000..1d2365c7df --- /dev/null +++ b/benchmarks/scripts/benchmark_IJFVH.jl @@ -0,0 +1,180 @@ +#= +On A100 + +$ julia --project=.buildkite benchmarks/scripts/benchmark_IJFVH.jl +================================================================== +Dimensions, NI = 4, NJ = 4, NF = 3, NV = 63, NH = 5300; FT = Float32 +================================================================== +Test Summary: | Pass Total Time +Sum test with IJFVH layout, cartesian indexing | 1 1 2.3s +================================================================== +Cartesian indexing benchmark for IJFVH: +================================== +min = TrialEstimate(62.149 μs) +median = TrialEstimate(67.139 μs) +mean = TrialEstimate(67.478 μs), +ntrials = 10000 +--------------------------------------------- +Test Summary: | Pass Total Time +Sum test with IJFVH layout, linear indexing | 1 1 0.0s +================================================================== +linear indexing benchmark for IJFVH: +================================== +min = TrialEstimate(61.269 μs) +median = TrialEstimate(67.650 μs) +mean = TrialEstimate(67.788 μs)) +ntrials = 10000 +--------------------------------------------- +================================================================== +Dimensions, NI = 4, NJ = 4, NF = 3, NV = 63, NH = 5300; FT = Float64 +================================================================== +Test Summary: | Pass Total Time +Sum test with IJFVH layout, cartesian indexing | 1 1 1.3s +================================================================== +Cartesian indexing benchmark for IJFVH: +================================== +min = TrialEstimate(110.099 μs) +median = TrialEstimate(113.869 μs) +mean = TrialEstimate(114.574 μs), +ntrials = 10000 +--------------------------------------------- +Test Summary: | Pass Total Time +Sum test with IJFVH layout, linear indexing | 1 1 0.0s +================================================================== +linear indexing benchmark for IJFVH: +================================== +min = TrialEstimate(110.359 μs) +median = TrialEstimate(113.859 μs) +mean = TrialEstimate(116.783 μs)) +ntrials = 10000 +--------------------------------------------- + +=# +using CUDA +using Statistics +using Test +using BenchmarkTools + +max_threads_cuda() = 256 +# Cartesian indexing +function addf_cart_IJFVH_kernel!(sum_a, a) + (i, j, vid) = threadIdx() + (bv, bh) = blockIdx() + + nvt = blockDim().z + nv = size(a, 4) + v = vid + (bv - 1) * nvt + + if v ≤ nv + @inbounds sum_a[i, j, 1, v, bh] = + a[i, j, 1, v, bh] + a[i, j, 2, v, bh] + a[i, j, 3, v, bh] + end + return nothing +end + +# Add (3) components of a vector field in IJFVH layout using cartesian indexing +function addf_cart_IJFVH!(sum_A_IJFVH, A_IJFVH) + (NI, NJ, NF, NV, NH) = size(A_IJFVH) + NVT = min(Int(fld(max_threads_cuda(), NI * NJ)), NV) + NBV = cld(NV, NVT) + @cuda threads = (NI, NJ, NVT) blocks = (NBV, NH) addf_cart_IJFVH_kernel!( + sum_A_IJFVH, + A_IJFVH, + ) + return nothing +end + +# Linear indexing +function addf_linear_IJFVH_kernel!(sum_a, a) + (i, j, vid) = threadIdx() + (bv, bh) = blockIdx() + + nvt = blockDim().z + nbv = gridDim().x + (Ni, Nj, nf, nv, _) = size(a) + v = vid + (bv - 1) * nvt + + st_in = Ni * (Nj * (nv * (bh - 1) + (v - 1)) + (j - 1)) + i + + st = Ni * (Nj * nf * (nv * (bh - 1) + (v - 1)) + (j - 1)) + i + stride = Ni * Nj + if v ≤ nv + @inbounds sum_a[st_in] = + a[st] + a[st + stride] + a[st + stride + stride] + end + return nothing +end + +# Add (3) components of a vector field in IJFVH layout using linear indexing +function addf_linear_IJFVH!(sum_A_IJFVH, A_IJFVH) + (NI, NJ, NF, NV, NH) = size(A_IJFVH) + NVT = min(Int(fld(max_threads_cuda(), NI * NJ)), NV) + NBV = cld(NV, NVT) + @cuda threads = (NI, NJ, NVT) blocks = (NBV, NH) addf_linear_IJFVH_kernel!( + sum_A_IJFVH, + A_IJFVH, + ) + return nothing +end +# generate benchmarks +function generate_datalayout_benchmarks(::Type{DA}, ::Type{FT}) where {DA, FT} + NI = NJ = 4 # polynomial order of approximation + 1 + NV = 63 # number of vertical levels + NF = 3 # number of components in the velocity field (not # of fields) + NH = 5300 # number of spectral elements + + println( + "==================================================================", + ) + println( + "Dimensions, NI = $NI, NJ = $NJ, NF = $NF, NV = $NV, NH = $NH; FT = $FT", + ) + println( + "==================================================================", + ) + # IJFVH layout + A_IJFVH = DA(rand(FT, NI, NJ, NF, NV, NH)) + sum_A_IJFVH = DA{FT}(undef, NI, NJ, 1, NV, NH) + sum_A_IJFVH_ref = sum(A_IJFVH, dims = 3) + # use cartesian indexing for IJFVH layout + addf_cart_IJFVH!(sum_A_IJFVH, A_IJFVH) + + @testset "Sum test with IJFVH layout, cartesian indexing" begin + @test sum_A_IJFVH ≈ sum_A_IJFVH_ref + end + trial_cart_IJFVH = + @benchmark CUDA.@sync addf_cart_IJFVH!($sum_A_IJFVH, $A_IJFVH) + println( + "==================================================================", + ) + println("Cartesian indexing benchmark for IJFVH: +================================== +min = $(minimum(trial_cart_IJFVH)) +median = $(Statistics.median(trial_cart_IJFVH)) +mean = $(Statistics.mean(trial_cart_IJFVH)), +ntrials = $(length(trial_cart_IJFVH.times))") + println("---------------------------------------------") + + sum_A_IJFVH .= FT(0) + # use linear indexing for IJFVH layout + addf_linear_IJFVH!(sum_A_IJFVH, A_IJFVH) + @testset "Sum test with IJFVH layout, linear indexing" begin + @test sum_A_IJFVH ≈ sum_A_IJFVH_ref + end + trial_linear_IJFVH = + @benchmark CUDA.@sync addf_linear_IJFVH!($sum_A_IJFVH, $A_IJFVH) + println( + "==================================================================", + ) + println("linear indexing benchmark for IJFVH: +================================== +min = $(minimum(trial_linear_IJFVH)) +median = $(Statistics.median(trial_linear_IJFVH)) +mean = $(Statistics.mean(trial_linear_IJFVH))) +ntrials = $(length(trial_linear_IJFVH.times))") + println("---------------------------------------------") + return nothing +end + +generate_datalayout_benchmarks(CuArray, Float32) +generate_datalayout_benchmarks(CuArray, Float64) From 3d1f8f96fb791e247d08b34de19705979b6c4713 Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Thu, 29 Aug 2024 10:06:55 -0700 Subject: [PATCH 2/5] Use NH = 5400 --- benchmarks/scripts/benchmark_IJFVH.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/scripts/benchmark_IJFVH.jl b/benchmarks/scripts/benchmark_IJFVH.jl index 1d2365c7df..bee27809b8 100644 --- a/benchmarks/scripts/benchmark_IJFVH.jl +++ b/benchmarks/scripts/benchmark_IJFVH.jl @@ -3,7 +3,7 @@ On A100 $ julia --project=.buildkite benchmarks/scripts/benchmark_IJFVH.jl ================================================================== -Dimensions, NI = 4, NJ = 4, NF = 3, NV = 63, NH = 5300; FT = Float32 +Dimensions, NI = 4, NJ = 4, NF = 3, NV = 63, NH = 5400; FT = Float32 ================================================================== Test Summary: | Pass Total Time Sum test with IJFVH layout, cartesian indexing | 1 1 2.3s @@ -26,7 +26,7 @@ mean = TrialEstimate(67.788 μs)) ntrials = 10000 --------------------------------------------- ================================================================== -Dimensions, NI = 4, NJ = 4, NF = 3, NV = 63, NH = 5300; FT = Float64 +Dimensions, NI = 4, NJ = 4, NF = 3, NV = 63, NH = 5400; FT = Float64 ================================================================== Test Summary: | Pass Total Time Sum test with IJFVH layout, cartesian indexing | 1 1 1.3s @@ -121,7 +121,7 @@ function generate_datalayout_benchmarks(::Type{DA}, ::Type{FT}) where {DA, FT} NI = NJ = 4 # polynomial order of approximation + 1 NV = 63 # number of vertical levels NF = 3 # number of components in the velocity field (not # of fields) - NH = 5300 # number of spectral elements + NH = 5400 # number of spectral elements println( "==================================================================", From 0fc5f254d38ce32419ddf36530dfecc155460482 Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Thu, 29 Aug 2024 10:14:49 -0700 Subject: [PATCH 3/5] Update performance numbers for `NH = 5400` --- benchmarks/scripts/benchmark_IJFVH.jl | 31 +++++++++++++-------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/benchmarks/scripts/benchmark_IJFVH.jl b/benchmarks/scripts/benchmark_IJFVH.jl index bee27809b8..82b2d04bd1 100644 --- a/benchmarks/scripts/benchmark_IJFVH.jl +++ b/benchmarks/scripts/benchmark_IJFVH.jl @@ -1,18 +1,18 @@ #= On A100 -$ julia --project=.buildkite benchmarks/scripts/benchmark_IJFVH.jl +[skandala@clima ClimaCore.jl]$ julia --project=.buildkite benchmarks/scripts/benchmark_IJFVH.jl ================================================================== Dimensions, NI = 4, NJ = 4, NF = 3, NV = 63, NH = 5400; FT = Float32 ================================================================== Test Summary: | Pass Total Time -Sum test with IJFVH layout, cartesian indexing | 1 1 2.3s +Sum test with IJFVH layout, cartesian indexing | 1 1 2.1s ================================================================== Cartesian indexing benchmark for IJFVH: ================================== -min = TrialEstimate(62.149 μs) -median = TrialEstimate(67.139 μs) -mean = TrialEstimate(67.478 μs), +min = TrialEstimate(62.389 μs) +median = TrialEstimate(402.266 μs) +mean = TrialEstimate(357.741 μs), ntrials = 10000 --------------------------------------------- Test Summary: | Pass Total Time @@ -20,22 +20,22 @@ Sum test with IJFVH layout, linear indexing | 1 1 0.0s ================================================================== linear indexing benchmark for IJFVH: ================================== -min = TrialEstimate(61.269 μs) -median = TrialEstimate(67.650 μs) -mean = TrialEstimate(67.788 μs)) +min = TrialEstimate(61.729 μs) +median = TrialEstimate(395.396 μs) +mean = TrialEstimate(360.393 μs)) ntrials = 10000 --------------------------------------------- ================================================================== Dimensions, NI = 4, NJ = 4, NF = 3, NV = 63, NH = 5400; FT = Float64 ================================================================== Test Summary: | Pass Total Time -Sum test with IJFVH layout, cartesian indexing | 1 1 1.3s +Sum test with IJFVH layout, cartesian indexing | 1 1 1.5s ================================================================== Cartesian indexing benchmark for IJFVH: ================================== -min = TrialEstimate(110.099 μs) -median = TrialEstimate(113.869 μs) -mean = TrialEstimate(114.574 μs), +min = TrialEstimate(111.929 μs) +median = TrialEstimate(123.859 μs) +mean = TrialEstimate(228.541 μs), ntrials = 10000 --------------------------------------------- Test Summary: | Pass Total Time @@ -43,12 +43,11 @@ Sum test with IJFVH layout, linear indexing | 1 1 0.0s ================================================================== linear indexing benchmark for IJFVH: ================================== -min = TrialEstimate(110.359 μs) -median = TrialEstimate(113.859 μs) -mean = TrialEstimate(116.783 μs)) +min = TrialEstimate(110.649 μs) +median = TrialEstimate(115.179 μs) +mean = TrialEstimate(199.577 μs)) ntrials = 10000 --------------------------------------------- - =# using CUDA using Statistics From 3483bf90a56ca7a0a070f949c134ab07ad8a1f09 Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Thu, 29 Aug 2024 17:08:57 -0700 Subject: [PATCH 4/5] Add additional benchmark. --- benchmarks/scripts/benchmark_IJFVH.jl | 80 +++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/benchmarks/scripts/benchmark_IJFVH.jl b/benchmarks/scripts/benchmark_IJFVH.jl index 82b2d04bd1..f629b4f9ce 100644 --- a/benchmarks/scripts/benchmark_IJFVH.jl +++ b/benchmarks/scripts/benchmark_IJFVH.jl @@ -115,6 +115,81 @@ function addf_linear_IJFVH!(sum_A_IJFVH, A_IJFVH) ) return nothing end + +function multiaddf_linear_IJFVH!( + sum_A_IJFVH, + A_IJFVH; + nreps = 100, + n_trials = 30, +) + (NI, NJ, NF, NV, NH) = size(A_IJFVH) + NVT = min(Int(fld(max_threads_cuda(), NI * NJ)), NV) + NBV = cld(NV, NVT) + emin, emax = typemax(Float32), typemin(Float32) + @cuda threads = (NI, NJ, NVT) blocks = (NBV, NH) addf_linear_IJFVH_kernel!( + sum_A_IJFVH, + A_IJFVH, + ) + + for j in 1:n_trials + et = CUDA.@elapsed begin + for i in 1:nreps + @cuda threads = (NI, NJ, NVT) blocks = (NBV, NH) addf_linear_IJFVH_kernel!( + sum_A_IJFVH, + A_IJFVH, + ) + end + end + emin = min(emin, et) + emax = max(emax, et) + end + println( + "----multiaddf_linear_IJVFH, FT = $(eltype(A_IJFVH))-----------------------------------", + ) + println( + "emin = $(emin*1e6/nreps)μs; emax = $(emax*1e6/nreps)μs; nreps = $nreps, n_trials = $(n_trials)", + ) + println( + "------------------------------------------------------------------------------------------", + ) + return nothing +end + +function multiaddf_cart_IJFVH!(sum_A_IJFVH, A_IJFVH; nreps = 100, n_trials = 30) + (NI, NJ, NF, NV, NH) = size(A_IJFVH) + NVT = min(Int(fld(max_threads_cuda(), NI * NJ)), NV) + NBV = cld(NV, NVT) + emin, emax = typemax(Float32), typemin(Float32) + @cuda threads = (NI, NJ, NVT) blocks = (NBV, NH) addf_cart_IJFVH_kernel!( + sum_A_IJFVH, + A_IJFVH, + ) + emin, emax = typemax(Float32), typemin(Float32) + for j in 1:n_trials + et = CUDA.@elapsed begin + for i in 1:nreps + @cuda threads = (NI, NJ, NVT) blocks = (NBV, NH) addf_cart_IJFVH_kernel!( + sum_A_IJFVH, + A_IJFVH, + ) + end + end + emin = min(emin, et) + emax = max(emax, et) + end + println( + "----multiaddf_cart_IJVFH, FT = $(eltype(A_IJFVH))-------------------------------------", + ) + println( + "emin = $(emin*1e6/nreps)μs; emax = $(emax*1e6/nreps)μs; nreps = $nreps, n_trials = $(n_trials)", + ) + println( + "------------------------------------------------------------------------------------------", + ) + return nothing +end + + # generate benchmarks function generate_datalayout_benchmarks(::Type{DA}, ::Type{FT}) where {DA, FT} NI = NJ = 4 # polynomial order of approximation + 1 @@ -172,6 +247,11 @@ median = $(Statistics.median(trial_linear_IJFVH)) mean = $(Statistics.mean(trial_linear_IJFVH))) ntrials = $(length(trial_linear_IJFVH.times))") println("---------------------------------------------") + + multiaddf_linear_IJFVH!(sum_A_IJFVH, A_IJFVH, nreps = 100, n_trials = 30) + multiaddf_cart_IJFVH!(sum_A_IJFVH, A_IJFVH, nreps = 100, n_trials = 30) + + return nothing end From 08307cc701f244b2fd1eea8908107483aa802291 Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Thu, 29 Aug 2024 19:23:05 -0700 Subject: [PATCH 5/5] Add results to file --- benchmarks/scripts/benchmark_IJFVH.jl | 40 +++++++++++++++++---------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/benchmarks/scripts/benchmark_IJFVH.jl b/benchmarks/scripts/benchmark_IJFVH.jl index f629b4f9ce..d4056da819 100644 --- a/benchmarks/scripts/benchmark_IJFVH.jl +++ b/benchmarks/scripts/benchmark_IJFVH.jl @@ -6,13 +6,13 @@ On A100 Dimensions, NI = 4, NJ = 4, NF = 3, NV = 63, NH = 5400; FT = Float32 ================================================================== Test Summary: | Pass Total Time -Sum test with IJFVH layout, cartesian indexing | 1 1 2.1s +Sum test with IJFVH layout, cartesian indexing | 1 1 2.0s ================================================================== Cartesian indexing benchmark for IJFVH: ================================== -min = TrialEstimate(62.389 μs) -median = TrialEstimate(402.266 μs) -mean = TrialEstimate(357.741 μs), +min = TrialEstimate(63.210 μs) +median = TrialEstimate(68.149 μs) +mean = TrialEstimate(68.127 μs), ntrials = 10000 --------------------------------------------- Test Summary: | Pass Total Time @@ -20,22 +20,28 @@ Sum test with IJFVH layout, linear indexing | 1 1 0.0s ================================================================== linear indexing benchmark for IJFVH: ================================== -min = TrialEstimate(61.729 μs) -median = TrialEstimate(395.396 μs) -mean = TrialEstimate(360.393 μs)) +min = TrialEstimate(61.919 μs) +median = TrialEstimate(68.430 μs) +mean = TrialEstimate(68.769 μs)) ntrials = 10000 --------------------------------------------- +----multiaddf_linear_IJVFH, FT = Float32----------------------------------- +emin = 57.231360115110874μs; emax = 57.55904130637646μs; nreps = 100, n_trials = 30 +------------------------------------------------------------------------------------------ +----multiaddf_cart_IJVFH, FT = Float32------------------------------------- +emin = 57.57952108979225μs; emax = 57.95839708298445μs; nreps = 100, n_trials = 30 +------------------------------------------------------------------------------------------ ================================================================== Dimensions, NI = 4, NJ = 4, NF = 3, NV = 63, NH = 5400; FT = Float64 ================================================================== Test Summary: | Pass Total Time -Sum test with IJFVH layout, cartesian indexing | 1 1 1.5s +Sum test with IJFVH layout, cartesian indexing | 1 1 1.2s ================================================================== Cartesian indexing benchmark for IJFVH: ================================== -min = TrialEstimate(111.929 μs) -median = TrialEstimate(123.859 μs) -mean = TrialEstimate(228.541 μs), +min = TrialEstimate(112.079 μs) +median = TrialEstimate(115.559 μs) +mean = TrialEstimate(115.718 μs), ntrials = 10000 --------------------------------------------- Test Summary: | Pass Total Time @@ -43,11 +49,17 @@ Sum test with IJFVH layout, linear indexing | 1 1 0.0s ================================================================== linear indexing benchmark for IJFVH: ================================== -min = TrialEstimate(110.649 μs) -median = TrialEstimate(115.179 μs) -mean = TrialEstimate(199.577 μs)) +min = TrialEstimate(112.009 μs) +median = TrialEstimate(115.289 μs) +mean = TrialEstimate(118.641 μs)) ntrials = 10000 --------------------------------------------- +----multiaddf_linear_IJVFH, FT = Float64----------------------------------- +emin = 103.48544456064701μs; emax = 106.06592521071434μs; nreps = 100, n_trials = 30 +------------------------------------------------------------------------------------------ +----multiaddf_cart_IJVFH, FT = Float64------------------------------------- +emin = 104.05887849628925μs; emax = 105.89184239506721μs; nreps = 100, n_trials = 30 +------------------------------------------------------------------------------------------ =# using CUDA using Statistics