Skip to content

Commit d110151

Browse files
Drop field dimension to demo thermo benchmark
1 parent b1b5459 commit d110151

17 files changed

+794
-285
lines changed

benchmarks/scripts/thermo_bench_bw.jl

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ using BenchmarkTools
150150
import .TestUtilities as TU;
151151

152152
using Test
153-
@testset "Thermo state" begin
153+
# @testset "Thermo state" begin
154154
FT = Float32
155155
bm = TBB.Benchmark(;problem_size=(63,4,4,1,5400), float_type=FT)
156156
device = ClimaComms.device()
@@ -175,7 +175,8 @@ using Test
175175
)
176176
x = fill((; ts = zero(TBB.PhaseEquil{FT}), nt_core...), cspace)
177177
xv = fill((; ts = nt_ts, nt_core...), cspace)
178-
(_, Nij, _, Nv, Nh) = size(Fields.field_values(x.ts))
178+
fv_ts = Fields.field_values(x.ts)
179+
(_, Nij, _, Nv, Nh) = size(fv_ts)
179180
us = TBB.UniversalSizesStatic(Nv, Nij, Nh)
180181
function to_vec(ξ)
181182
pns = propertynames(ξ)
@@ -186,7 +187,7 @@ using Test
186187
end
187188
return (; zip(propertynames(ξ), dl_vals)...)
188189
end
189-
x_vec = to_vec(xv)
190+
# x_vec = to_vec(xv)
190191

191192
x_aos = fill((; ρ_read = FT(0), ρ_write = FT(0)), cspace)
192193
x_soa = (;
@@ -199,20 +200,21 @@ using Test
199200
@. x_aos.ρ_write = 7
200201
TBB.singlefield_bc!(x_soa, us; nreps=1, n_trials = 1)
201202
TBB.singlefield_bc!(x_aos, us; nreps=1, n_trials = 1)
202-
203+
203204
TBB.thermo_func_bc!(x, us; nreps=1, n_trials = 1)
204-
TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)
205+
# TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)
205206

206-
rc = Fields.rcompare(x_vec, to_vec(x))
207-
rc || Fields.@rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
208-
@test rc # test correctness
207+
# rc = Fields.rcompare(x_vec, to_vec(x))
208+
# rc || Fields.@rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
209+
# @test rc # test correctness
209210

210-
TBB.singlefield_bc!(x_soa, us; nreps=100, bm)
211-
TBB.singlefield_bc!(x_aos, us; nreps=100, bm)
211+
# TBB.singlefield_bc!(x_soa, us; nreps=100, bm)
212+
# TBB.singlefield_bc!(x_aos, us; nreps=100, bm)
212213
TBB.thermo_func_bc!(x, us; nreps=100, bm)
213-
TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)
214+
@info "Success!"
215+
# TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)
214216

215217
TBB.tabulate_benchmark(bm)
216218

217-
end
219+
# end
218220
#! format: on

ext/ClimaCoreCUDAExt.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import ClimaCore.Utilities: cart_ind, linear_ind
1717
import ClimaCore.RecursiveApply:
1818
, , , radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
1919
import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
20+
import ClimaCore.DataLayouts: universal_size, UniversalSize
2021

2122
include(joinpath("cuda", "cuda_utils.jl"))
2223
include(joinpath("cuda", "data_layouts.jl"))

ext/cuda/data_layouts.jl

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ import CUDA
1313
parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}) where {T, B} =
1414
CUDA.CuArray{T, N, B} where {N}
1515

16+
parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}, ::Val{ND}) where {T, B, ND} =
17+
CUDA.CuArray{T, ND, B}
18+
1619
# Ensure that both parent array types have the same memory buffer type.
1720
promote_parent_array_type(
1821
::Type{CUDA.CuArray{T1, N, B} where {N}},
@@ -53,3 +56,16 @@ function Adapt.adapt_structure(
5356
end,
5457
)
5558
end
59+
60+
import Adapt
61+
import CUDA
62+
function Adapt.adapt_structure(
63+
to::CUDA.KernelAdaptor,
64+
bc::DataLayouts.NonExtrudedBroadcasted{Style},
65+
) where {Style}
66+
DataLayouts.NonExtrudedBroadcasted{Style}(
67+
adapt_f(to, bc.f),
68+
Adapt.adapt(to, bc.args),
69+
Adapt.adapt(to, bc.axes),
70+
)
71+
end

ext/cuda/data_layouts_copyto.jl

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import ClimaCore.DataLayouts:
2+
to_non_extruded_broadcasted, has_uniform_datalayouts
13
DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()
24

35
function knl_copyto!(dest, src)
@@ -15,36 +17,76 @@ function knl_copyto!(dest, src)
1517
return nothing
1618
end
1719

20+
function knl_copyto_toa!(dest, src, us)
21+
@inbounds begin
22+
tidx = thread_index()
23+
if tidx get_N(us)
24+
n = size(dest)
25+
I = kernel_indexes(tidx, n)
26+
dest[I] = src[I]
27+
end
28+
end
29+
return nothing
30+
end
31+
1832
function Base.copyto!(
1933
dest::IJFH{S, Nij, Nh},
2034
bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
2135
::ToCUDA,
2236
) where {S, Nij, Nh}
37+
us = DataLayouts.UniversalSize(dest)
2338
if Nh > 0
2439
auto_launch!(
25-
knl_copyto!,
26-
(dest, bc);
27-
threads_s = (Nij, Nij),
28-
blocks_s = (Nh, 1),
40+
knl_copyto_toa!,
41+
(dest, bc, us),
42+
prod(DataLayouts.universal_size(us));
43+
auto = true
2944
)
3045
end
3146
return dest
3247
end
3348

49+
function knl_copyto_linear!(dest::AbstractData, bc, us)
50+
@inbounds begin
51+
tidx = thread_index()
52+
if tidx get_N(us)
53+
dest[tidx] = bc[tidx]
54+
end
55+
end
56+
return nothing
57+
end
58+
59+
function knl_copyto_linear!(dest::DataF, bc, us)
60+
@inbounds dest[] = bc[tidx]
61+
return nothing
62+
end
63+
64+
function knl_copyto_cart!(dest, src, us)
65+
@inbounds begin
66+
tidx = thread_index()
67+
if tidx get_N(us)
68+
n = size(dest)
69+
I = kernel_indexes(tidx, n)
70+
dest[I] = src[I]
71+
end
72+
end
73+
return nothing
74+
end
75+
3476
function Base.copyto!(
3577
dest::VIJFH{S, Nv, Nij, Nh},
3678
bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
3779
::ToCUDA,
3880
) where {S, Nv, Nij, Nh}
3981
if Nv > 0 && Nh > 0
40-
Nv_per_block = min(Nv, fld(256, Nij * Nij))
41-
Nv_blocks = cld(Nv, Nv_per_block)
42-
auto_launch!(
43-
knl_copyto!,
44-
(dest, bc);
45-
threads_s = (Nij, Nij, Nv_per_block),
46-
blocks_s = (Nh, Nv_blocks),
47-
)
82+
us = DataLayouts.UniversalSize(dest)
83+
n = prod(DataLayouts.universal_size(us))
84+
if has_uniform_datalayouts(bc)
85+
bc′ = to_non_extruded_broadcasted(bc)
86+
auto_launch!(knl_copyto_linear!, (dest, bc′, us), n; auto = true)
87+
else
88+
auto_launch!(knl_copyto_cart!, (dest, bc, us), n; auto = true)
89+
end
4890
end
4991
return dest
5092
end

ext/cuda/data_layouts_fill.jl

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
function knl_fill_flat!(dest::AbstractData, val, us)
2-
@inbounds begin
3-
tidx = thread_index()
4-
if tidx get_N(us)
5-
n = size(dest)
6-
I = kernel_indexes(tidx, n)
7-
@inbounds dest[I] = val
8-
end
9-
end
2+
# @inbounds begin
3+
# tidx = thread_index()
4+
# if tidx ≤ get_N(us)
5+
# n = size(dest)
6+
# I = kernel_indexes(tidx, n)
7+
# @inbounds dest[I] = val
8+
# end
9+
# end
1010
return nothing
1111
end
1212

ext/cuda/topologies_dss.jl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ function dss_load_perimeter_data_kernel!(
4848
if gidx prod(sizep)
4949
(level, p, fidx, elem) = cart_ind(sizep, gidx).I
5050
(ip, jp) = perimeter[p]
51-
data_idx = linear_ind(sized, (level, ip, jp, fidx, elem))
52-
pperimeter_data[level, p, fidx, elem] = pdata[data_idx]
51+
data_idx = linear_ind(sized, (level, ip, jp, elem))
52+
pperimeter_data.arrays[fidx][level, p, elem] = pdata.arrays[fidx][data_idx]
5353
end
5454
return nothing
5555
end
@@ -89,7 +89,7 @@ function dss_unload_perimeter_data_kernel!(
8989
(level, p, fidx, elem) = cart_ind(sizep, gidx).I
9090
(ip, jp) = perimeter[p]
9191
data_idx = linear_ind(sized, (level, ip, jp, fidx, elem))
92-
pdata[data_idx] = pperimeter_data[level, p, fidx, elem]
92+
pdata.arrays[fidx][data_idx] = pperimeter_data.arrays[fidx][level, p, elem]
9393
end
9494
return nothing
9595
end
@@ -148,12 +148,12 @@ function dss_local_kernel!(
148148
for idx in st:(en - 1)
149149
(lidx, vert) = local_vertices[idx]
150150
ip = perimeter_vertex_node_index(vert)
151-
sum_data += pperimeter_data[level, ip, fidx, lidx]
151+
sum_data += pperimeter_data.arrays[fidx][level, ip, lidx]
152152
end
153153
for idx in st:(en - 1)
154154
(lidx, vert) = local_vertices[idx]
155155
ip = perimeter_vertex_node_index(vert)
156-
pperimeter_data[level, ip, fidx, lidx] = sum_data
156+
pperimeter_data.arrays[fidx][level, ip, lidx] = sum_data
157157
end
158158
elseif gidx nlevels * nfidx * (nlocalvertices + nlocalfaces) # interior faces
159159
nfacedof = div(nperimeter - 4, 4)
@@ -169,10 +169,10 @@ function dss_local_kernel!(
169169
ip1 = inc1 == 1 ? first1 + i - 1 : first1 - i + 1
170170
ip2 = inc2 == 1 ? first2 + i - 1 : first2 - i + 1
171171
val =
172-
pperimeter_data[level, ip1, fidx, lidx1] +
173-
pperimeter_data[level, ip2, fidx, lidx2]
174-
pperimeter_data[level, ip1, fidx, lidx1] = val
175-
pperimeter_data[level, ip2, fidx, lidx2] = val
172+
pperimeter_data.arrays[fidx][level, ip1, lidx1] +
173+
pperimeter_data.arrays[fidx][level, ip2, lidx2]
174+
pperimeter_data.arrays[fidx][level, ip1, lidx1] = val
175+
pperimeter_data.arrays[fidx][level, ip2, lidx2] = val
176176
end
177177
end
178178

@@ -683,7 +683,7 @@ function load_from_recv_buffer_kernel!(
683683
lidx = recv_buf_idx[irecv, 1]
684684
ip = recv_buf_idx[irecv, 2]
685685
idx = level + ((fidx - 1) + (irecv - 1) * nfid) * nlevels
686-
CUDA.@atomic pperimeter_data[level, ip, fidx, lidx] += recv_data[idx]
686+
CUDA.@atomic pperimeter_data.arrays[fidx][level, ip, lidx] += recv_data[idx]
687687
end
688688
return nothing
689689
end

0 commit comments

Comments
 (0)