Skip to content

Commit 2d8bb29

Browse files
Drop field dimension to demo thermo benchmark
Rename TupleOfArrays to FieldArrays Apply formatter wip Pass some unit tests, cleanup wip wip wip Apply formatter
1 parent d0680b8 commit 2d8bb29

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+1947
-832
lines changed

.buildkite/pipeline.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -478,15 +478,15 @@ steps:
478478
key: unit_field
479479
command:
480480
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/unit_field.jl"
481-
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
481+
# - "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
482482
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/convergence_field_integrals.jl"
483483

484484
- label: "Unit: field cuda"
485485
key: unit_field_cuda
486486
command:
487487
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
488488
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/unit_field.jl"
489-
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
489+
# - "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
490490
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/convergence_field_integrals.jl"
491491
env:
492492
CLIMACOMMS_DEVICE: "CUDA"
@@ -727,7 +727,7 @@ steps:
727727
command:
728728
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/unit_thomas_algorithm.jl"
729729

730-
- label: "Unit: Thomas Algorithm"
730+
- label: "Unit: Thomas Algorithm (CUDA)"
731731
key: "gpu_thomas_algorithm"
732732
command:
733733
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/unit_thomas_algorithm.jl"

benchmarks/scripts/thermo_bench_bw.jl

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ using Test
180180
)
181181
x = fill((; ts = zero(TBB.PhaseEquil{FT}), nt_core...), cspace)
182182
xv = fill((; ts = nt_ts, nt_core...), cspace)
183-
(_, Nij, _, Nv, Nh) = size(Fields.field_values(x.ts))
183+
fv_ts = Fields.field_values(x.ts)
184+
(_, Nij, _, Nv, Nh) = size(fv_ts)
184185
us = TBB.UniversalSizesStatic(Nv, Nij, Nh)
185186
function to_vec(ξ)
186187
pns = propertynames(ξ)
@@ -191,7 +192,7 @@ using Test
191192
end
192193
return (; zip(propertynames(ξ), dl_vals)...)
193194
end
194-
x_vec = to_vec(xv)
195+
# x_vec = to_vec(xv)
195196

196197
x_aos = fill((; ρ_read = FT(0), ρ_write = FT(0)), cspace)
197198
x_soa = (;
@@ -204,20 +205,21 @@ using Test
204205
@. x_aos.ρ_write = 7
205206
TBB.singlefield_bc!(x_soa, us; nreps=1, n_trials = 1)
206207
TBB.singlefield_bc!(x_aos, us; nreps=1, n_trials = 1)
207-
208+
208209
TBB.thermo_func_bc!(x, us; nreps=1, n_trials = 1)
209-
TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)
210+
# TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)
210211

211-
rc = Fields.rcompare(x_vec, to_vec(x))
212-
rc || Fields.@rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
213-
@test rc # test correctness
212+
# rc = Fields.rcompare(x_vec, to_vec(x))
213+
# rc || Fields.@rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
214+
# @test rc # test correctness
214215

215-
TBB.singlefield_bc!(x_soa, us; nreps=100, bm)
216-
TBB.singlefield_bc!(x_aos, us; nreps=100, bm)
216+
# TBB.singlefield_bc!(x_soa, us; nreps=100, bm)
217+
# TBB.singlefield_bc!(x_aos, us; nreps=100, bm)
217218
TBB.thermo_func_bc!(x, us; nreps=100, bm)
218-
TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)
219+
@info "Success!"
220+
# TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)
219221

220222
TBB.tabulate_benchmark(bm)
221223

222-
end
224+
# end
223225
#! format: on

ext/ClimaCoreCUDAExt.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ import ClimaCore.Utilities: cart_ind, linear_ind
1717
import ClimaCore.RecursiveApply:
1818
, , , radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
1919
import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
20+
import ClimaCore.DataLayouts: universal_size, UniversalSize
21+
import ClimaCore.DataLayouts: ArraySize
2022

2123
include(joinpath("cuda", "cuda_utils.jl"))
2224
include(joinpath("cuda", "data_layouts.jl"))

ext/cuda/data_layouts.jl

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,17 @@ import CUDA
1313
parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}) where {T, B} =
1414
CUDA.CuArray{T, N, B} where {N}
1515

16+
# Can we remove this?
17+
# parent_array_type(
18+
# ::Type{<:CUDA.CuArray{T, N, B} where {N}},
19+
# ::Val{ND},
20+
# ) where {T, B, ND} = CUDA.CuArray{T, ND, B}
21+
22+
parent_array_type(
23+
::Type{<:CUDA.CuArray{T, N, B} where {N}},
24+
as::ArraySize,
25+
) where {T, B} = CUDA.CuArray{T, ndims(as), B}
26+
1627
# Ensure that both parent array types have the same memory buffer type.
1728
promote_parent_array_type(
1829
::Type{CUDA.CuArray{T1, N, B} where {N}},
@@ -53,3 +64,16 @@ function Adapt.adapt_structure(
5364
end,
5465
)
5566
end
67+
68+
import Adapt
69+
import CUDA
70+
function Adapt.adapt_structure(
71+
to::CUDA.KernelAdaptor,
72+
bc::DataLayouts.NonExtrudedBroadcasted{Style},
73+
) where {Style}
74+
DataLayouts.NonExtrudedBroadcasted{Style}(
75+
adapt_f(to, bc.f),
76+
Adapt.adapt(to, bc.args),
77+
Adapt.adapt(to, bc.axes),
78+
)
79+
end

ext/cuda/data_layouts_copyto.jl

Lines changed: 56 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,87 +1,60 @@
1+
import ClimaCore.DataLayouts:
2+
to_non_extruded_broadcasted, has_uniform_datalayouts
13
DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()
24

3-
function knl_copyto!(dest, src)
5+
# function Base.copyto!(
6+
# dest::VIJFH{S, Nv, Nij, Nh},
7+
# bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
8+
# ::ToCUDA,
9+
# ) where {S, Nv, Nij, Nh}
10+
# if Nv > 0 && Nh > 0
11+
# us = DataLayouts.UniversalSize(dest)
12+
# n = prod(DataLayouts.universal_size(us))
13+
# if has_uniform_datalayouts(bc)
14+
# bc′ = to_non_extruded_broadcasted(bc)
15+
# auto_launch!(knl_copyto_linear!, (dest, bc′, us), n; auto = true)
16+
# else
17+
# auto_launch!(knl_copyto_cart!, (dest, bc, us), n; auto = true)
18+
# end
19+
# end
20+
# return dest
21+
# end
422

5-
i = CUDA.threadIdx().x
6-
j = CUDA.threadIdx().y
7-
8-
h = CUDA.blockIdx().x
9-
v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
10-
11-
if v <= size(dest, 4)
12-
I = CartesianIndex((i, j, 1, v, h))
13-
@inbounds dest[I] = src[I]
23+
function knl_copyto_linear!(dest::AbstractData, bc, us)
24+
@inbounds begin
25+
tidx = thread_index()
26+
if tidx get_N(us)
27+
dest[tidx] = bc[tidx]
28+
end
1429
end
1530
return nothing
1631
end
1732

18-
function Base.copyto!(
19-
dest::IJFH{S, Nij, Nh},
20-
bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
21-
::ToCUDA,
22-
) where {S, Nij, Nh}
23-
if Nh > 0
24-
auto_launch!(
25-
knl_copyto!,
26-
(dest, bc);
27-
threads_s = (Nij, Nij),
28-
blocks_s = (Nh, 1),
29-
)
30-
end
31-
return dest
32-
end
33-
34-
function Base.copyto!(
35-
dest::VIJFH{S, Nv, Nij, Nh},
36-
bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
37-
::ToCUDA,
38-
) where {S, Nv, Nij, Nh}
39-
if Nv > 0 && Nh > 0
40-
Nv_per_block = min(Nv, fld(256, Nij * Nij))
41-
Nv_blocks = cld(Nv, Nv_per_block)
42-
auto_launch!(
43-
knl_copyto!,
44-
(dest, bc);
45-
threads_s = (Nij, Nij, Nv_per_block),
46-
blocks_s = (Nh, Nv_blocks),
47-
)
48-
end
49-
return dest
33+
function knl_copyto_linear!(dest::DataF, bc, us)
34+
tidx = thread_index()
35+
@inbounds dest[] = bc[tidx]
36+
return nothing
5037
end
5138

52-
function Base.copyto!(
53-
dest::VF{S, Nv},
54-
bc::DataLayouts.BroadcastedUnionVF{S, Nv},
55-
::ToCUDA,
56-
) where {S, Nv}
57-
if Nv > 0
58-
auto_launch!(
59-
knl_copyto!,
60-
(dest, bc);
61-
threads_s = (1, 1),
62-
blocks_s = (1, Nv),
63-
)
39+
function knl_copyto_flat!(dest::AbstractData, bc, us)
40+
@inbounds begin
41+
tidx = thread_index()
42+
if tidx get_N(us)
43+
n = size(dest)
44+
I = kernel_indexes(tidx, n)
45+
dest[I] = bc[I]
46+
end
6447
end
65-
return dest
66-
end
67-
68-
function Base.copyto!(
69-
dest::DataF{S},
70-
bc::DataLayouts.BroadcastedUnionDataF{S},
71-
::ToCUDA,
72-
) where {S}
73-
auto_launch!(knl_copyto!, (dest, bc); threads_s = (1, 1), blocks_s = (1, 1))
74-
return dest
48+
return nothing
7549
end
7650

77-
import ClimaCore.DataLayouts: isascalar
78-
function knl_copyto_flat!(dest::AbstractData, bc, us)
51+
function knl_copyto_flat!(dest::DataF, bc, us)
7952
@inbounds begin
8053
tidx = thread_index()
8154
if tidx get_N(us)
8255
n = size(dest)
8356
I = kernel_indexes(tidx, n)
84-
dest[I] = bc[I]
57+
dest[] = bc[I]
8558
end
8659
end
8760
return nothing
@@ -90,22 +63,32 @@ end
9063
function cuda_copyto!(dest::AbstractData, bc)
9164
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
9265
us = DataLayouts.UniversalSize(dest)
66+
n = prod(DataLayouts.universal_size(us))
9367
if Nv > 0 && Nh > 0
94-
nitems = prod(DataLayouts.universal_size(dest))
95-
auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
68+
if has_uniform_datalayouts(bc)
69+
bc′ = to_non_extruded_broadcasted(bc)
70+
auto_launch!(
71+
knl_copyto_linear!,
72+
(dest, bc′, us),
73+
nitems;
74+
auto = true,
75+
)
76+
else
77+
auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
78+
end
9679
end
9780
return dest
9881
end
9982

10083
# TODO: can we use CUDA's luanch configuration for all data layouts?
10184
# Currently, it seems to have a slight performance degradation.
10285
#! format: off
103-
# Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
86+
Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
10487
Base.copyto!(dest::IFH{S, Ni, Nh}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh}, ::ToCUDA) where {S, Ni, Nh} = cuda_copyto!(dest, bc)
10588
Base.copyto!(dest::IJF{S, Nij}, bc::DataLayouts.BroadcastedUnionIJF{S, Nij}, ::ToCUDA) where {S, Nij} = cuda_copyto!(dest, bc)
10689
Base.copyto!(dest::IF{S, Ni}, bc::DataLayouts.BroadcastedUnionIF{S, Ni}, ::ToCUDA) where {S, Ni} = cuda_copyto!(dest, bc)
10790
Base.copyto!(dest::VIFH{S, Nv, Ni, Nh}, bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh}, ::ToCUDA) where {S, Nv, Ni, Nh} = cuda_copyto!(dest, bc)
108-
# Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
109-
# Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
110-
# Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
91+
Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
92+
Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
93+
Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
11194
#! format: on

ext/cuda/data_layouts_fill.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ function knl_fill_flat!(dest::AbstractData, val, us)
1010
return nothing
1111
end
1212

13+
function knl_fill_flat!(dest::DataF, val, us)
14+
@inbounds dest[] = val
15+
return nothing
16+
end
17+
1318
function cuda_fill!(dest::AbstractData, val)
1419
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
1520
us = DataLayouts.UniversalSize(dest)

0 commit comments

Comments
 (0)