
Commit 223000d: Merge branch 'CliMA:main' into main
2 parents: 673af0b + 9f94496

24 files changed: +326 / -455 lines

.buildkite/pipeline.yml

Lines changed: 16 additions & 0 deletions

```diff
@@ -1179,6 +1179,14 @@ steps:
       key: unit_spectralelement2d
       command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_spectralelement2d.jl"

+    - label: "Unit: spectralelement2d"
+      key: unit_spectralelement2d_gpu
+      command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_spectralelement2d.jl"
+      env:
+        CLIMACOMMS_DEVICE: "CUDA"
+      agents:
+        slurm_gpus: 1
+
     - label: "Unit: hybrid2dbox"
       key: unit_hybrid2dbox
       command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_hybrid2dbox.jl"
@@ -1203,6 +1211,14 @@ steps:
       key: unit_hybrid3dcubedsphere_topography
       command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_hybrid3dcubedsphere_topography.jl"

+    - label: "Unit: hybrid3dcubedsphere topography"
+      key: unit_hybrid3dcubedsphere_topography_gpu
+      command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_hybrid3dcubedsphere_topography.jl"
+      env:
+        CLIMACOMMS_DEVICE: "CUDA"
+      agents:
+        slurm_gpus: 1
+
     - label: "Unit: finitedifference"
       key: unit_finitedifference
       command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_finitedifference.jl"
```

NEWS.md

Lines changed: 24 additions & 0 deletions

```diff
@@ -4,6 +4,30 @@ ClimaCore.jl Release Notes
 main
 -------

+- Fixed missing method for `Topologies.mesh(Topology2D)`
+  [2288](https://github.com/CliMA/ClimaCore.jl/pull/2288).
+
+v0.14.31
+-------
+
+- GPU performance should now be more robust
+  [2296](https://github.com/CliMA/ClimaCore.jl/pull/2296).
+
+- Remapping is now protected for masked operations
+  [2292](https://github.com/CliMA/ClimaCore.jl/pull/2292).
+
+- Shmem support for InterpolateC2F was added
+  [2290](https://github.com/CliMA/ClimaCore.jl/pull/2290).
+
+- Some masked operations were fixed
+  [2285](https://github.com/CliMA/ClimaCore.jl/pull/2285).
+
+- Internal refactoring should help reduce latency and improve GPU performance
+  [2284](https://github.com/CliMA/ClimaCore.jl/pull/2284).
+
+- Masks now support restart
+  [2212](https://github.com/CliMA/ClimaCore.jl/pull/2212).
+
 v0.14.30
 -------
```
Project.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,7 +1,7 @@
 name = "ClimaCore"
 uuid = "d414da3d-4745-48bb-8d80-42e94e092884"
 authors = ["CliMA Contributors <clima-software@caltech.edu>"]
-version = "0.14.30"
+version = "0.14.31"

 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
```

docs/src/shmem_design.md

Lines changed: 29 additions & 1 deletion

```diff
@@ -35,12 +35,40 @@ The high-level view of the design is:
   (different operators require different arguments, and therefore different
   types and amounts of shmem).
 - Recursively fill the shmem for all `StencilBroadcasted`. This is done
-  by reading the argument data from `getidx`
+  by reading the argument data from `getidx`. See the discussion below for more details.
 - The destination field is filled with the result of `getidx` (as it is without
   shmem), except that we overload `getidx` (for supported `StencilBroadcasted`
   types) to retrieve the result of `getidx` via `fd_operator_evaluate`, which
   retrieves the result from the shmem, instead of global memory.

+### Populating shared memory, and memory access safety

+We use tail-recursion when filling shared memory of the broadcast expressions.
+That is, we visit the leaves of the broadcast expression first, then work our
+way up. It's important to note that `StencilBroadcasted` and `Broadcasted`
+objects can be interleaved.

+Let's take `DivergenceF2C()(f * GradientC2F()(a * b))` as an example (depicted
+in the image below).

+Recursion must go through the entire expression in order to ensure that we've
+reached all of the leaves of the `StencilBroadcasted` objects (otherwise, we
+could introduce race conditions with memory access). The leaves of the
+`StencilBroadcasted` will call `getidx`, below which there are (by definition)
+no more `StencilBroadcasted`, and those `getidx` calls will read from global
+memory. All subsequent reads will be from shmem (as they will be caught by the
+`getidx(parent_space, bc::StencilBroadcasted{CUDAWithShmemColumnStencilStyle}, idx, hidx)`
+method defined in the `ClimaCoreCUDAExt` module).
+
+In the diagram below, we traverse and fill the yellow highlighted sections
+(bottom first and top last). The algorithmic impact of using shared memory is
+that the duplicate global memory reads (highlighted in red circles) become one
+global memory read (performed in `fd_operator_fill_shmem!`).
+
+Finally, it's important to note that threads must be synchronized after each
+node in the tree is filled, to avoid race conditions in subsequent
+`getidx(parent_space, bc::StencilBroadcasted{CUDAWithShmemColumnStencilStyle}, idx, hidx)`
+calls (which are retrieved via shmem).
+
+![](shmem_diagram_example.png)
```
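To make the fill order concrete, here is a self-contained toy sketch of the post-order ("children first") traversal the new doc section describes. `Leaf`, `Node`, and `fill_shmem!` are illustrative stand-ins, not ClimaCore internals:

```julia
# Toy stand-ins for nodes of a broadcast expression tree.
struct Leaf
    name::String
end
struct Node
    name::String
    args::Tuple
end

fill_shmem!(leaf::Leaf) = println("read ", leaf.name, " from global memory")
function fill_shmem!(node::Node)
    foreach(fill_shmem!, node.args)  # recurse into children first
    # In ClimaCore, only StencilBroadcasted nodes fill shmem at this point
    # (via fd_operator_fill_shmem!), followed by CUDA.sync_threads() so that
    # later getidx calls can safely read the shmem.
    println("fill shmem for ", node.name, ", then sync threads")
end

# DivergenceF2C()(f * GradientC2F()(a * b))
ab     = Node("*", (Leaf("a"), Leaf("b")))
grad   = Node("GradientC2F", (ab,))
fgrad  = Node("*", (Leaf("f"), grad))
divf2c = Node("DivergenceF2C", (fgrad,))
fill_shmem!(divf2c)
```

Running this prints the leaves (global-memory reads) before each parent node, bottom first and top last, matching the traversal order in the diagram.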

docs/src/shmem_diagram_example.png

Binary file added (215 KB): the shmem traversal diagram referenced above.

ext/cuda/data_layouts.jl

Lines changed: 6 additions & 0 deletions

```diff
@@ -27,6 +27,12 @@ Base.similar(
     dims::Dims{N},
 ) where {T, N, B} = similar(CUDA.CuArray{T, N, B}, dims)

+unval(::Val{CI}) where {CI} = CI
+unval(CI) = CI
+
+@inline linear_thread_idx() =
+    threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+
 include("data_layouts_fill.jl")
 include("data_layouts_copyto.jl")
 include("data_layouts_fused_copyto.jl")
```

ext/cuda/data_layouts_copyto.jl

Lines changed: 25 additions & 14 deletions

```diff
@@ -1,19 +1,20 @@
 DataLayouts.device_dispatch(x::CUDA.CuArray) = ToCUDA()

-function knl_copyto!(dest, src, us, mask)
-    I = if mask isa NoMask
-        universal_index(dest)
-    else
-        masked_universal_index(mask)
-    end
-    if is_valid_index(dest, I, us)
+function knl_copyto!(dest, src, us, mask, cart_inds)
+    tidx = linear_thread_idx()
+    if linear_is_valid_index(tidx, us) && tidx ≤ length(unval(cart_inds))
+        I = if mask isa NoMask
+            unval(cart_inds)[tidx]
+        else
+            masked_universal_index(mask, cart_inds)
+        end
         @inbounds dest[I] = src[I]
     end
     return nothing
 end

 function knl_copyto_linear!(dest, src, us)
-    i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    i = linear_thread_idx()
     if linear_is_valid_index(i, us)
         @inbounds dest[i] = src[i]
     end
@@ -32,13 +33,18 @@ if VERSION ≥ v"1.11.0-beta"
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        args = (dest, bc, us, mask)
+        cart_inds = if mask isa NoMask
+            cartesian_indices(us)
+        else
+            cartesian_indicies_mask(us, mask)
+        end
+        args = (dest, bc, us, mask, cart_inds)
         threads = threads_via_occupancy(knl_copyto!, args)
         n_max_threads = min(threads, get_N(us))
         p = if mask isa NoMask
-            partition(dest, n_max_threads)
+            linear_partition(prod(size(dest)), n_max_threads)
         else
-            masked_partition(us, n_max_threads, mask)
+            masked_partition(mask, n_max_threads, us)
         end
         auto_launch!(
             knl_copyto!,
@@ -72,13 +78,18 @@
             blocks_s = p.blocks,
         )
     else
-        args = (dest, bc, us, mask)
+        cart_inds = if mask isa NoMask
+            cartesian_indices(us)
+        else
+            cartesian_indicies_mask(us, mask)
+        end
+        args = (dest, bc, us, mask, cart_inds)
         threads = threads_via_occupancy(knl_copyto!, args)
         n_max_threads = min(threads, get_N(us))
         p = if mask isa NoMask
-            partition(dest, n_max_threads)
+            linear_partition(prod(size(dest)), n_max_threads)
         else
-            masked_partition(us, n_max_threads, mask)
+            masked_partition(mask, n_max_threads, us)
         end
         auto_launch!(
             knl_copyto!,
```
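The change replaces the per-kernel `universal_index` computation with a precomputed `CartesianIndices` object indexed by a flat thread id. A CPU-side sketch of the scheme (illustrative only; in the real kernels each `tidx` is one GPU thread):

```julia
src = rand(4, 4, 10, 1, 6)          # (Ni, Nj, Nv, Nf, Nh)-shaped data
dest = zeros(size(src))
cart_inds = CartesianIndices(dest)  # cf. cartesian_indices(us)
for tidx in 1:length(cart_inds)     # on the GPU: tidx = linear_thread_idx()
    I = cart_inds[tidx]             # flat id -> CartesianIndex, no divisions
    dest[I] = src[I]
end
@assert dest == src
```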

ext/cuda/data_layouts_fill.jl

Lines changed: 18 additions & 12 deletions

```diff
@@ -1,25 +1,26 @@
-function knl_fill!(dest, val, us, mask)
-    I = if mask isa NoMask
-        universal_index(dest)
-    else
-        masked_universal_index(mask)
-    end
-    if is_valid_index(dest, I, us)
+function knl_fill!(dest, val, us, mask, cart_inds)
+    tidx = linear_thread_idx()
+    if linear_is_valid_index(tidx, us) && tidx ≤ length(unval(cart_inds))
+        I = if mask isa NoMask
+            unval(cart_inds)[tidx]
+        else
+            masked_universal_index(mask, cart_inds)
+        end
         @inbounds dest[I] = val
     end
     return nothing
 end

 function knl_fill_linear!(dest, val, us)
-    i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    i = linear_thread_idx()
     if linear_is_valid_index(i, us)
         @inbounds dest[i] = val
     end
     return nothing
 end

 function Base.fill!(dest::AbstractData, bc, to::ToCUDA, mask = NoMask())
-    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
+    (Ni, Nj, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
         if !(VERSION ≥ v"1.11.0-beta") &&
@@ -36,13 +37,18 @@ function Base.fill!(dest::AbstractData, bc, to::ToCUDA, mask = NoMask())
             blocks_s = p.blocks,
         )
     else
-        args = (dest, bc, us, mask)
+        cart_inds = if mask isa NoMask
+            cartesian_indices(us)
+        else
+            cartesian_indicies_mask(us, mask)
+        end
+        args = (dest, bc, us, mask, cart_inds)
         threads = threads_via_occupancy(knl_fill!, args)
         n_max_threads = min(threads, get_N(us))
         p = if mask isa NoMask
-            partition(dest, n_max_threads)
+            linear_partition(prod(size(dest)), n_max_threads)
         else
-            masked_partition(us, n_max_threads, mask)
+            masked_partition(mask, n_max_threads, us)
         end
         auto_launch!(
             knl_fill!,
```
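For the masked branch, only the active indices get threads. A toy, CPU-only version of the idea behind `masked_partition`/`masked_universal_index` (the mask representation here is a hypothetical simplification, not the ClimaCore data structure):

```julia
dest = zeros(4, 4)
# Conceptually, a mask provides a compact list of active CartesianIndices.
active = [CartesianIndex(1, 1), CartesianIndex(3, 2), CartesianIndex(4, 4)]
for tidx in 1:length(active)  # one GPU thread per *active* index
    I = active[tidx]          # cf. masked_universal_index(mask, cart_inds)
    dest[I] = 1.0
end
@assert count(==(1.0), dest) == length(active)
```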

ext/cuda/data_layouts_fused_copyto.jl

Lines changed: 25 additions & 16 deletions

```diff
@@ -1,36 +1,44 @@
 Base.@propagate_inbounds function rcopyto_at!(
     pair::Pair{<:AbstractData, <:Any},
-    I,
+    cart_inds,
+    tidx,
     us,
 )
     dest, bc = pair.first, pair.second
-    if is_valid_index(dest, I, us)
+    if linear_is_valid_index(tidx, us) && tidx ≤ length(unval(cart_inds))
+        I = unval(cart_inds)[tidx]
         dest[I] = isascalar(bc) ? bc[] : bc[I]
     end
     return nothing
 end
-Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, us)
+Base.@propagate_inbounds function rcopyto_at!(
+    pair::Pair{<:DataF, <:Any},
+    cart_inds,
+    tidx,
+    us,
+)
     dest, bc = pair.first, pair.second
-    if is_valid_index(dest, I, us)
+    if linear_is_valid_index(tidx, us) && tidx ≤ length(unval(cart_inds))
+        I = unval(cart_inds)[tidx]
         bcI = isascalar(bc) ? bc[] : bc[I]
         dest[] = bcI
     end
     return nothing
 end
-Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, us)
-    rcopyto_at!(first(pairs), I, us)
-    rcopyto_at!(Base.tail(pairs), I, us)
+Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, cart_inds, tidx, us)
+    rcopyto_at!(first(pairs), cart_inds, tidx, us)
+    rcopyto_at!(Base.tail(pairs), cart_inds, tidx, us)
 end
-Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, us) =
-    rcopyto_at!(first(pairs), I, us)
-@inline rcopyto_at!(pairs::Tuple{}, I, us) = nothing
+Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, cart_inds, tidx, us) =
+    rcopyto_at!(first(pairs), cart_inds, tidx, us)
+@inline rcopyto_at!(pairs::Tuple{}, cart_inds, tidx, us) = nothing

-function knl_fused_copyto!(fmbc::FusedMultiBroadcast, dest1, us)
+function knl_fused_copyto!(fmbc::FusedMultiBroadcast, dest1, us, cart_inds)
     @inbounds begin
-        I = universal_index(dest1)
-        if is_valid_index(dest1, I, us)
+        tidx = linear_thread_idx()
+        if linear_is_valid_index(tidx, us) && tidx ≤ length(unval(cart_inds))
             (; pairs) = fmbc
-            rcopyto_at!(pairs, I, us)
+            rcopyto_at!(pairs, cart_inds, tidx, us)
         end
     end
     return nothing
@@ -138,10 +146,11 @@ function launch_fused_copyto!(fmb::FusedMultiBroadcast)
         blocks_s = p.blocks,
     )
     else
-        args = (fmb, dest1, us)
+        cart_inds = cartesian_indices(us)
+        args = (fmb, dest1, us, cart_inds)
         threads = threads_via_occupancy(knl_fused_copyto!, args)
         n_max_threads = min(threads, get_N(us))
-        p = partition(dest1, n_max_threads)
+        p = linear_partition(prod(size(dest1)), n_max_threads)
         auto_launch!(
             knl_fused_copyto!,
             args;
```
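The `Tuple` methods of `rcopyto_at!` above unroll the heterogeneous list of `dest => src` pairs at compile time by recursing on `first`/`Base.tail`, avoiding a dynamic loop in the kernel. A self-contained CPU sketch of that dispatch pattern (the name `at!` and the plain-array pairs are illustrative):

```julia
# One method per tuple shape; dispatch peels off one pair at a time.
at!(pair::Pair, I) = (pair.first[I] = pair.second[I]; nothing)
at!(pairs::Tuple, I) = (at!(first(pairs), I); at!(Base.tail(pairs), I))
at!(pairs::Tuple{<:Any}, I) = at!(first(pairs), I)  # single-pair base case
at!(pairs::Tuple{}, I) = nothing                    # empty base case

a, b = zeros(3), zeros(3)
x, y = [1.0, 2.0, 3.0], [4.0, 5.0, 6.0]
for I in eachindex(a)      # on the GPU: one I (or tidx) per thread
    at!((a => x, b => y), I)
end
@assert a == x && b == y
```

Because the tuple's length and element types are known to the compiler, the recursion is fully inlined, which is why the fused kernel can handle many destinations without per-pair branching at runtime.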
