
Commit 223000d: Merge branch 'CliMA:main' into main
2 parents: 673af0b + 9f94496

24 files changed: +326 / -455 lines

.buildkite/pipeline.yml

Lines changed: 16 additions & 0 deletions

```diff
@@ -1179,6 +1179,14 @@ steps:
       key: unit_spectralelement2d
       command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_spectralelement2d.jl"

+    - label: "Unit: spectralelement2d"
+      key: unit_spectralelement2d_gpu
+      command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_spectralelement2d.jl"
+      env:
+        CLIMACOMMS_DEVICE: "CUDA"
+      agents:
+        slurm_gpus: 1
+
     - label: "Unit: hybrid2dbox"
       key: unit_hybrid2dbox
       command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_hybrid2dbox.jl"
@@ -1203,6 +1211,14 @@ steps:
       key: unit_hybrid3dcubedsphere_topography
       command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_hybrid3dcubedsphere_topography.jl"

+    - label: "Unit: hybrid3dcubedsphere topography"
+      key: unit_hybrid3dcubedsphere_topography_gpu
+      command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_hybrid3dcubedsphere_topography.jl"
+      env:
+        CLIMACOMMS_DEVICE: "CUDA"
+      agents:
+        slurm_gpus: 1
+
     - label: "Unit: finitedifference"
       key: unit_finitedifference
       command: "julia --color=yes --check-bounds=yes --project=.buildkite test/InputOutput/unit_finitedifference.jl"
```

NEWS.md

Lines changed: 24 additions & 0 deletions

```diff
@@ -4,6 +4,30 @@ ClimaCore.jl Release Notes
 main
 -------

+- Fixed missing method for `Topologies.mesh(Topology2D)`
+  [2288](https://github.com/CliMA/ClimaCore.jl/pull/2288).
+
+v0.14.31
+-------
+
+- GPU performance should now be more robust
+  [2296](https://github.com/CliMA/ClimaCore.jl/pull/2296).
+
+- Remapping is now protected for masked operations
+  [2292](https://github.com/CliMA/ClimaCore.jl/pull/2292).
+
+- Shmem support for InterpolateC2F was added
+  [2290](https://github.com/CliMA/ClimaCore.jl/pull/2290).
+
+- Some masked operations were fixed
+  [2285](https://github.com/CliMA/ClimaCore.jl/pull/2285).
+
+- Internal refactoring should help reduce latency and improve GPU performance
+  [2284](https://github.com/CliMA/ClimaCore.jl/pull/2284).
+
+- Masks now support restart
+  [2212](https://github.com/CliMA/ClimaCore.jl/pull/2212).
+
 v0.14.30
 -------
```
Project.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,7 +1,7 @@
 name = "ClimaCore"
 uuid = "d414da3d-4745-48bb-8d80-42e94e092884"
 authors = ["CliMA Contributors <clima-software@caltech.edu>"]
-version = "0.14.30"
+version = "0.14.31"

 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
```

docs/src/shmem_design.md

Lines changed: 29 additions & 1 deletion

```diff
@@ -35,12 +35,40 @@ The high-level view of the design is:
   (different operators require different arguments, and therefore different
   types and amounts of shmem).
 - Recursively fill the shmem for all `StencilBroadcasted`. This is done
-  by reading the argument data from `getidx`
+  by reading the argument data from `getidx`. See the discussion below for more details.
 - The destination field is filled with the result of `getidx` (as it is without
   shmem), except that we overload `getidx` (for supported `StencilBroadcasted`
   types) to retrieve the result of `getidx` via `fd_operator_evaluate`, which
   retrieves the result from the shmem, instead of global memory.

+### Populating shared memory, and memory access safety

+We use tail-recursion when filling shared memory of the broadcast expressions.
+That is, we visit the leaves of the broadcast expression first, then work our
+way up. It's important to note that `StencilBroadcasted` and `Broadcasted`
+objects can be interleaved.

+Let's take `DivergenceF2C()(f * GradientC2F()(a * b))` as an example (depicted
+in the image below).

+Recursion must go through the entire expression in order to ensure that we've
+reached all of the leaves of the `StencilBroadcasted` objects (otherwise, we
+could introduce race conditions with memory access). The leaves of the
+`StencilBroadcasted` will call `getidx`, below which there are (by definition)
+no more `StencilBroadcasted`, and those `getidx` calls will read from global
+memory. All subsequent reads will be from shmem (as they will be caught by the
+`getidx(parent_space, bc::StencilBroadcasted{CUDAWithShmemColumnStencilStyle}, idx, hidx)`
+method defined in the `ClimaCoreCUDAExt` module).
+
+In the diagram below, we traverse and fill the yellow highlighted sections
+(bottom first and top last). The algorithmic impact of using shared memory is
+that the duplicate global memory reads (highlighted in red circles) become one
+global memory read (performed in `fd_operator_fill_shmem!`).
+
+Finally, it's important to note that threads must be synchronized after each
+node in the tree is filled, to avoid race conditions in subsequent
+`getidx(parent_space, bc::StencilBroadcasted{CUDAWithShmemColumnStencilStyle}, idx, hidx)`
+calls (which are retrieved via shmem).
+
+![](shmem_diagram_example.png)
```
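To make the fill order concrete, here is a self-contained toy sketch of the post-order ("children first") traversal the new doc section describes. `Leaf`, `Node`, and `fill_shmem!` are illustrative stand-ins, not ClimaCore internals:

```julia
# Toy stand-ins for nodes of a broadcast expression tree.
struct Leaf
    name::String
end
struct Node
    name::String
    args::Tuple
end

fill_shmem!(leaf::Leaf) = println("read ", leaf.name, " from global memory")
function fill_shmem!(node::Node)
    foreach(fill_shmem!, node.args)  # recurse into children first
    # In ClimaCore, only StencilBroadcasted nodes fill shmem at this point
    # (via fd_operator_fill_shmem!), followed by CUDA.sync_threads() so that
    # later getidx calls can safely read the shmem.
    println("fill shmem for ", node.name, ", then sync threads")
end

# DivergenceF2C()(f * GradientC2F()(a * b))
ab     = Node("*", (Leaf("a"), Leaf("b")))
grad   = Node("GradientC2F", (ab,))
fgrad  = Node("*", (Leaf("f"), grad))
divf2c = Node("DivergenceF2C", (fgrad,))
fill_shmem!(divf2c)
```

Running this prints the leaves (global-memory reads) before each parent node, bottom first and top last, matching the traversal order in the diagram.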

docs/src/shmem_diagram_example.png

Binary file added (215 KB): the shmem traversal diagram referenced above.

ext/cuda/data_layouts.jl

Lines changed: 6 additions & 0 deletions

```diff
@@ -27,6 +27,12 @@ Base.similar(
     dims::Dims{N},
 ) where {T, N, B} = similar(CUDA.CuArray{T, N, B}, dims)

+unval(::Val{CI}) where {CI} = CI
+unval(CI) = CI
+
+@inline linear_thread_idx() =
+    threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+
 include("data_layouts_fill.jl")
 include("data_layouts_copyto.jl")
 include("data_layouts_fused_copyto.jl")
```

ext/cuda/data_layouts_copyto.jl

Lines changed: 25 additions & 14 deletions

```diff
@@ -1,19 +1,20 @@
 DataLayouts.device_dispatch(x::CUDA.CuArray) = ToCUDA()

-function knl_copyto!(dest, src, us, mask)
-    I = if mask isa NoMask
-        universal_index(dest)
-    else
-        masked_universal_index(mask)
-    end
-    if is_valid_index(dest, I, us)
+function knl_copyto!(dest, src, us, mask, cart_inds)
+    tidx = linear_thread_idx()
+    if linear_is_valid_index(tidx, us) && tidx ≤ length(unval(cart_inds))
+        I = if mask isa NoMask
+            unval(cart_inds)[tidx]
+        else
+            masked_universal_index(mask, cart_inds)
+        end
         @inbounds dest[I] = src[I]
     end
     return nothing
 end

 function knl_copyto_linear!(dest, src, us)
-    i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    i = linear_thread_idx()
     if linear_is_valid_index(i, us)
         @inbounds dest[i] = src[i]
     end
@@ -32,13 +33,18 @@ if VERSION ≥ v"1.11.0-beta"
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        args = (dest, bc, us, mask)
+        cart_inds = if mask isa NoMask
+            cartesian_indices(us)
+        else
+            cartesian_indicies_mask(us, mask)
+        end
+        args = (dest, bc, us, mask, cart_inds)
         threads = threads_via_occupancy(knl_copyto!, args)
         n_max_threads = min(threads, get_N(us))
         p = if mask isa NoMask
-            partition(dest, n_max_threads)
+            linear_partition(prod(size(dest)), n_max_threads)
         else
-            masked_partition(us, n_max_threads, mask)
+            masked_partition(mask, n_max_threads, us)
         end
         auto_launch!(
             knl_copyto!,
@@ -72,13 +78,18 @@
             blocks_s = p.blocks,
         )
     else
-        args = (dest, bc, us, mask)
+        cart_inds = if mask isa NoMask
+            cartesian_indices(us)
+        else
+            cartesian_indicies_mask(us, mask)
+        end
+        args = (dest, bc, us, mask, cart_inds)
         threads = threads_via_occupancy(knl_copyto!, args)
         n_max_threads = min(threads, get_N(us))
         p = if mask isa NoMask
-            partition(dest, n_max_threads)
+            linear_partition(prod(size(dest)), n_max_threads)
         else
-            masked_partition(us, n_max_threads, mask)
+            masked_partition(mask, n_max_threads, us)
         end
         auto_launch!(
             knl_copyto!,
```
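The change replaces the per-kernel `universal_index` computation with a precomputed `CartesianIndices` object indexed by a flat thread id. A CPU-side sketch of the scheme (illustrative only; in the real kernels each `tidx` is one GPU thread):

```julia
src = rand(4, 4, 10, 1, 6)          # (Ni, Nj, Nv, Nf, Nh)-shaped data
dest = zeros(size(src))
cart_inds = CartesianIndices(dest)  # cf. cartesian_indices(us)
for tidx in 1:length(cart_inds)     # on the GPU: tidx = linear_thread_idx()
    I = cart_inds[tidx]             # flat id -> CartesianIndex, no divisions
    dest[I] = src[I]
end
@assert dest == src
```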

ext/cuda/data_layouts_fill.jl

Lines changed: 18 additions & 12 deletions

```diff
@@ -1,25 +1,26 @@
-function knl_fill!(dest, val, us, mask)
-    I = if mask isa NoMask
-        universal_index(dest)
-    else
-        masked_universal_index(mask)
-    end
-    if is_valid_index(dest, I, us)
+function knl_fill!(dest, val, us, mask, cart_inds)
+    tidx = linear_thread_idx()
+    if linear_is_valid_index(tidx, us) && tidx ≤ length(unval(cart_inds))
+        I = if mask isa NoMask
+            unval(cart_inds)[tidx]
+        else
+            masked_universal_index(mask, cart_inds)
+        end
         @inbounds dest[I] = val
     end
     return nothing
 end

 function knl_fill_linear!(dest, val, us)
-    i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    i = linear_thread_idx()
     if linear_is_valid_index(i, us)
         @inbounds dest[i] = val
     end
     return nothing
 end

 function Base.fill!(dest::AbstractData, bc, to::ToCUDA, mask = NoMask())
-    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
+    (Ni, Nj, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
         if !(VERSION ≥ v"1.11.0-beta") &&
@@ -36,13 +37,18 @@ function Base.fill!(dest::AbstractData, bc, to::ToCUDA, mask = NoMask())
             blocks_s = p.blocks,
         )
     else
-        args = (dest, bc, us, mask)
+        cart_inds = if mask isa NoMask
+            cartesian_indices(us)
+        else
+            cartesian_indicies_mask(us, mask)
+        end
+        args = (dest, bc, us, mask, cart_inds)
         threads = threads_via_occupancy(knl_fill!, args)
         n_max_threads = min(threads, get_N(us))
         p = if mask isa NoMask
-            partition(dest, n_max_threads)
+            linear_partition(prod(size(dest)), n_max_threads)
         else
-            masked_partition(us, n_max_threads, mask)
+            masked_partition(mask, n_max_threads, us)
         end
         auto_launch!(
             knl_fill!,
```
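For the masked branch, only the active indices get threads. A toy, CPU-only version of the idea behind `masked_partition`/`masked_universal_index` (the mask representation here is a hypothetical simplification, not the ClimaCore data structure):

```julia
dest = zeros(4, 4)
# Conceptually, a mask provides a compact list of active CartesianIndices.
active = [CartesianIndex(1, 1), CartesianIndex(3, 2), CartesianIndex(4, 4)]
for tidx in 1:length(active)  # one GPU thread per *active* index
    I = active[tidx]          # cf. masked_universal_index(mask, cart_inds)
    dest[I] = 1.0
end
@assert count(==(1.0), dest) == length(active)
```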

ext/cuda/data_layouts_fused_copyto.jl

Lines changed: 25 additions & 16 deletions

```diff
@@ -1,36 +1,44 @@
 Base.@propagate_inbounds function rcopyto_at!(
     pair::Pair{<:AbstractData, <:Any},
-    I,
+    cart_inds,
+    tidx,
     us,
 )
     dest, bc = pair.first, pair.second
-    if is_valid_index(dest, I, us)
+    if linear_is_valid_index(tidx, us) && tidx ≤ length(unval(cart_inds))
+        I = unval(cart_inds)[tidx]
         dest[I] = isascalar(bc) ? bc[] : bc[I]
     end
     return nothing
 end
-Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, us)
+Base.@propagate_inbounds function rcopyto_at!(
+    pair::Pair{<:DataF, <:Any},
+    cart_inds,
+    tidx,
+    us,
+)
     dest, bc = pair.first, pair.second
-    if is_valid_index(dest, I, us)
+    if linear_is_valid_index(tidx, us) && tidx ≤ length(unval(cart_inds))
+        I = unval(cart_inds)[tidx]
         bcI = isascalar(bc) ? bc[] : bc[I]
         dest[] = bcI
     end
     return nothing
 end
-Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, us)
-    rcopyto_at!(first(pairs), I, us)
-    rcopyto_at!(Base.tail(pairs), I, us)
+Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, cart_inds, tidx, us)
+    rcopyto_at!(first(pairs), cart_inds, tidx, us)
+    rcopyto_at!(Base.tail(pairs), cart_inds, tidx, us)
 end
-Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, us) =
-    rcopyto_at!(first(pairs), I, us)
-@inline rcopyto_at!(pairs::Tuple{}, I, us) = nothing
+Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, cart_inds, tidx, us) =
+    rcopyto_at!(first(pairs), cart_inds, tidx, us)
+@inline rcopyto_at!(pairs::Tuple{}, cart_inds, tidx, us) = nothing

-function knl_fused_copyto!(fmbc::FusedMultiBroadcast, dest1, us)
+function knl_fused_copyto!(fmbc::FusedMultiBroadcast, dest1, us, cart_inds)
     @inbounds begin
-        I = universal_index(dest1)
-        if is_valid_index(dest1, I, us)
+        tidx = linear_thread_idx()
+        if linear_is_valid_index(tidx, us) && tidx ≤ length(unval(cart_inds))
             (; pairs) = fmbc
-            rcopyto_at!(pairs, I, us)
+            rcopyto_at!(pairs, cart_inds, tidx, us)
         end
     end
     return nothing
@@ -138,10 +146,11 @@ function launch_fused_copyto!(fmb::FusedMultiBroadcast)
         blocks_s = p.blocks,
     )
     else
-        args = (fmb, dest1, us)
+        cart_inds = cartesian_indices(us)
+        args = (fmb, dest1, us, cart_inds)
         threads = threads_via_occupancy(knl_fused_copyto!, args)
         n_max_threads = min(threads, get_N(us))
-        p = partition(dest1, n_max_threads)
+        p = linear_partition(prod(size(dest1)), n_max_threads)
         auto_launch!(
             knl_fused_copyto!,
             args;
```
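The `Tuple` methods of `rcopyto_at!` above unroll the heterogeneous list of `dest => src` pairs at compile time by recursing on `first`/`Base.tail`, avoiding a dynamic loop in the kernel. A self-contained CPU sketch of that dispatch pattern (the name `at!` and the plain-array pairs are illustrative):

```julia
# One method per tuple shape; dispatch peels off one pair at a time.
at!(pair::Pair, I) = (pair.first[I] = pair.second[I]; nothing)
at!(pairs::Tuple, I) = (at!(first(pairs), I); at!(Base.tail(pairs), I))
at!(pairs::Tuple{<:Any}, I) = at!(first(pairs), I)  # single-pair base case
at!(pairs::Tuple{}, I) = nothing                    # empty base case

a, b = zeros(3), zeros(3)
x, y = [1.0, 2.0, 3.0], [4.0, 5.0, 6.0]
for I in eachindex(a)      # on the GPU: one I (or tidx) per thread
    at!((a => x, b => y), I)
end
@assert a == x && b == y
```

Because the tuple's length and element types are known to the compiler, the recursion is fully inlined, which is why the fused kernel can handle many destinations without per-pair branching at runtime.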
