CliMA
diff --git a/‎.buildkite/pipeline.yml
Lines changed: 10 additions & 0 deletions b/‎.buildkite/pipeline.yml
Lines changed: 10 additions & 0 deletions
diff --git a/‎docs/make.jl
Lines changed: 4 additions & 1 deletion b/‎docs/make.jl
Lines changed: 4 additions & 1 deletion
diff --git a/‎docs/src/shmem_design.md
Lines changed: 46 additions & 0 deletions b/‎docs/src/shmem_design.md
Lines changed: 46 additions & 0 deletions
diff --git a/‎ext/ClimaCoreCUDAExt.jl
Lines changed: 2 additions & 0 deletions b/‎ext/ClimaCoreCUDAExt.jl
Lines changed: 2 additions & 0 deletions
diff --git a/‎ext/cuda/data_layouts_threadblock.jl
Lines changed: 30 additions & 0 deletions b/‎ext/cuda/data_layouts_threadblock.jl
Lines changed: 30 additions & 0 deletions
diff --git a/‎ext/cuda/operators_fd_shmem.jl
Lines changed: 94 additions & 0 deletions b/‎ext/cuda/operators_fd_shmem.jl
Lines changed: 94 additions & 0 deletions
@@ -610,6 +610,16 @@ steps:
         key: unit_spec_ops_plane
         command: "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/spectralelement/plane.jl"
 
+      - label: "Unit: FD operator (shmem)"
+        key: unit_fd_operator_shmem
+        command:
+          - "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/finitedifference/unit_fd_ops_shared_memory.jl"
+          - "julia --color=yes --project=.buildkite test/Operators/finitedifference/benchmark_fd_ops_shared_memory.jl"
+        env:
+          CLIMACOMMS_DEVICE: "CUDA"
+        agents:
+          slurm_gpus: 1
+
       - label: "Unit: column"
         key: unit_column
         command:
 
@@ -80,7 +80,10 @@ withenv("GKSwstype" => "nul") do
             "Remapping" => "remapping.md",
             "MatrixFields" => "matrix_fields.md",
             "API" => "api.md",
-            "Developer docs" => ["Performance tips" => "performance_tips.md"],
+            "Developer docs" => [
+                "Performance tips" => "performance_tips.md"
+                "Shared memory design" => "shmem_design.md"
+            ],
             "Tutorials" => [
                 joinpath("tutorials", tutorial * ".md") for
                 tutorial in TUTORIALS
 
@@ -0,0 +1,46 @@
+# Shared memory design
+
+ClimaCore stencil operators support staggered (or collocated) finite difference
+and interpolation operations. For example, the `DivergenceF2C` operator takes
+an argument that lives on the cell faces and the resulting divergence
+calculation lives on the cell centers. Such operations are effectively
+matrix-vector multiplication and are often a significant portion of the runtime
+cost for users.
+
+Here, we outline an optimization, shared memory (or, "shmem" for short), that we
+use to improve the performance of these operations.
+
+## Motivation
+
+A naive and simplified implementation of this operation looks like
+`div[i] = (f[i+1] - f[i]) / dz[i]`. Such a calculation on the GPU (or CPU)
+requires that `f[i]` be read from global memory twice: once to compute `div[i]`
+and once to compute `div[i-1]`. Moreover, if `f` is a `Broadcasted` object
+(`Broadcasted` objects behave like arrays, and support `f[i]` behavior), then
+each `f[i]` may itself require several reads and/or computations.
+
+Reading data from global memory is often the main bottleneck for
+bandwidth-limited CUDA kernels. As such, we use shmem to reduce the number of
+global memory reads (and compute) in our kernels.
+
+## High-level design
+
+The high-level view of the design is:
+
+ - The `bc::StencilBroadcasted` type has a `work` field, which is used to store
+   shmem for the `bc.op` operator. The element type of `work`
+   (or of each part of `work`, if there are multiple parts) is the type returned
+   by `Operators.return_eltype` for `bc.op`.
+ - Recursively reconstruct the broadcasted object, allocating shmem for
+   each `StencilBroadcasted` along the way that supports shmem
+   (different operators require different arguments, and therefore different
+   types and amounts of shmem).
+ - Recursively fill the shmem for all `StencilBroadcasted`. This is done
+   by reading the argument data via `getidx`.
+ - The destination field is filled with the result of `getidx` (as it is without
+   shmem), except that we overload `getidx` (for supported `StencilBroadcasted`
+   types) to compute its result via `fd_operator_evaluate`, which reads its
+   inputs from shmem instead of global memory.
+
+
+
+
@@ -33,6 +33,8 @@ include(joinpath("cuda", "operators_integral.jl"))
 include(joinpath("cuda", "remapping_interpolate_array.jl"))
 include(joinpath("cuda", "limiters.jl"))
 include(joinpath("cuda", "operators_sem_shmem.jl"))
+include(joinpath("cuda", "operators_fd_shmem_common.jl"))
+include(joinpath("cuda", "operators_fd_shmem.jl"))
 include(joinpath("cuda", "matrix_fields_single_field_solve.jl"))
 include(joinpath("cuda", "matrix_fields_multiple_field_solve.jl"))
 include(joinpath("cuda", "operators_spectral_element.jl"))
 
@@ -313,3 +313,33 @@ end
     ij,
     slabidx,
 ) = Operators.is_valid_index(space, ij, slabidx)
+
+##### shmem fd kernel partition
+# Build the launch configuration (threads / blocks) for the shmem
+# finite-difference stencil kernel.
+#
+# - `us`: universal size; `Nq`, `Nv` and `Nh` are read from it.
+# - `n_face_levels`: used directly as the number of threads per block.
+# - `n_max_threads`: accepted for interface compatibility; not consulted here.
+#
+# Returns a named tuple `(; threads, blocks, Nvthreads)` with
+# `threads = (Nvthreads,)` and `blocks = (Nh, Nvblocks, Nq * Nq)`.
+@inline function fd_stencil_partition(
+    us::DataLayouts.UniversalSize,
+    n_face_levels::Integer,
+    n_max_threads::Integer = 256;
+)
+    (Nq, _, _, Nv, Nh) = DataLayouts.universal_size(us)
+    Nvthreads = n_face_levels
+    # The whole face column must fit within the first entry of
+    # `maximum_allowable_threads()`.
+    thread_limit = maximum_allowable_threads()[1]
+    @assert Nvthreads <= thread_limit "Number of vertical face levels cannot exceed $(maximum_allowable_threads()[1])"
+    # +1 may be needed to guarantee that shared memory is populated at the
+    # last cell face
+    Nvblocks = cld(Nv, Nvthreads)
+    threads = (Nvthreads,)
+    blocks = (Nh, Nvblocks, Nq * Nq)
+    return (; threads, blocks, Nvthreads)
+end
+# Translate the current CUDA thread/block coordinates into a 5-component
+# `CartesianIndex((i, j, 1, v, h))`, matching the block layout
+# `(h, vertical block, ij)` and one thread per vertical level.
+#
+# `space` is not read in the body. When the third block coordinate exceeds
+# `Nq * Nq`, the sentinel `CartesianIndex((-1, -1, 1, -1, -1))` is returned.
+@inline function fd_stencil_universal_index(space::Spaces.AbstractSpace, us)
+    (tv,) = CUDA.threadIdx()
+    (h, bv, ij) = CUDA.blockIdx()
+    (Nq, _, _, _, _) = DataLayouts.universal_size(us)
+    # Guard clause: no (i, j) node corresponds to this block
+    ij > Nq * Nq && return CartesianIndex((-1, -1, 1, -1, -1))
+    # Vertical level = thread offset within the block + block offset
+    v = tv + (bv - 1) * CUDA.blockDim().x
+    @inbounds (i, j) = CartesianIndices((Nq, Nq))[ij].I
+    return CartesianIndex((i, j, 1, v, h))
+end
+# An index is considered valid when its fifth component lies in
+# `1:DataLayouts.get_Nh(us)`; the other components are not inspected here.
+@inline function fd_stencil_is_valid_index(I::CI5, us::UniversalSize)
+    h = I[5]
+    return 1 ≤ h ≤ DataLayouts.get_Nh(us)
+end
@@ -0,0 +1,94 @@
+import ClimaCore: DataLayouts, Spaces, Geometry, RecursiveApply, DataLayouts
+import CUDA
+import ClimaCore.Operators: return_eltype, get_local_geometry
+
+# Allocate the shmem buffer used by `DivergenceF2C`: a `CuStaticSharedArray`
+# of length `Nvt` whose element type is `return_eltype(op, args...)`.
+# `space` is not read in the body.
+Base.@propagate_inbounds function fd_operator_shmem(
+    space,
+    ::Val{Nvt},
+    op::Operators.DivergenceF2C,
+    args...,
+) where {Nvt}
+    T = return_eltype(op, args...)
+    return CUDA.CuStaticSharedArray(T, (Nvt,))
+end
+
+# Fill this thread's shmem slot for `DivergenceF2C` at an interior face:
+# evaluate `arg` at `(idx, hidx)` with `getidx`, convert it with
+# `Geometry.Jcontravariant3` using the local geometry at the same point, and
+# store the result at `Ju³[threadIdx().x]`. Returns `nothing`.
+Base.@propagate_inbounds function fd_operator_fill_shmem_interior!(
+    op::Operators.DivergenceF2C,
+    Ju³,
+    loc, # can be any location
+    space,
+    idx::Utilities.PlusHalf,
+    hidx,
+    arg,
+)
+    @inbounds begin
+        slot = threadIdx().x
+        geom = Geometry.LocalGeometry(space, idx, hidx)
+        Ju³[slot] = Geometry.Jcontravariant3(
+            Operators.getidx(space, arg, loc, idx, hidx),
+            geom,
+        )
+    end
+    return nothing
+end
+
+# Fill this thread's shmem slot for `DivergenceF2C` at the left boundary face
+# when a `SetValue` boundary condition applies. The stored value comes from
+# `bc.val` (not from `arg`), converted with `Geometry.Jcontravariant3` using
+# the local geometry at `(idx, hidx)`.
+#
+# Errors with "Incorrect left idx" unless `idx` equals
+# `Operators.left_face_boundary_idx(space)`. Returns `nothing`.
+Base.@propagate_inbounds function fd_operator_fill_shmem_left_boundary!(
+    op::Operators.DivergenceF2C,
+    bc::Operators.SetValue,
+    Ju³,
+    loc,
+    space,
+    idx::Utilities.PlusHalf,
+    hidx,
+    arg,
+)
+    if !(idx == Operators.left_face_boundary_idx(space))
+        error("Incorrect left idx")
+    end
+    @inbounds begin
+        slot = threadIdx().x
+        geom = Geometry.LocalGeometry(space, idx, hidx)
+        # The boundary value is looked up with `nothing` in place of a
+        # vertical index.
+        Ju³[slot] = Geometry.Jcontravariant3(
+            Operators.getidx(space, bc.val, loc, nothing, hidx),
+            geom,
+        )
+    end
+    return nothing
+end
+
+# Fill this thread's shmem slot for `DivergenceF2C` at the right boundary face
+# when a `SetValue` boundary condition applies. The stored value comes from
+# `bc.val` (not from `arg`), converted with `Geometry.Jcontravariant3` using
+# the local geometry at `(idx, hidx)`.
+#
+# Errors with "Incorrect right idx" unless `idx` equals
+# `Operators.right_face_boundary_idx(space)`. Returns `nothing`.
+Base.@propagate_inbounds function fd_operator_fill_shmem_right_boundary!(
+    op::Operators.DivergenceF2C,
+    bc::Operators.SetValue,
+    Ju³,
+    loc,
+    space,
+    idx::Utilities.PlusHalf,
+    hidx,
+    arg,
+)
+    if !(idx == Operators.right_face_boundary_idx(space))
+        error("Incorrect right idx")
+    end
+    @inbounds begin
+        # `idx` is used exactly as passed and the value lands in this thread's
+        # own slot; no index shift is applied in this method.
+        slot = threadIdx().x
+        geom = Geometry.LocalGeometry(space, idx, hidx)
+        # The boundary value is looked up with `nothing` in place of a
+        # vertical index.
+        Ju³[slot] = Geometry.Jcontravariant3(
+            Operators.getidx(space, bc.val, loc, nothing, hidx),
+            geom,
+        )
+    end
+    return nothing
+end
+
+# Evaluate `DivergenceF2C` at integer index `idx` from the shmem buffer `Ju³`:
+# the difference of the entries at this thread's slot and the next one,
+# multiplied by `invJ` of the local geometry at `(idx, hidx)`.
+# `loc` and `args` are not read in the body.
+Base.@propagate_inbounds function fd_operator_evaluate(
+    op::Operators.DivergenceF2C,
+    Ju³,
+    loc,
+    space,
+    idx::Integer,
+    hidx,
+    args...,
+)
+    @inbounds begin
+        slot = threadIdx().x
+        geom = Geometry.LocalGeometry(space, idx, hidx)
+        lower = Ju³[slot]     # corresponds to idx - half
+        upper = Ju³[slot + 1] # corresponds to idx + half
+        return (upper ⊟ lower) ⊠ geom.invJ
+    end
+end