Skip to content

Commit 0c30ae5

Browse files
Merge pull request #1663 from CliMA/ck/try_cuda_launch_config
Define and use `auto_launch!`
2 parents ca24ff9 + 33ce411 commit 0c30ae5

13 files changed

+328
-78
lines changed

ext/ClimaCoreCUDAExt.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import ClimaCore.Utilities: half
1313
import ClimaCore.RecursiveApply:
1414
⊞, ⊠, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
1515

16+
include(joinpath("cuda", "cuda_utils.jl"))
1617
include(joinpath("cuda", "data_layouts.jl"))
1718
include(joinpath("cuda", "fields.jl"))
1819
include(joinpath("cuda", "topologies_dss.jl"))
@@ -23,8 +24,8 @@ include(joinpath("cuda", "remapping_interpolate_array.jl"))
2324
include(joinpath("cuda", "limiters.jl"))
2425
include(joinpath("cuda", "operators_sem_shmem.jl"))
2526
include(joinpath("cuda", "operators_thomas_algorithm.jl"))
27+
include(joinpath("cuda", "matrix_fields_single_field_solve.jl"))
2628
include(joinpath("cuda", "matrix_fields_multiple_field_solve.jl"))
2729
include(joinpath("cuda", "operators_spectral_element.jl"))
28-
include(joinpath("cuda", "matrix_fields_single_field_solve.jl"))
2930

3031
end

ext/cuda/cuda_utils.jl

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import CUDA
2+
import ClimaCore.Fields
3+
import ClimaCore.DataLayouts
4+
5+
"""
    get_n_items(x)

Return the total number of items (scalar entries) in `x`,
used to size kernel launches. Supported inputs: a `Fields.Field`,
a `DataLayouts.AbstractData`, an `AbstractArray` (counted via its
`parent` array), or a size tuple.
"""
get_n_items(field::Fields.Field) =
    length(parent(Fields.field_values(field)))
get_n_items(data::DataLayouts.AbstractData) = length(parent(data))
get_n_items(arr::AbstractArray) = length(parent(arr))
get_n_items(tup::Tuple) = prod(tup)
10+
11+
"""
    auto_launch!(f!::F!, args,
        ::Union{
            Int,
            NTuple{N, <:Int},
            AbstractArray,
            AbstractData,
            Field,
        };
        threads_s,
        blocks_s,
        always_inline = true
    )

Launch a cuda kernel, using `CUDA.launch_configuration`
to determine the number of threads/blocks.

Suggested threads and blocks (`threads_s`, `blocks_s`) can be given
to benchmark compare against auto-determined threads/blocks.
"""
function auto_launch!(
    f!::F!,
    args,
    data;
    threads_s,
    blocks_s,
    always_inline = true,
) where {F!}
    # For now, we'll simply use the
    # suggested threads and blocks:
    CUDA.@cuda always_inline = always_inline threads = threads_s blocks =
        blocks_s f!(args...)

    # Soon, we'll experiment with `CUDA.launch_configuration`.
    # NOTE: `nitems` is only needed by the auto configuration below,
    # so it is not computed on every launch (it was previously dead
    # work in the hot path).
    # nitems = get_n_items(data)
    # kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
    # config = CUDA.launch_configuration(kernel.fun)
    # threads = min(nitems, config.threads)
    # blocks = cld(nitems, threads)
    # s = ""
    # s *= "Launching kernel $f! with following config:\n"
    # s *= "  nitems: $nitems\n"
    # s *= "  threads: $threads\n"
    # s *= "  blocks: $blocks\n"
    # @info s
    # kernel(args...; threads, blocks) # This knows to use always_inline from above.
end
58+
59+
"""
    kernel_indexes(n)

Return a tuple of indexes from the kernel,
where `n` is a tuple of max lengths along each
dimension of the accessed data.

Threads whose global linear index falls outside
`1:prod(n)` receive a tuple of `-1`s, which callers
should reject via `valid_range`.
"""
function kernel_indexes(n::Tuple)
    # Global 1-based linear thread index across all blocks.
    tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
    inds = if 1 ≤ tidx ≤ prod(n)
        # Convert the linear index to per-dimension Cartesian indexes.
        CartesianIndices(map(x -> Base.OneTo(x), n))[tidx].I
    else
        # `map` over the tuple `n` yields a fixed-length NTuple{N, Int},
        # keeping this branch type-stable on device (unlike
        # `ntuple(f, length(n))`, whose length is a runtime value).
        map(_ -> -1, n)
    end
    return inds
end
74+
75+
"""
    valid_range(inds, n)

Returns a `Bool` indicating if the thread index
is in the valid range, based on `inds` (the result
of `kernel_indexes`) and `n`, a tuple of max lengths
along each dimension of the accessed data.
```julia
function kernel!(data, n)
    inds = kernel_indexes(n)
    if valid_range(inds, n)
        do_work!(data[inds...])
    end
end
```
"""
valid_range(inds::NTuple, n::Tuple) = all(i -> 1 ≤ inds[i] ≤ n[i], 1:length(n))
# Convenience method: compute this thread's indexes and validate them.
# Delegates to the two-argument method to avoid duplicating the range check.
valid_range(n::Tuple) = valid_range(kernel_indexes(n), n)

ext/cuda/data_layouts.jl

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,12 @@ function Base.copyto!(
5656
) where {S, Nij, A <: CUDA.CuArray}
5757
_, _, _, _, Nh = size(bc)
5858
if Nh > 0
59-
CUDA.@cuda always_inline = true threads = (Nij, Nij) blocks = (Nh, 1) knl_copyto!(
60-
dest,
61-
bc,
59+
auto_launch!(
60+
knl_copyto!,
61+
(dest, bc),
62+
dest;
63+
threads_s = (Nij, Nij),
64+
blocks_s = (Nh, 1),
6265
)
6366
end
6467
return dest
@@ -73,9 +76,12 @@ function Base.fill!(
7376
}
7477
_, _, _, _, Nh = size(dest)
7578
if Nh > 0
76-
CUDA.@cuda always_inline = true threads = (Nij, Nij) blocks = (Nh, 1) knl_fill!(
77-
dest,
78-
val,
79+
auto_launch!(
80+
knl_fill!,
81+
(dest, val),
82+
dest;
83+
threads_s = (Nij, Nij),
84+
blocks_s = (Nh, 1),
7985
)
8086
end
8187
return dest
@@ -91,8 +97,13 @@ function Base.copyto!(
9197
if Nv > 0 && Nh > 0
9298
Nv_per_block = min(Nv, fld(256, Nij * Nij))
9399
Nv_blocks = cld(Nv, Nv_per_block)
94-
CUDA.@cuda always_inline = true threads = (Nij, Nij, Nv_per_block) blocks =
95-
(Nh, Nv_blocks) knl_copyto!(dest, bc)
100+
auto_launch!(
101+
knl_copyto!,
102+
(dest, bc),
103+
dest;
104+
threads_s = (Nij, Nij, Nv_per_block),
105+
blocks_s = (Nh, Nv_blocks),
106+
)
96107
end
97108
return dest
98109
end
@@ -104,8 +115,13 @@ function Base.fill!(
104115
if Nv > 0 && Nh > 0
105116
Nv_per_block = min(Nv, fld(256, Nij * Nij))
106117
Nv_blocks = cld(Nv, Nv_per_block)
107-
CUDA.@cuda always_inline = true threads = (Nij, Nij, Nv_per_block) blocks =
108-
(Nh, Nv_blocks) knl_fill!(dest, val)
118+
auto_launch!(
119+
knl_fill!,
120+
(dest, val),
121+
dest;
122+
threads_s = (Nij, Nij, Nv_per_block),
123+
blocks_s = (Nh, Nv_blocks),
124+
)
109125
end
110126
return dest
111127
end
@@ -117,19 +133,25 @@ function Base.copyto!(
117133
) where {S, A <: CUDA.CuArray}
118134
_, _, _, Nv, Nh = size(bc)
119135
if Nv > 0 && Nh > 0
120-
CUDA.@cuda always_inline = true threads = (1, 1) blocks = (Nh, Nv) knl_copyto!(
121-
dest,
122-
bc,
136+
auto_launch!(
137+
knl_copyto!,
138+
(dest, bc),
139+
dest;
140+
threads_s = (1, 1),
141+
blocks_s = (Nh, Nv),
123142
)
124143
end
125144
return dest
126145
end
127146
function Base.fill!(dest::VF{S, A}, val) where {S, A <: CUDA.CuArray}
128147
_, _, _, Nv, Nh = size(dest)
129148
if Nv > 0 && Nh > 0
130-
CUDA.@cuda always_inline = true threads = (1, 1) blocks = (Nh, Nv) knl_fill!(
131-
dest,
132-
val,
149+
auto_launch!(
150+
knl_fill!,
151+
(dest, val),
152+
dest;
153+
threads_s = (1, 1),
154+
blocks_s = (Nh, Nv),
133155
)
134156
end
135157
return dest
@@ -139,16 +161,22 @@ function Base.copyto!(
139161
dest::DataF{S},
140162
bc::Union{DataF{S, A}, Base.Broadcast.Broadcasted{DataFStyle{A}}},
141163
) where {S, A <: CUDA.CuArray}
142-
CUDA.@cuda always_inline = true threads = (1, 1) blocks = (1, 1) knl_copyto!(
143-
dest,
144-
bc,
164+
auto_launch!(
165+
knl_copyto!,
166+
(dest, bc),
167+
dest;
168+
threads_s = (1, 1),
169+
blocks_s = (1, 1),
145170
)
146171
return dest
147172
end
148173
function Base.fill!(dest::DataF{S, A}, val) where {S, A <: CUDA.CuArray}
149-
CUDA.@cuda always_inline = true threads = (1, 1) blocks = (1, 1) knl_fill!(
150-
dest,
151-
val,
174+
auto_launch!(
175+
knl_fill!,
176+
(dest, val),
177+
dest;
178+
threads_s = (1, 1),
179+
blocks_s = (1, 1),
152180
)
153181
return dest
154182
end

ext/cuda/limiters.jl

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ function compute_element_bounds!(
3131
(Ni, Nj, _, Nv, Nh) = S
3232
nthreads, nblocks = config_threadblock(Nv, Nh)
3333

34-
CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks compute_element_bounds_kernel!(
34+
args = (
3535
limiter,
3636
Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
3737
Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
@@ -40,6 +40,13 @@ function compute_element_bounds!(
4040
Val(Ni),
4141
Val(Nj),
4242
)
43+
auto_launch!(
44+
compute_element_bounds_kernel!,
45+
args,
46+
ρ;
47+
threads_s = nthreads,
48+
blocks_s = nblocks,
49+
)
4350
return nothing
4451
end
4552

@@ -87,7 +94,7 @@ function compute_neighbor_bounds_local!(
8794
topology = Spaces.topology(axes(ρ))
8895
Ni, Nj, _, Nv, Nh = size(Fields.field_values(ρ))
8996
nthreads, nblocks = config_threadblock(Nv, Nh)
90-
CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks compute_neighbor_bounds_local_kernel!(
97+
args = (
9198
limiter,
9299
topology.local_neighbor_elem,
93100
topology.local_neighbor_elem_offset,
@@ -96,6 +103,13 @@ function compute_neighbor_bounds_local!(
96103
Val(Ni),
97104
Val(Nj),
98105
)
106+
auto_launch!(
107+
compute_neighbor_bounds_local_kernel!,
108+
args,
109+
ρ;
110+
threads_s = nthreads,
111+
blocks_s = nblocks,
112+
)
99113
end
100114

101115
function compute_neighbor_bounds_local_kernel!(
@@ -140,7 +154,7 @@ function apply_limiter!(
140154
maxiter = Ni * Nj
141155
WJ = Spaces.local_geometry_data(axes(ρq)).WJ
142156
nthreads, nblocks = config_threadblock(Nv, Nh)
143-
CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks apply_limiter_kernel!(
157+
args = (
144158
limiter,
145159
Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
146160
Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
@@ -152,6 +166,13 @@ function apply_limiter!(
152166
Val(Nj),
153167
Val(maxiter),
154168
)
169+
auto_launch!(
170+
apply_limiter_kernel!,
171+
args,
172+
ρ;
173+
threads_s = nthreads,
174+
blocks_s = nblocks,
175+
)
155176
return nothing
156177
end
157178

ext/cuda/matrix_fields_multiple_field_solve.jl

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import ClimaComms
33
import LinearAlgebra: UniformScaling
44
import ClimaCore.Operators
55
import ClimaCore.MatrixFields
6+
import ClimaCore.MatrixFields: _single_field_solve!
67
import ClimaCore.MatrixFields: multiple_field_solve!
78
import ClimaCore.MatrixFields: is_CuArray_type
89
import ClimaCore.MatrixFields: allow_scalar_func
@@ -30,11 +31,16 @@ function multiple_field_solve!(::ClimaComms.CUDADevice, cache, x, A, b, x1)
3031
tups = (cache_tup, x_tup, A_tup, b_tup)
3132

3233
device = ClimaComms.device(x[first(names)])
33-
CUDA.@cuda threads = nthreads blocks = nblocks multiple_field_solve_kernel!(
34-
device,
35-
tups,
36-
x1,
37-
Val(Nnames),
34+
35+
args = (device, tups, x1, Val(Nnames))
36+
# TODO: use always_inline=true
37+
auto_launch!(
38+
multiple_field_solve_kernel!,
39+
args,
40+
x1;
41+
threads_s = nthreads,
42+
blocks_s = nblocks,
43+
always_inline = false,
3844
)
3945
end
4046

ext/cuda/operators_finite_difference.jl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ function Base.copyto!(
3131
max_threads = 256
3232
nitems = Nv * Nq * Nq * Nh # # of independent items
3333
(nthreads, nblocks) = _configure_threadblock(max_threads, nitems)
34-
@cuda always_inline = true threads = (nthreads,) blocks = (nblocks,) copyto_stencil_kernel!(
34+
args = (
3535
strip_space(out, space),
3636
strip_space(bc, space),
3737
axes(out),
@@ -40,6 +40,13 @@ function Base.copyto!(
4040
Nh,
4141
Nv,
4242
)
43+
auto_launch!(
44+
copyto_stencil_kernel!,
45+
args,
46+
out;
47+
threads_s = (nthreads,),
48+
blocks_s = (nblocks,),
49+
)
4350
return out
4451
end
4552

0 commit comments

Comments
 (0)