CliMA
diff --git a/‎Project.toml
Lines changed: 2 additions & 1 deletion b/‎Project.toml
Lines changed: 2 additions & 1 deletion
diff --git a/‎ext/ClimaCoreCUDAExt.jl
Lines changed: 21 additions & 0 deletions b/‎ext/ClimaCoreCUDAExt.jl
Lines changed: 21 additions & 0 deletions
diff --git a/‎src/DataLayouts/cuda.jl renamed to ‎ext/cuda/data_layouts.jl
Lines changed: 5 additions & 27 deletions b/‎src/DataLayouts/cuda.jl renamed to ‎ext/cuda/data_layouts.jl
Lines changed: 5 additions & 27 deletions
diff --git a/‎src/Fields/mapreduce_cuda.jl renamed to ‎ext/cuda/fields.jl
Lines changed: 33 additions & 1 deletion b/‎src/Fields/mapreduce_cuda.jl renamed to ‎ext/cuda/fields.jl
Lines changed: 33 additions & 1 deletion
diff --git a/‎ext/cuda/limiters.jl
Lines changed: 276 additions & 0 deletions b/‎ext/cuda/limiters.jl
Lines changed: 276 additions & 0 deletions
@@ -7,7 +7,6 @@ version = "0.13.4"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ClimaComms = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
 CubedSphere = "7445602f-e544-4518-8976-18f8e8ae6cdb"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
@@ -32,9 +31,11 @@ Unrolled = "9602ed7d-8fef-5bc8-8597-8f21381861e8"
 
 [weakdeps]
 Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
 KrylovExt = "Krylov"
+ClimaCoreCUDAExt = "CUDA"
 
 [compat]
 Adapt = "3, 4"
 
@@ -0,0 +1,21 @@
+module ClimaCoreCUDAExt # CUDA-specific code for ClimaCore; Project.toml lists this module under [extensions] with CUDA as its trigger
+
+import ClimaComms
+import ClimaCore: DataLayouts, Grids, Spaces, Fields
+import CUDA
+
+include(joinpath("cuda", "data_layouts.jl")) # each include evaluates one file from the cuda/ subdirectory inside this module
+include(joinpath("cuda", "fields.jl")) # NOTE(review): fields.jl calls Adapt.adapt_structure; the visible `import Adapt` is in data_layouts.jl, so this ordering may be load-bearing - confirm
+include(joinpath("cuda", "topologies_dss.jl"))
+include(joinpath("cuda", "operators_finite_difference.jl"))
+include(joinpath("cuda", "remapping_distributed.jl"))
+include(joinpath("cuda", "operators_integral.jl"))
+include(joinpath("cuda", "remapping_interpolate_array.jl"))
+include(joinpath("cuda", "limiters.jl")) # NOTE(review): limiters.jl references Operators, slab, rdiv/rmin/rmax and blockIdx/threadIdx without importing them itself - presumably provided by earlier includes; verify
+include(joinpath("cuda", "operators_sem_shmem.jl"))
+include(joinpath("cuda", "operators_thomas_algorithm.jl"))
+include(joinpath("cuda", "matrix_fields_multiple_field_solve.jl"))
+include(joinpath("cuda", "operators_spectral_element.jl"))
+include(joinpath("cuda", "matrix_fields_single_field_solve.jl"))
+
+end # module ClimaCoreCUDAExt
@@ -1,33 +1,11 @@
+
+import ClimaCore.DataLayouts: IJKFVH, IJFH, VIJFH, VIFH, IFH, IJF, IF, VF, DataF
+import ClimaCore.DataLayouts: IJFHStyle, VIJFHStyle, VFStyle, DataFStyle
+import ClimaCore.DataLayouts: promote_parent_array_type
+import ClimaCore.DataLayouts: parent_array_type
 import Adapt
 import CUDA
 
-Adapt.adapt_structure(to, data::IJKFVH{S, Nij, Nk}) where {S, Nij, Nk} =
-    IJKFVH{S, Nij, Nk}(Adapt.adapt(to, parent(data)))
-
-Adapt.adapt_structure(to, data::IJFH{S, Nij}) where {S, Nij} =
-    IJFH{S, Nij}(Adapt.adapt(to, parent(data)))
-
-Adapt.adapt_structure(to, data::VIJFH{S, Nij}) where {S, Nij} =
-    VIJFH{S, Nij}(Adapt.adapt(to, parent(data)))
-
-Adapt.adapt_structure(to, data::VIFH{S, Ni, A}) where {S, Ni, A} =
-    VIFH{S, Ni}(Adapt.adapt(to, parent(data)))
-
-Adapt.adapt_structure(to, data::IFH{S, Ni}) where {S, Ni} =
-    IFH{S, Ni}(Adapt.adapt(to, parent(data)))
-
-Adapt.adapt_structure(to, data::IJF{S, Nij}) where {S, Nij} =
-    IJF{S, Nij}(Adapt.adapt(to, parent(data)))
-
-Adapt.adapt_structure(to, data::IF{S, Ni}) where {S, Ni} =
-    IF{S, Ni}(Adapt.adapt(to, parent(data)))
-
-Adapt.adapt_structure(to, data::VF{S}) where {S} =
-    VF{S}(Adapt.adapt(to, parent(data)))
-
-Adapt.adapt_structure(to, data::DataF{S}) where {S} =
-    DataF{S}(Adapt.adapt(to, parent(data)))
-
 # Map any `CuArray` type with element type `T` and buffer parameter `B` to the
 # corresponding `CuArray` type with the dimension parameter `N` left free.
 function parent_array_type(
     ::Type{<:CUDA.CuArray{T, N, B} where {N}},
 ) where {T, B}
     return CUDA.CuArray{T, N, B} where {N}
 end
 
 
@@ -1,4 +1,10 @@
-
+import ClimaComms
+using CUDA: @cuda
+import LinearAlgebra, Statistics
+import ClimaCore: DataLayouts, Spaces, Grids, Fields
+import ClimaCore.Fields: Field, FieldStyle
+import ClimaCore.Fields: AbstractFieldStyle
+import ClimaCore.Spaces: cuda_synchronize
 function Base.sum(
     field::Union{Field, Base.Broadcast.Broadcasted{<:FieldStyle}},
     ::ClimaComms.CUDADevice,
@@ -280,3 +286,29 @@ end
     newsize = _cuda_reduce!(op, reduction, tidx, newsize, 1)
     return nothing
 end
+
+
+# Adapt a field-style `Broadcasted` for use inside a CUDA kernel: the function,
+# the argument tuple and the axes are each passed through `Adapt.adapt`, and a
+# new `Broadcasted` carrying the same `Style` is built from the results.
+function Adapt.adapt_structure(
+    to::CUDA.KernelAdaptor,
+    bc::Base.Broadcast.Broadcasted{Style},
+) where {Style <: AbstractFieldStyle}
+    adapted_f = Adapt.adapt(to, bc.f)
+    adapted_args = Adapt.adapt(to, bc.args)
+    adapted_axes = Adapt.adapt(to, bc.axes)
+    return Base.Broadcast.Broadcasted{Style}(
+        adapted_f,
+        adapted_args,
+        adapted_axes,
+    )
+end
+
+# Variant for a field-style `Broadcasted` whose function is a type `T` (i.e. a
+# constructor call). The type object is replaced by a closure that forwards its
+# arguments to `T`; only the arguments are passed through `Adapt.adapt`, while
+# the axes are reused unchanged.
+function Adapt.adapt_structure(
+    to::CUDA.KernelAdaptor,
+    bc::Base.Broadcast.Broadcasted{Style, <:Any, Type{T}},
+) where {Style <: AbstractFieldStyle, T}
+    construct = (x...) -> T(x...)
+    adapted_args = Adapt.adapt(to, bc.args)
+    return Base.Broadcast.Broadcasted{Style}(construct, adapted_args, bc.axes)
+end
+
+# CUDA-device method of `cuda_synchronize`: forwards every keyword argument to
+# `CUDA.synchronize`. The `device` argument is used for dispatch only.
+function cuda_synchronize(device::ClimaComms.CUDADevice; kwargs...)
+    return CUDA.synchronize(; kwargs...)
+end
@@ -0,0 +1,276 @@
+import ClimaCore.Limiters: QuasiMonotoneLimiter
+import ClimaCore.Fields
+
+"""
+    config_threadblock(Nv, Nh)
+
+Return the launch configuration `(nthreads, nblocks)` for `Nv * Nh` work items:
+at most 256 threads per block, and enough blocks to cover every item.
+"""
+function config_threadblock(Nv, Nh)
+    total_items = Nv * Nh
+    threads_per_block = min(256, total_items)
+    return (threads_per_block, cld(total_items, threads_per_block))
+end
+
+"""
+    get_hv(Nv, Nh, blockIdx, threadIdx, blockDim, gridDim)
+
+Convert the calling thread's position into an `(h, v)` index pair.
+
+The linear thread index is `(blockIdx.x - 1) * blockDim.x + threadIdx.x`; it is
+unravelled over the ranges `(1:Nh, 1:Nv)`, so `h` varies fastest. `gridDim` is
+accepted but not used.
+
+When the linear index exceeds `Nv * Nh` (surplus threads in the last block of a
+launch that was rounded up to a whole number of blocks), the pair
+`(Nh + 1, Nv + 1)` is returned instead of indexing out of bounds, so a caller's
+`h ≤ Nh && v ≤ Nv` check rejects the thread.
+"""
+function get_hv(Nv, Nh, blockIdx, threadIdx, blockDim, gridDim)
+    tidx = (blockIdx.x - 1) * blockDim.x + threadIdx.x
+    # Surplus threads must not index the CartesianIndices: hand back a pair
+    # that lies outside the valid range on both axes.
+    if tidx > Nv * Nh
+        return (Nh + 1, Nv + 1)
+    end
+    (h, v) = CartesianIndices((1:Nh, 1:Nv))[tidx].I
+    return (h, v)
+end
+
+"""
+    compute_element_bounds!(limiter::QuasiMonotoneLimiter, ρq, ρ, ::ClimaComms.CUDADevice)
+
+Launch `compute_element_bounds_kernel!` with the configuration returned by
+`config_threadblock(Nv, Nh)`.
+
+`Ni`, `Nj`, `Nv` and `Nh` are read from `size(Fields.field_values(ρ))`. Both
+fields go through `Operators.strip_space` and `Fields.field_values` before
+being handed to the kernel. Returns `nothing`.
+"""
+function compute_element_bounds!(
+    limiter::QuasiMonotoneLimiter,
+    ρq,
+    ρ,
+    ::ClimaComms.CUDADevice,
+)
+    # NOTE(review): this file imports only `QuasiMonotoneLimiter` from
+    # ClimaCore.Limiters - confirm this function name is imported elsewhere, so
+    # that this definition adds a method rather than creating a new function.
+    (Ni, Nj, _, Nv, Nh) = size(Fields.field_values(ρ))
+    (nthreads, nblocks) = config_threadblock(Nv, Nh)
+    ρq_values = Fields.field_values(Operators.strip_space(ρq, axes(ρq)))
+    ρ_values = Fields.field_values(Operators.strip_space(ρ, axes(ρ)))
+
+    CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks compute_element_bounds_kernel!(
+        limiter,
+        ρq_values,
+        ρ_values,
+        Nv,
+        Nh,
+        Val(Ni),
+        Val(Nj),
+    )
+    return nothing
+end
+
+
+"""
+    compute_element_bounds_kernel!(limiter, ρq, ρ, Nv, Nh, Val(Ni), Val(Nj))
+
+Kernel body: the calling thread obtains an `(h, v)` pair from `get_hv`, scans
+the `Ni × Nj` nodes of that slab, and stores the smallest and largest value of
+`rdiv(ρq, ρ)` in entries 1 and 2 of the matching slab of `limiter.q_bounds`.
+Threads whose pair fails `h ≤ Nh && v ≤ Nv` do nothing.
+"""
+function compute_element_bounds_kernel!(
+    limiter,
+    ρq,
+    ρ,
+    Nv,
+    Nh,
+    ::Val{Ni},
+    ::Val{Nj},
+) where {Ni, Nj}
+    (h, v) = get_hv(Nv, Nh, blockIdx(), threadIdx(), blockDim(), gridDim())
+    # Nothing to do for threads mapped outside the valid index range.
+    (h ≤ Nh && v ≤ Nv) || return nothing
+
+    q_bounds = limiter.q_bounds
+    local q_lo, q_hi
+    ρq_slab = slab(ρq, v, h)
+    ρ_slab = slab(ρ, v, h)
+    for j in 1:Nj, i in 1:Ni
+        q = rdiv(ρq_slab[i, j], ρ_slab[i, j])
+        if i != 1 || j != 1
+            q_lo = rmin(q_lo, q)
+            q_hi = rmax(q_hi, q)
+        else
+            # The first node seeds both running extrema.
+            q_lo = q
+            q_hi = q
+        end
+    end
+    bounds_slab = slab(q_bounds, v, h)
+    bounds_slab[1] = q_lo
+    bounds_slab[2] = q_hi
+    return nothing
+end
+
+
+"""
+    compute_neighbor_bounds_local!(limiter::QuasiMonotoneLimiter, ρ, ::ClimaComms.CUDADevice)
+
+Launch `compute_neighbor_bounds_local_kernel!` with the configuration returned
+by `config_threadblock(Nv, Nh)`.
+
+The kernel receives `local_neighbor_elem` and `local_neighbor_elem_offset` from
+the topology of `axes(ρ)`; the sizes `Ni`, `Nj`, `Nv`, `Nh` are read from
+`size(Fields.field_values(ρ))`. Returns `nothing`.
+"""
+function compute_neighbor_bounds_local!(
+    limiter::QuasiMonotoneLimiter,
+    ρ,
+    ::ClimaComms.CUDADevice,
+)
+    topology = Spaces.topology(axes(ρ))
+    Ni, Nj, _, Nv, Nh = size(Fields.field_values(ρ))
+    nthreads, nblocks = config_threadblock(Nv, Nh)
+    CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks compute_neighbor_bounds_local_kernel!(
+        limiter,
+        topology.local_neighbor_elem,
+        topology.local_neighbor_elem_offset,
+        Nv,
+        Nh,
+        Val(Ni),
+        Val(Nj),
+    )
+    # Return `nothing` explicitly, as the other launchers in this file do,
+    # rather than handing the value of the `@cuda` expression to the caller.
+    return nothing
+end
+
+"""
+    compute_neighbor_bounds_local_kernel!(
+        limiter, local_neighbor_elem, local_neighbor_elem_offset,
+        Nv, Nh, Val(Ni), Val(Nj),
+    )
+
+Kernel body: the calling thread obtains an `(h, v)` pair from `get_hv`, starts
+from the bounds stored in `limiter.q_bounds` for that slab, widens them with
+the bounds of every element listed in
+`local_neighbor_elem[local_neighbor_elem_offset[h]:(local_neighbor_elem_offset[h + 1] - 1)]`
+(same `v`), and writes the result into entries 1 and 2 of the matching slab of
+`limiter.q_bounds_nbr`. Threads whose pair fails `h ≤ Nh && v ≤ Nv` do nothing.
+"""
+function compute_neighbor_bounds_local_kernel!(
+    limiter,
+    local_neighbor_elem,
+    local_neighbor_elem_offset,
+    Nv,
+    Nh,
+    ::Val{Ni},
+    ::Val{Nj},
+) where {Ni, Nj}
+
+    (h, v) = get_hv(Nv, Nh, blockIdx(), threadIdx(), blockDim(), gridDim())
+    if h ≤ Nh && v ≤ Nv
+        # Only the two bounds arrays are needed here.
+        (; q_bounds, q_bounds_nbr) = limiter
+        slab_q_bounds = slab(q_bounds, v, h)
+        q_min = slab_q_bounds[1]
+        q_max = slab_q_bounds[2]
+        for lne in
+            local_neighbor_elem_offset[h]:(local_neighbor_elem_offset[h + 1] - 1)
+            h_nbr = local_neighbor_elem[lne]
+            # Separate name so this element's own slab binding is not overwritten.
+            nbr_q_bounds = slab(q_bounds, v, h_nbr)
+            q_min = rmin(q_min, nbr_q_bounds[1])
+            q_max = rmax(q_max, nbr_q_bounds[2])
+        end
+        slab_q_bounds_nbr = slab(q_bounds_nbr, v, h)
+        slab_q_bounds_nbr[1] = q_min
+        slab_q_bounds_nbr[2] = q_max
+    end
+    return nothing
+end
+
+"""
+    apply_limiter!(ρq::Fields.Field, ρ::Fields.Field, limiter::QuasiMonotoneLimiter, ::ClimaComms.CUDADevice)
+
+Launch `apply_limiter_kernel!` with the configuration returned by
+`config_threadblock(Nv, Nh)`.
+
+Sizes and the component count `Nf` are read from `Fields.field_values(ρq)`; the
+`WJ` data come from `Spaces.local_geometry_data(axes(ρq))`; the iteration cap
+passed to the kernel is `Ni * Nj`. Returns `nothing`.
+"""
+function apply_limiter!(
+    ρq::Fields.Field,
+    ρ::Fields.Field,
+    limiter::QuasiMonotoneLimiter,
+    ::ClimaComms.CUDADevice,
+)
+    # NOTE(review): this file imports only `QuasiMonotoneLimiter` from
+    # ClimaCore.Limiters - confirm this function name is imported elsewhere, so
+    # that this definition adds a method rather than creating a new function.
+    ρq_values = Fields.field_values(ρq)
+    (Ni, Nj, _, Nv, Nh) = size(ρq_values)
+    Nf = DataLayouts.ncomponents(ρq_values)
+    maxiter = Ni * Nj
+    WJ = Spaces.local_geometry_data(axes(ρq)).WJ
+    (nthreads, nblocks) = config_threadblock(Nv, Nh)
+    stripped_ρq = Fields.field_values(Operators.strip_space(ρq, axes(ρq)))
+    stripped_ρ = Fields.field_values(Operators.strip_space(ρ, axes(ρ)))
+
+    CUDA.@cuda always_inline = true threads = nthreads blocks = nblocks apply_limiter_kernel!(
+        limiter,
+        stripped_ρq,
+        stripped_ρ,
+        WJ,
+        Nv,
+        Nh,
+        Val(Nf),
+        Val(Ni),
+        Val(Nj),
+        Val(maxiter),
+    )
+    return nothing
+end
+
+"""
+    apply_limiter_kernel!(
+        limiter::QuasiMonotoneLimiter, ρq_data, ρ_data, WJ_data,
+        Nv, Nh, Val(Nf), Val(Ni), Val(Nj), Val(maxiter),
+    )
+
+Kernel body: the calling thread obtains an `(h, v)` pair from `get_hv` and
+limits the `Nf` components of `ρq_data` on that slab in place. Threads whose
+pair fails `h ≤ Nh && v ≤ Nv` do nothing.
+
+For each component `f`:
+
+1. the bounds `q_min`, `q_max` are read from `limiter.q_bounds_nbr` and widened
+   so that they contain the slab average `∫ρq / ∫ρ` (sums weighted by `WJ_data`);
+2. every node with `ρq` outside `[ρ * q_min, ρ * q_max]` is clipped to the
+   violated bound, and the weighted mass moved by the clipping is accumulated;
+3. if that mass exceeds `limiter.rtol * abs(∫ρq)` in magnitude, it is spread,
+   in proportion to `ρ`, over the nodes that are strictly inside the bound on
+   the relevant side, and the clipping is repeated.
+
+At most `maxiter` clip/redistribute passes are made per component. If the
+tolerance is still not met after the last pass, the data are left as they are
+and nothing is reported.
+"""
+function apply_limiter_kernel!(
+    limiter::QuasiMonotoneLimiter,
+    ρq_data,
+    ρ_data,
+    WJ_data,
+    Nv,
+    Nh,
+    ::Val{Nf},
+    ::Val{Ni},
+    ::Val{Nj},
+    ::Val{maxiter},
+) where {Nf, Ni, Nj, maxiter}
+    (; q_bounds_nbr, rtol) = limiter
+    (h, v) = get_hv(Nv, Nh, blockIdx(), threadIdx(), blockDim(), gridDim())
+    if h ≤ Nh && v ≤ Nv
+
+        slab_ρ = slab(ρ_data, v, h)
+        slab_ρq = slab(ρq_data, v, h)
+        slab_WJ = slab(WJ_data, v, h)
+        slab_q_bounds = slab(q_bounds_nbr, v, h)
+
+        # Work on the underlying arrays; the last index selects the component.
+        array_ρq = parent(slab_ρq)
+        array_ρ = parent(slab_ρ)
+        array_w = parent(slab_WJ)
+        array_q_bounds = parent(slab_q_bounds)
+
+        # 1) compute ∫ρ
+        total_mass = zero(eltype(array_ρ))
+        for j in 1:Nj, i in 1:Ni
+            total_mass += array_ρ[i, j, 1] * array_w[i, j, 1]
+        end
+
+        @assert total_mass > 0
+
+        for f in 1:Nf
+            q_min = array_q_bounds[1, f]
+            q_max = array_q_bounds[2, f]
+
+            # 2) compute ∫ρq
+            tracer_mass = zero(eltype(array_ρq))
+            for j in 1:Nj, i in 1:Ni
+                tracer_mass += array_ρq[i, j, f] * array_w[i, j, 1]
+            end
+
+            # TODO: Should this condition be enforced? (It isn't in HOMME.)
+            # @assert tracer_mass >= 0
+
+            # 3) set bounds: widen them so that the slab average is admissible
+            q_avg = tracer_mass / total_mass
+            q_min = min(q_min, q_avg)
+            q_max = max(q_max, q_avg)
+
+            # 4) modify ρq
+            for _ in 1:maxiter
+                # Clip to the bounds and record the weighted mass that was moved.
+                Δtracer_mass = zero(eltype(array_ρq))
+                for j in 1:Nj, i in 1:Ni
+                    ρ = array_ρ[i, j, 1]
+                    ρq = array_ρq[i, j, f]
+                    ρq_max = ρ * q_max
+                    ρq_min = ρ * q_min
+                    w = array_w[i, j, 1]
+                    if ρq > ρq_max
+                        Δtracer_mass += (ρq - ρq_max) * w
+                        array_ρq[i, j, f] = ρq_max
+                    elseif ρq < ρq_min
+                        Δtracer_mass += (ρq - ρq_min) * w
+                        array_ρq[i, j, f] = ρq_min
+                    end
+                end
+
+                # Done with this component once the moved mass is negligible.
+                if abs(Δtracer_mass) <= rtol * abs(tracer_mass)
+                    break
+                end
+
+                if Δtracer_mass > 0 # add mass
+                    # Only nodes strictly below the upper bound can take more.
+                    total_mass_at_Δ_points = zero(eltype(array_ρ))
+                    for j in 1:Nj, i in 1:Ni
+                        ρ = array_ρ[i, j, 1]
+                        ρq = array_ρq[i, j, f]
+                        w = array_w[i, j, 1]
+                        if ρq < ρ * q_max
+                            total_mass_at_Δ_points += ρ * w
+                        end
+                    end
+                    Δq_at_Δ_points = Δtracer_mass / total_mass_at_Δ_points
+                    for j in 1:Nj, i in 1:Ni
+                        ρ = array_ρ[i, j, 1]
+                        ρq = array_ρq[i, j, f]
+                        if ρq < ρ * q_max
+                            array_ρq[i, j, f] += ρ * Δq_at_Δ_points
+                        end
+                    end
+                else # remove mass
+                    # Only nodes strictly above the lower bound can give some up.
+                    total_mass_at_Δ_points = zero(eltype(array_ρ))
+                    for j in 1:Nj, i in 1:Ni
+                        ρ = array_ρ[i, j, 1]
+                        ρq = array_ρq[i, j, f]
+                        w = array_w[i, j, 1]
+                        if ρq > ρ * q_min
+                            total_mass_at_Δ_points += ρ * w
+                        end
+                    end
+                    Δq_at_Δ_points = Δtracer_mass / total_mass_at_Δ_points
+                    for j in 1:Nj, i in 1:Ni
+                        ρ = array_ρ[i, j, 1]
+                        ρq = array_ρq[i, j, f]
+                        if ρq > ρ * q_min
+                            array_ρq[i, j, f] += ρ * Δq_at_Δ_points
+                        end
+                    end
+                end
+            end
+        end
+
+    end
+
+    return nothing
+end