Skip to content

Commit 13c6176

Browse files
Merge pull request #1845 from CliMA/ck/mapreduce_data
Move mapreduce logic to DataLayouts layer
2 parents 67f0683 + f813978 commit 13c6176

File tree

7 files changed

+262
-237
lines changed

7 files changed

+262
-237
lines changed

ext/ClimaCoreCUDAExt.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import CUDA
99
using CUDA
1010
using CUDA: threadIdx, blockIdx, blockDim
1111
import StaticArrays: SVector, SMatrix, SArray
12+
import ClimaCore.DataLayouts: mapreduce_cuda
1213
import ClimaCore.DataLayouts: slab, column
1314
import ClimaCore.Utilities: half
1415
import ClimaCore.Utilities: cart_ind, linear_ind

ext/cuda/data_layouts.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Base.similar(
2727

2828
include("data_layouts_fill.jl")
2929
include("data_layouts_copyto.jl")
30+
include("data_layouts_mapreduce.jl")
3031

3132
Base.@propagate_inbounds function rcopyto_at!(
3233
pair::Pair{<:AbstractData, <:Any},

ext/cuda/data_layouts_mapreduce.jl

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
# To implement a single flexible mapreduce, let's define
2+
# a `OnesArray` that has nothing, and always returns 1:
3+
struct OnesArray{T, N} <: AbstractArray{T, N} end
4+
OnesArray(x::AbstractArray) = OnesArray{eltype(x), ndims(x)}()
5+
Base.@propagate_inbounds Base.getindex(::OnesArray, inds...) = 1
6+
Base.parent(x::OnesArray) = x
7+
8+
function mapreduce_cuda(
9+
f,
10+
op,
11+
data::DataLayouts.DataF;
12+
weighted_jacobian = OnesArray(parent(data)),
13+
opargs...,
14+
)
15+
pdata = parent(data)
16+
S = eltype(data)
17+
return DataLayouts.DataF{S}(Array(Array(f(pdata))[1, :]))
18+
end
19+
20+
function mapreduce_cuda(
21+
f,
22+
op,
23+
data::Union{DataLayouts.VF, DataLayouts.IJFH, DataLayouts.VIJFH};
24+
weighted_jacobian = OnesArray(parent(data)),
25+
opargs...,
26+
)
27+
S = eltype(data)
28+
pdata = parent(data)
29+
T = eltype(pdata)
30+
(Ni, Nj, Nk, Nv, Nh) = size(data)
31+
Nf = div(length(pdata), prod(size(data))) # length of field dimension
32+
pwt = parent(weighted_jacobian)
33+
34+
nitems = Nv * Ni * Nj * Nk * Nh
35+
max_threads = 256# 512 1024
36+
nthreads = min(max_threads, nitems)
37+
# perform n ops during loading to shmem (this is a tunable parameter)
38+
n_ops_on_load = cld(nitems, nthreads) == 1 ? 0 : 7
39+
effective_blksize = nthreads * (n_ops_on_load + 1)
40+
nblocks = cld(nitems, effective_blksize)
41+
42+
reduce_cuda = CuArray{T}(undef, nblocks, Nf)
43+
shmemsize = nthreads
44+
# place each field on a different block
45+
@cuda always_inline = true threads = (nthreads) blocks = (nblocks, Nf) mapreduce_cuda_kernel!(
46+
reduce_cuda,
47+
f,
48+
op,
49+
pdata,
50+
pwt,
51+
n_ops_on_load,
52+
Val(shmemsize),
53+
)
54+
# reduce block data
55+
if nblocks > 1
56+
nthreads = min(32, nblocks)
57+
shmemsize = nthreads
58+
@cuda always_inline = true threads = (nthreads) blocks = (Nf) reduce_cuda_blocks_kernel!(
59+
reduce_cuda,
60+
op,
61+
Val(shmemsize),
62+
)
63+
end
64+
return DataLayouts.DataF{S}(Array(Array(reduce_cuda)[1, :]))
65+
end
66+
67+
function mapreduce_cuda_kernel!(
68+
reduce_cuda::AbstractArray{T, 2},
69+
f,
70+
op,
71+
pdata::AbstractArray{T, N},
72+
pwt::AbstractArray{T, N},
73+
n_ops_on_load::Int,
74+
::Val{shmemsize},
75+
) where {T, N, shmemsize}
76+
blksize = blockDim().x
77+
nblk = gridDim().x
78+
tidx = threadIdx().x
79+
bidx = blockIdx().x
80+
fidx = blockIdx().y
81+
dataview = _dataview(pdata, fidx)
82+
effective_blksize = blksize * (n_ops_on_load + 1)
83+
gidx = _get_gidx(tidx, bidx, effective_blksize)
84+
reduction = CUDA.CuStaticSharedArray(T, shmemsize)
85+
reduction[tidx] = 0
86+
(Nv, Nij, Nf, Nh) = _get_dims(dataview)
87+
nitems = Nv * Nij * Nij * Nf * Nh
88+
89+
# load shmem
90+
if gidx nitems
91+
reduction[tidx] = f(dataview[gidx]) * pwt[gidx]
92+
for n_ops in 1:n_ops_on_load
93+
gidx2 = _get_gidx(tidx + blksize * n_ops, bidx, effective_blksize)
94+
if gidx2 nitems
95+
reduction[tidx] =
96+
op(reduction[tidx], f(dataview[gidx2]) * pwt[gidx2])
97+
end
98+
end
99+
end
100+
sync_threads()
101+
_cuda_intrablock_reduce!(op, reduction, tidx, blksize)
102+
103+
tidx == 1 && (reduce_cuda[bidx, fidx] = reduction[1])
104+
return nothing
105+
end
106+
107+
@inline function _get_gidx(tidx, bidx, effective_blksize)
108+
return tidx + (bidx - 1) * effective_blksize
109+
end
110+
# for VF DataLayout
111+
@inline function _get_dims(pdata::AbstractArray{FT, 2}) where {FT}
112+
(Nv, Nf) = size(pdata)
113+
return (Nv, 1, Nf, 1)
114+
end
115+
@inline _dataview(pdata::AbstractArray{FT, 2}, fidx) where {FT} =
116+
view(pdata, :, fidx:fidx)
117+
118+
# for IJFH DataLayout
119+
@inline function _get_dims(pdata::AbstractArray{FT, 4}) where {FT}
120+
(Nij, _, Nf, Nh) = size(pdata)
121+
return (1, Nij, Nf, Nh)
122+
end
123+
@inline _dataview(pdata::AbstractArray{FT, 4}, fidx) where {FT} =
124+
view(pdata, :, :, fidx:fidx, :)
125+
126+
# for VIJFH DataLayout
127+
@inline function _get_dims(pdata::AbstractArray{FT, 5}) where {FT}
128+
(Nv, Nij, _, Nf, Nh) = size(pdata)
129+
return (Nv, Nij, Nf, Nh)
130+
end
131+
@inline _dataview(pdata::AbstractArray{FT, 5}, fidx) where {FT} =
132+
view(pdata, :, :, :, fidx:fidx, :)
133+
134+
@inline function _cuda_reduce!(op, reduction, tidx, reduction_size, N)
135+
if reduction_size > N
136+
if tidx reduction_size - N
137+
@inbounds reduction[tidx] = op(reduction[tidx], reduction[tidx + N])
138+
end
139+
N > 32 && sync_threads()
140+
return N
141+
end
142+
return reduction_size
143+
end
144+
145+
function reduce_cuda_blocks_kernel!(
146+
reduce_cuda::AbstractArray{T, 2},
147+
op,
148+
::Val{shmemsize},
149+
) where {T, shmemsize}
150+
blksize = blockDim().x
151+
fidx = blockIdx().x
152+
tidx = threadIdx().x
153+
nitems = size(reduce_cuda, 1)
154+
nloads = cld(nitems, blksize) - 1
155+
reduction = CUDA.CuStaticSharedArray(T, shmemsize)
156+
157+
reduction[tidx] = reduce_cuda[tidx, fidx]
158+
159+
for i in 1:nloads
160+
idx = tidx + blksize * i
161+
if idx nitems
162+
reduction[tidx] = op(reduction[tidx], reduce_cuda[idx, fidx])
163+
end
164+
end
165+
166+
blksize > 32 && sync_threads()
167+
_cuda_intrablock_reduce!(op, reduction, tidx, blksize)
168+
169+
tidx == 1 && (reduce_cuda[1, fidx] = reduction[1])
170+
return nothing
171+
end
172+
173+
@inline function _cuda_intrablock_reduce!(op, reduction, tidx, blksize)
174+
# assumes max_threads ≤ 1024 which is the current max on any CUDA device
175+
newsize = _cuda_reduce!(op, reduction, tidx, blksize, 512)
176+
newsize = _cuda_reduce!(op, reduction, tidx, newsize, 256)
177+
newsize = _cuda_reduce!(op, reduction, tidx, newsize, 128)
178+
newsize = _cuda_reduce!(op, reduction, tidx, newsize, 64)
179+
newsize = _cuda_reduce!(op, reduction, tidx, newsize, 32)
180+
newsize = _cuda_reduce!(op, reduction, tidx, newsize, 16)
181+
newsize = _cuda_reduce!(op, reduction, tidx, newsize, 8)
182+
newsize = _cuda_reduce!(op, reduction, tidx, newsize, 4)
183+
newsize = _cuda_reduce!(op, reduction, tidx, newsize, 2)
184+
newsize = _cuda_reduce!(op, reduction, tidx, newsize, 1)
185+
return nothing
186+
end

0 commit comments

Comments
 (0)