Skip to content

Commit 5dd64c4

Browse files
Revert most of 1969
1 parent 3e95814 commit 5dd64c4

File tree

5 files changed

+108
-17
lines changed

5 files changed

+108
-17
lines changed

ext/cuda/data_layouts_copyto.jl

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,90 @@
11
"""
    DataLayouts.device_dispatch(x::CUDA.CuArray)

Return the `ToCUDA()` dispatch tag when the argument is a `CUDA.CuArray`.
"""
function DataLayouts.device_dispatch(x::CUDA.CuArray)
    return ToCUDA()
end
22

3+
##### Multi-dimensional launch configuration kernels
4+
5+
"""
    knl_copyto!(dest, src)

CUDA kernel that copies a single element of `src` into `dest` per thread.

The five-component index is assembled from the launch geometry:
`threadIdx().x` and `threadIdx().y` supply the first two components, the third
is fixed to `1`, the fourth is `blockDim().z * (blockIdx().y - 1) + threadIdx().z`,
and the fifth is `blockIdx().x`. A thread whose fourth component exceeds
`size(dest, 4)` performs no copy. Always returns `nothing`.
"""
function knl_copyto!(dest, src)
    tid = CUDA.threadIdx()
    bid = CUDA.blockIdx()

    # Fourth index component: offset of this block along y, plus the thread's z slot.
    level = CUDA.blockDim().z * (bid.y - 1) + tid.z

    # Surplus threads in the last block along y have nothing to copy.
    level <= size(dest, 4) || return nothing

    idx = CartesianIndex((tid.x, tid.y, 1, level, bid.x))
    @inbounds dest[idx] = src[idx]
    return nothing
end
19+
20+
"""
    Base.copyto!(dest::IJFH, bc::BroadcastedUnionIJFH, ::ToCUDA)

Copy the broadcast expression `bc` into `dest` by launching `knl_copyto!`
through `auto_launch!` with `threads_s = (Nij, Nij)` and `blocks_s = (Nh, 1)`.

No kernel is launched unless `Nh > 0`. Returns `dest`.
"""
function Base.copyto!(
    dest::IJFH{S, Nij, Nh},
    bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
    ::ToCUDA,
) where {S, Nij, Nh}
    # Nothing to copy for an empty layout.
    Nh > 0 || return dest

    thread_shape = (Nij, Nij)
    block_shape = (Nh, 1)
    auto_launch!(
        knl_copyto!,
        (dest, bc);
        threads_s = thread_shape,
        blocks_s = block_shape,
    )
    return dest
end
35+
36+
"""
    Base.copyto!(dest::VIJFH, bc::BroadcastedUnionVIJFH, ::ToCUDA)

Copy the broadcast expression `bc` into `dest` by launching `knl_copyto!`
through `auto_launch!`.

The launch uses `threads_s = (Nij, Nij, Nv_per_block)` and
`blocks_s = (Nh, Nv_blocks)`, where `Nv_per_block` is the number of `v`
indices handled by one block (bounded by a 256-thread budget, but never less
than one) and `Nv_blocks = cld(Nv, Nv_per_block)`.

No kernel is launched unless both `Nv > 0` and `Nh > 0`. Returns `dest`.
"""
function Base.copyto!(
    dest::VIJFH{S, Nv, Nij, Nh},
    bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
    ::ToCUDA,
) where {S, Nv, Nij, Nh}
    if Nv > 0 && Nh > 0
        # `fld(256, Nij * Nij)` is 0 once `Nij * Nij > 256`; without the clamp
        # `Nv_per_block` would be 0 and the `cld` below would throw a
        # `DivideError`. For `Nij * Nij <= 256` the clamp changes nothing.
        Nv_per_block = max(1, min(Nv, fld(256, Nij * Nij)))
        # Round up so a partially filled last block still covers the tail;
        # the kernel itself skips indices beyond `size(dest, 4)`.
        Nv_blocks = cld(Nv, Nv_per_block)
        auto_launch!(
            knl_copyto!,
            (dest, bc);
            threads_s = (Nij, Nij, Nv_per_block),
            blocks_s = (Nh, Nv_blocks),
        )
    end
    return dest
end
53+
54+
import ClimaCore.DataLayouts: isascalar
55+
"""
    knl_copyto_flat!(dest::AbstractData, bc, us)

CUDA kernel that copies one element of `bc` into `dest` per thread.

Each thread obtains `tidx` from `thread_index()`. When `tidx` does not exceed
`get_N(us)`, it is converted to an index of `dest` with
`kernel_indexes(tidx, size(dest))` and the corresponding element is copied.
Threads beyond that bound do nothing. Always returns `nothing`.
"""
function knl_copyto_flat!(dest::AbstractData, bc, us)
    @inbounds begin
        tidx = thread_index()
        # Bounds guard: the launch may contain more threads than items.
        if tidx <= get_N(us)
            n = size(dest)
            I = kernel_indexes(tidx, n)
            dest[I] = bc[I]
        end
    end
    return nothing
end
66+
67+
"""
    cuda_copyto!(dest::AbstractData, bc)

Copy the broadcast expression `bc` into `dest` by launching
`knl_copyto_flat!` through `auto_launch!`, with the item count taken as the
product of `DataLayouts.universal_size(dest)`.

No kernel is launched unless the third and fifth components of the universal
size (`Nv` and `Nh`) are both positive. Returns `dest`.
"""
function cuda_copyto!(dest::AbstractData, bc)
    dims = DataLayouts.universal_size(dest)
    (_, _, Nv, _, Nh) = dims
    us = DataLayouts.UniversalSize(dest)

    # Empty layouts need no kernel launch.
    (Nv > 0 && Nh > 0) || return dest

    nitems = prod(dims)
    auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
    return dest
end
76+
"""
    Base.copyto!(dest::IFH, bc::BroadcastedUnionIFH, ::ToCUDA)

Copy the broadcast expression `bc` into `dest` on the `ToCUDA` path by
forwarding to `cuda_copyto!`, and return its result.
"""
function Base.copyto!(
    dest::IFH{S, Ni, Nh},
    bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh},
    ::ToCUDA,
) where {S, Ni, Nh}
    return cuda_copyto!(dest, bc)
end
81+
"""
    Base.copyto!(dest::VIFH, bc::BroadcastedUnionVIFH, ::ToCUDA)

Copy the broadcast expression `bc` into `dest` on the `ToCUDA` path by
forwarding to `cuda_copyto!`, and return its result.
"""
function Base.copyto!(
    dest::VIFH{S, Nv, Ni, Nh},
    bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh},
    ::ToCUDA,
) where {S, Nv, Ni, Nh}
    return cuda_copyto!(dest, bc)
end
86+
#####
87+
388
function knl_copyto!(dest, src, us)
489
I = universal_index(dest)
590
if is_valid_index(dest, I, us)

ext/cuda/matrix_fields_multiple_field_solve.jl

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ NVTX.@annotate function multiple_field_solve!(
3939
nitems = Ni * Nj * Nh * Nnames
4040
threads = threads_via_occupancy(multiple_field_solve_kernel!, args)
4141
n_max_threads = min(threads, nitems)
42-
p = multiple_field_solve_partition(us, n_max_threads; Nnames)
42+
# p = multiple_field_solve_partition(us, n_max_threads; Nnames)
43+
p = linear_partition(nitems, n_max_threads)
4344

4445
auto_launch!(
4546
multiple_field_solve_kernel!,
@@ -89,9 +90,11 @@ function multiple_field_solve_kernel!(
8990
::Val{Nnames},
9091
) where {Nnames}
9192
@inbounds begin
92-
(I, iname) = multiple_field_solve_universal_index(us)
93-
if multiple_field_solve_is_valid_index(I, us)
94-
(i, j, _, _, h) = I.I
93+
(Ni, Nj, _, _, Nh) = size(Fields.field_values(x1))
94+
tidx = thread_index()
95+
n = (Ni, Nj, Nh, Nnames)
96+
if valid_range(tidx, prod(n))
97+
(i, j, h, iname) = kernel_indexes(tidx, n).I
9598
generated_single_field_solve!(
9699
device,
97100
caches,

ext/cuda/matrix_fields_single_field_solve.jl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ function single_field_solve!(device::ClimaComms.CUDADevice, cache, x, A, b)
2020
threads = threads_via_occupancy(single_field_solve_kernel!, args)
2121
nitems = Ni * Nj * Nh
2222
n_max_threads = min(threads, nitems)
23-
p = columnwise_partition(us, n_max_threads)
23+
p = linear_partition(nitems, n_max_threads)
2424
auto_launch!(
2525
single_field_solve_kernel!,
2626
args;
@@ -30,9 +30,10 @@ function single_field_solve!(device::ClimaComms.CUDADevice, cache, x, A, b)
3030
end
3131

3232
function single_field_solve_kernel!(device, cache, x, A, b, us)
33-
I = columnwise_universal_index(us)
34-
if columnwise_is_valid_index(I, us)
35-
(i, j, _, _, h) = I.I
33+
idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
34+
Ni, Nj, _, _, Nh = size(Fields.field_values(A))
35+
if idx <= Ni * Nj * Nh
36+
(i, j, h) = CartesianIndices((1:Ni, 1:Nj, 1:Nh))[idx].I
3637
_single_field_solve!(
3738
device,
3839
Spaces.column(cache, i, j, h),

ext/cuda/operators_integral.jl

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ function column_reduce_device!(
3333
nitems = Ni * Nj * Nh
3434
threads = threads_via_occupancy(bycolumn_kernel!, args)
3535
n_max_threads = min(threads, nitems)
36-
p = columnwise_partition(us, n_max_threads)
36+
p = linear_partition(nitems, n_max_threads)
3737
auto_launch!(
3838
bycolumn_kernel!,
3939
args;
@@ -67,7 +67,7 @@ function column_accumulate_device!(
6767
nitems = Ni * Nj * Nh
6868
threads = threads_via_occupancy(bycolumn_kernel!, args)
6969
n_max_threads = min(threads, nitems)
70-
p = columnwise_partition(us, n_max_threads)
70+
p = linear_partition(nitems, n_max_threads)
7171
auto_launch!(
7272
bycolumn_kernel!,
7373
args;
@@ -89,9 +89,10 @@ bycolumn_kernel!(
8989
if space isa Spaces.FiniteDifferenceSpace
9090
single_column_function!(f, transform, output, input, init, space)
9191
else
92-
I = columnwise_universal_index(us)
93-
if columnwise_is_valid_index(I, us)
94-
(i, j, _, _, h) = I.I
92+
idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
93+
Ni, Nj, _, _, Nh = size(Fields.field_values(output))
94+
if idx <= Ni * Nj * Nh
95+
i, j, h = cart_ind((Ni, Nj, Nh), idx).I
9596
single_column_function!(
9697
f,
9798
transform,

ext/cuda/operators_thomas_algorithm.jl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ function column_thomas_solve!(::ClimaComms.CUDADevice, A, b)
1111
threads = threads_via_occupancy(thomas_algorithm_kernel!, args)
1212
nitems = Ni * Nj * Nh
1313
n_max_threads = min(threads, nitems)
14-
p = columnwise_partition(us, n_max_threads)
14+
p = linear_partition(nitems, n_max_threads)
1515
auto_launch!(
1616
thomas_algorithm_kernel!,
1717
args;
@@ -25,9 +25,10 @@ function thomas_algorithm_kernel!(
2525
b::Fields.ExtrudedFiniteDifferenceField,
2626
us::DataLayouts.UniversalSize,
2727
)
28-
I = columnwise_universal_index(us)
29-
if columnwise_is_valid_index(I, us)
30-
(i, j, _, _, h) = I.I
28+
idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
29+
Ni, Nj, _, _, Nh = size(Fields.field_values(A))
30+
if idx <= Ni * Nj * Nh
31+
i, j, h = cart_ind((Ni, Nj, Nh), idx).I
3132
thomas_algorithm!(Spaces.column(A, i, j, h), Spaces.column(b, i, j, h))
3233
end
3334
return nothing

0 commit comments

Comments
 (0)