Skip to content

Commit 5ea5fea

Browse files
Merge pull request #1804 from CliMA/ck/test_copyto
Add DataLayouts `copyto!` unit tests
2 parents 5f23521 + 21f00f8 commit 5ea5fea

File tree

9 files changed

+276
-86
lines changed

9 files changed

+276
-86
lines changed

.buildkite/pipeline.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ steps:
8181
key: unit_data_fill
8282
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_fill.jl"
8383

84+
- label: "Unit: data_copyto"
85+
key: unit_data_copyto
86+
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_copyto.jl"
87+
8488
- label: "Unit: data_opt_similar"
8589
key: data_opt_similar
8690
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/opt_similar.jl"
@@ -125,6 +129,16 @@ steps:
125129
agents:
126130
slurm_gpus: 1
127131

132+
- label: "Unit: data copyto"
133+
key: gpu_unit_data_copyto
134+
command:
135+
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
136+
- "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_copyto.jl"
137+
env:
138+
CLIMACOMMS_DEVICE: "CUDA"
139+
agents:
140+
slurm_gpus: 1
141+
128142
- group: "Unit: Geometry"
129143
steps:
130144

ext/cuda/cuda_utils.jl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@ import ClimaCore.DataLayouts
44
import ClimaCore.DataLayouts: empty_kernel_stats
55

66
get_n_items(field::Fields.Field) = get_n_items(Fields.field_values(field))
7-
get_n_items(data::DataLayouts.AbstractData) =
8-
get_n_items(DataLayouts.universal_size(data))
7+
get_n_items(data::DataLayouts.AbstractData) = get_n_items(size(data))
98
get_n_items(arr::AbstractArray) = get_n_items(size(parent(arr)))
109
get_n_items(tup::Tuple) = prod(tup)
1110

ext/cuda/data_layouts.jl

Lines changed: 2 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -25,89 +25,8 @@ Base.similar(
2525
dims::Dims{N},
2626
) where {T, N, B} = similar(CUDA.CuArray{T, N, B}, dims)
2727

28-
function knl_copyto!(dest, src)
29-
30-
i = CUDA.threadIdx().x
31-
j = CUDA.threadIdx().y
32-
33-
h = CUDA.blockIdx().x
34-
v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
35-
36-
if v <= size(dest, 4)
37-
I = CartesianIndex((i, j, 1, v, h))
38-
@inbounds dest[I] = src[I]
39-
end
40-
return nothing
41-
end
42-
43-
function Base.copyto!(
44-
dest::IJFH{S, Nij},
45-
bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, A},
46-
) where {S, Nij, A <: CuArrayBackedTypes}
47-
_, _, _, _, Nh = size(bc)
48-
if Nh > 0
49-
auto_launch!(
50-
knl_copyto!,
51-
(dest, bc),
52-
dest;
53-
threads_s = (Nij, Nij),
54-
blocks_s = (Nh, 1),
55-
)
56-
end
57-
return dest
58-
end
59-
60-
function Base.copyto!(
61-
dest::VIJFH{S, Nv, Nij},
62-
bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, A},
63-
) where {S, Nv, Nij, A <: CuArrayBackedTypes}
64-
_, _, _, _, Nh = size(bc)
65-
if Nv > 0 && Nh > 0
66-
Nv_per_block = min(Nv, fld(256, Nij * Nij))
67-
Nv_blocks = cld(Nv, Nv_per_block)
68-
auto_launch!(
69-
knl_copyto!,
70-
(dest, bc),
71-
dest;
72-
threads_s = (Nij, Nij, Nv_per_block),
73-
blocks_s = (Nh, Nv_blocks),
74-
)
75-
end
76-
return dest
77-
end
78-
79-
function Base.copyto!(
80-
dest::VF{S, Nv},
81-
bc::DataLayouts.BroadcastedUnionVF{S, Nv, A},
82-
) where {S, Nv, A <: CuArrayBackedTypes}
83-
_, _, _, _, Nh = size(dest)
84-
if Nv > 0 && Nh > 0
85-
auto_launch!(
86-
knl_copyto!,
87-
(dest, bc),
88-
dest;
89-
threads_s = (1, 1),
90-
blocks_s = (Nh, Nv),
91-
)
92-
end
93-
return dest
94-
end
95-
96-
function Base.copyto!(
97-
dest::DataF{S},
98-
bc::DataLayouts.BroadcastedUnionDataF{S, A},
99-
) where {S, A <: CUDA.CuArray}
100-
auto_launch!(
101-
knl_copyto!,
102-
(dest, bc),
103-
dest;
104-
threads_s = (1, 1),
105-
blocks_s = (1, 1),
106-
)
107-
return dest
108-
end
109-
110-
include("fill.jl")
28+
include("data_layouts_fill.jl")
29+
include("data_layouts_copyto.jl")
11130

11231
Base.@propagate_inbounds function rcopyto_at!(
11332
pair::Pair{<:AbstractData, <:Any},

ext/cuda/data_layouts_copyto.jl

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
function knl_copyto!(dest, src)
2+
3+
i = CUDA.threadIdx().x
4+
j = CUDA.threadIdx().y
5+
6+
h = CUDA.blockIdx().x
7+
v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
8+
9+
if v <= size(dest, 4)
10+
I = CartesianIndex((i, j, 1, v, h))
11+
@inbounds dest[I] = src[I]
12+
end
13+
return nothing
14+
end
15+
16+
function Base.copyto!(
17+
dest::IJFH{S, Nij},
18+
bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, A},
19+
) where {S, Nij, A <: CuArrayBackedTypes}
20+
_, _, _, _, Nh = size(bc)
21+
if Nh > 0
22+
auto_launch!(
23+
knl_copyto!,
24+
(dest, bc),
25+
dest;
26+
threads_s = (Nij, Nij),
27+
blocks_s = (Nh, 1),
28+
)
29+
end
30+
return dest
31+
end
32+
33+
function Base.copyto!(
34+
dest::VIJFH{S, Nv, Nij},
35+
bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, A},
36+
) where {S, Nv, Nij, A <: CuArrayBackedTypes}
37+
_, _, _, _, Nh = size(bc)
38+
if Nv > 0 && Nh > 0
39+
Nv_per_block = min(Nv, fld(256, Nij * Nij))
40+
Nv_blocks = cld(Nv, Nv_per_block)
41+
auto_launch!(
42+
knl_copyto!,
43+
(dest, bc),
44+
dest;
45+
threads_s = (Nij, Nij, Nv_per_block),
46+
blocks_s = (Nh, Nv_blocks),
47+
)
48+
end
49+
return dest
50+
end
51+
52+
function Base.copyto!(
53+
dest::VF{S, Nv},
54+
bc::DataLayouts.BroadcastedUnionVF{S, Nv, A},
55+
) where {S, Nv, A <: CuArrayBackedTypes}
56+
_, _, _, _, Nh = size(dest)
57+
if Nv > 0 && Nh > 0
58+
auto_launch!(
59+
knl_copyto!,
60+
(dest, bc),
61+
dest;
62+
threads_s = (1, 1),
63+
blocks_s = (Nh, Nv),
64+
)
65+
end
66+
return dest
67+
end
68+
69+
function Base.copyto!(
70+
dest::DataF{S},
71+
bc::DataLayouts.BroadcastedUnionDataF{S, A},
72+
) where {S, A <: CUDA.CuArray}
73+
auto_launch!(
74+
knl_copyto!,
75+
(dest, bc),
76+
dest;
77+
threads_s = (1, 1),
78+
blocks_s = (1, 1),
79+
)
80+
return dest
81+
end
82+
83+
import ClimaCore.DataLayouts: isascalar
84+
function knl_copyto_flat!(dest::AbstractData, bc)
85+
@inbounds begin
86+
n = size(dest)
87+
tidx = thread_index()
88+
if valid_range(tidx, prod(n))
89+
I = kernel_indexes(tidx, n)
90+
dest[I] = bc[I]
91+
end
92+
end
93+
return nothing
94+
end
95+
96+
function cuda_copyto!(dest::AbstractData, bc)
97+
(_, _, Nf, Nv, Nh) = DataLayouts.universal_size(dest)
98+
if Nv > 0 && Nh > 0 && Nf > 0
99+
auto_launch!(knl_copyto_flat!, (dest, bc), dest; auto = true)
100+
end
101+
return dest
102+
end
103+
104+
# TODO: can we use CUDA's launch configuration for all data layouts?
105+
# Currently, it seems to have a slight performance degradation.
106+
#! format: off
107+
# Base.copyto!(dest::IJFH{S, Nij, <:CuArrayBackedTypes}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, <:CuArrayBackedTypes}) where {S, Nij} = cuda_copyto!(dest, bc)
108+
Base.copyto!(dest::IFH{S, Ni, <:CuArrayBackedTypes}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, <:CuArrayBackedTypes}) where {S, Ni} = cuda_copyto!(dest, bc)
109+
Base.copyto!(dest::IJF{S, Nij, <:CuArrayBackedTypes}, bc::DataLayouts.BroadcastedUnionIJF{S, Nij, <:CuArrayBackedTypes}) where {S, Nij} = cuda_copyto!(dest, bc)
110+
Base.copyto!(dest::IF{S, Ni, <:CuArrayBackedTypes}, bc::DataLayouts.BroadcastedUnionIF{S, Ni, <:CuArrayBackedTypes}) where {S, Ni} = cuda_copyto!(dest, bc)
111+
# Base.copyto!(dest::VIFH{S, Nv, Ni, <:CuArrayBackedTypes}, bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, <:CuArrayBackedTypes}) where {S, Nv, Ni} = cuda_copyto!(dest, bc)
112+
# Base.copyto!(dest::VIJFH{S, Nv, Nij, <:CuArrayBackedTypes}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, <:CuArrayBackedTypes}) where {S, Nv, Nij} = cuda_copyto!(dest, bc)
113+
# Base.copyto!(dest::VF{S, Nv, <:CuArrayBackedTypes}, bc::DataLayouts.BroadcastedUnionVF{S, Nv, <:CuArrayBackedTypes}) where {S, Nv} = cuda_copyto!(dest, bc)
114+
# Base.copyto!(dest::DataF{S, <:CuArrayBackedTypes}, bc::DataLayouts.BroadcastedUnionDataF{S, <:CuArrayBackedTypes}) where {S} = cuda_copyto!(dest, bc)
115+
#! format: on

ext/cuda/fill.jl renamed to ext/cuda/data_layouts_fill.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
function knl_fill_flat!(dest::AbstractData, val)
22
@inbounds begin
33
tidx = thread_index()
4-
n = DataLayouts.universal_size(dest)
4+
n = size(dest)
55
if valid_range(tidx, prod(n))
66
I = kernel_indexes(tidx, n)
77
@inbounds dest[I] = val

src/DataLayouts/broadcast.jl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,17 @@ function Base.copyto!(
540540
return dest
541541
end
542542

543+
function Base.copyto!(
544+
dest::IF{S, Ni},
545+
bc::BroadcastedUnionIF{S, Ni, A},
546+
) where {S, Ni, A}
547+
@inbounds for i in 1:Ni
548+
idx = CartesianIndex(i, 1, 1, 1, 1)
549+
dest[idx] = convert(S, bc[idx])
550+
end
551+
return dest
552+
end
553+
543554
# inline inner slab(::DataSlab1D) copy
544555
function Base.copyto!(
545556
dest::IF{S, Ni},

src/Geometry/globalgeometry.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ LocalVector(u::CartesianVector{T,I}, ::CartesianGlobalGeometry) where {T,I} =
6262
=#
6363

6464
abstract type AbstractSphericalGlobalGeometry <: AbstractGlobalGeometry end
65+
Base.broadcastable(x::AbstractSphericalGlobalGeometry) = tuple(x)
6566

6667
"""
6768
SphericalGlobalGeometry(radius)

test/DataLayouts/benchmark_fill.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ function benchmarkfill!(device, data, val)
1212
trial = @benchmark ClimaComms.@cuda_sync $device fill!($data, $val)
1313
show(stdout, MIME("text/plain"), trial)
1414
println()
15+
trial =
16+
@benchmark ClimaComms.@cuda_sync $device fill!($(parent(data)), $val)
17+
show(stdout, MIME("text/plain"), trial)
18+
println()
1519
end
1620

1721
@testset "fill! with Nf = 1" begin

0 commit comments

Comments
 (0)