+import ClimaCore.DataLayouts:
+    to_non_extruded_broadcasted, has_uniform_datalayouts
DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()

-function knl_copyto!(dest, src)
+# function Base.copyto!(
+#     dest::VIJFH{S, Nv, Nij, Nh},
+#     bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
+#     ::ToCUDA,
+# ) where {S, Nv, Nij, Nh}
+#     if Nv > 0 && Nh > 0
+#         us = DataLayouts.UniversalSize(dest)
+#         n = prod(DataLayouts.universal_size(us))
+#         if has_uniform_datalayouts(bc)
+#             bc′ = to_non_extruded_broadcasted(bc)
+#             auto_launch!(knl_copyto_linear!, (dest, bc′, us), n; auto = true)
+#         else
+#             auto_launch!(knl_copyto_cart!, (dest, bc, us), n; auto = true)
+#         end
+#     end
+#     return dest
+# end

-    i = CUDA.threadIdx().x
-    j = CUDA.threadIdx().y
-
-    h = CUDA.blockIdx().x
-    v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
-
-    if v <= size(dest, 4)
-        I = CartesianIndex((i, j, 1, v, h))
-        @inbounds dest[I] = src[I]
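+# Linear-index copy kernel: each thread copies one element through a flat
+# index; used when every data layout in the broadcast is uniform.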
+function knl_copyto_linear!(dest::AbstractData, bc, us)
+    @inbounds begin
+        tidx = thread_index()
+        if tidx ≤ get_N(us)
+            dest[tidx] = bc[tidx]
+        end
    end
    return nothing
end

-function Base.copyto!(
-    dest::IJFH{S, Nij, Nh},
-    bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
-    ::ToCUDA,
-) where {S, Nij, Nh}
-    if Nh > 0
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (Nij, Nij),
-            blocks_s = (Nh, 1),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::VIJFH{S, Nv, Nij, Nh},
-    bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
-    ::ToCUDA,
-) where {S, Nv, Nij, Nh}
-    if Nv > 0 && Nh > 0
-        Nv_per_block = min(Nv, fld(256, Nij * Nij))
-        Nv_blocks = cld(Nv, Nv_per_block)
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (Nij, Nij, Nv_per_block),
-            blocks_s = (Nh, Nv_blocks),
-        )
-    end
-    return dest
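+# A `DataF` holds a single (pointwise) field value, so the destination is
+# written through `dest[]` rather than a linear index.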
+function knl_copyto_linear!(dest::DataF, bc, us)
+    tidx = thread_index()
+    @inbounds dest[] = bc[tidx]
+    return nothing
end

-function Base.copyto!(
-    dest::VF{S, Nv},
-    bc::DataLayouts.BroadcastedUnionVF{S, Nv},
-    ::ToCUDA,
-) where {S, Nv}
-    if Nv > 0
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (1, 1),
-            blocks_s = (1, Nv),
-        )
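+# Fallback copy kernel: the flat thread index is decoded into a CartesianIndex
+# via kernel_indexes, which stays correct when the broadcast arguments do not
+# share a uniform data layout.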
+function knl_copyto_flat!(dest::AbstractData, bc, us)
+    @inbounds begin
+        tidx = thread_index()
+        if tidx ≤ get_N(us)
+            n = size(dest)
+            I = kernel_indexes(tidx, n)
+            dest[I] = bc[I]
+        end
    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::DataF{S},
-    bc::DataLayouts.BroadcastedUnionDataF{S},
-    ::ToCUDA,
-) where {S}
-    auto_launch!(knl_copyto!, (dest, bc); threads_s = (1, 1), blocks_s = (1, 1))
-    return dest
+    return nothing
end

-import ClimaCore.DataLayouts: isascalar
-function knl_copyto_flat!(dest::AbstractData, bc, us)
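+# `DataF` specialization of the fallback kernel: the single value is written
+# through `dest[]`.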
+function knl_copyto_flat!(dest::DataF, bc, us)
    @inbounds begin
        tidx = thread_index()
        if tidx ≤ get_N(us)
            n = size(dest)
            I = kernel_indexes(tidx, n)
-            dest[I] = bc[I]
+            dest[] = bc[I]
        end
    end
    return nothing
end

function cuda_copyto!(dest::AbstractData, bc)
    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
    us = DataLayouts.UniversalSize(dest)
+    nitems = prod(DataLayouts.universal_size(us))
    if Nv > 0 && Nh > 0
-        nitems = prod(DataLayouts.universal_size(dest))
-        auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
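+        # If every data layout in the broadcast is uniform, rewrite the broadcast
+        # with non-extruded (linear) indexing and launch the linear-index kernel;
+        # otherwise fall back to the CartesianIndex-based kernel.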
+        if has_uniform_datalayouts(bc)
+            bc′ = to_non_extruded_broadcasted(bc)
+            auto_launch!(
+                knl_copyto_linear!,
+                (dest, bc′, us),
+                nitems;
+                auto = true,
+            )
+        else
+            auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
+        end
    end
    return dest
end

# TODO: can we use CUDA's launch configuration for all data layouts?
# Currently, it seems to have a slight performance degradation.
#! format: off
-# Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IFH{S, Ni, Nh}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh}, ::ToCUDA) where {S, Ni, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IJF{S, Nij}, bc::DataLayouts.BroadcastedUnionIJF{S, Nij}, ::ToCUDA) where {S, Nij} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IF{S, Ni}, bc::DataLayouts.BroadcastedUnionIF{S, Ni}, ::ToCUDA) where {S, Ni} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VIFH{S, Nv, Ni, Nh}, bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh}, ::ToCUDA) where {S, Nv, Ni, Nh} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
#! format: on
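
For reference, here is a minimal, self-contained sketch (not part of this patch) of the two indexing strategies used by the kernels above, written against plain CuArrays with CUDA.jl only. The function names, array sizes, and launch parameters are illustrative, and ClimaCore's helpers (auto_launch!, thread_index, get_N, kernel_indexes) are replaced by their plain CUDA.jl counterparts.

using CUDA

# Linear-index copy: one element per thread, addressed by a flat index.
function sketch_copy_linear!(dest, src, N)
    tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
    if tidx <= N
        @inbounds dest[tidx] = src[tidx]
    end
    return nothing
end

# Cartesian fallback: decode the flat thread index into a CartesianIndex,
# analogous to knl_copyto_flat! above.
function sketch_copy_cartesian!(dest, src, N)
    tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
    if tidx <= N
        I = CartesianIndices(dest)[tidx]
        @inbounds dest[I] = src[I]
    end
    return nothing
end

dest = CUDA.zeros(Float32, 4, 4, 8)
src = CUDA.rand(Float32, 4, 4, 8)
N = length(dest)
@cuda threads = 256 blocks = cld(N, 256) sketch_copy_linear!(dest, src, N)
@assert Array(dest) == Array(src)
fill!(dest, 0)
@cuda threads = 256 blocks = cld(N, 256) sketch_copy_cartesian!(dest, src, N)
@assert Array(dest) == Array(src)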