@@ -2,49 +2,23 @@ import ClimaCore.DataLayouts:
2
2
to_non_extruded_broadcasted, has_uniform_datalayouts
3
3
DataLayouts. _device_dispatch (x:: CUDA.CuArray ) = ToCUDA ()
4
4
5
- function knl_copyto! (dest, src)
6
-
7
- i = CUDA. threadIdx (). x
8
- j = CUDA. threadIdx (). y
9
-
10
- h = CUDA. blockIdx (). x
11
- v = CUDA. blockDim (). z * (CUDA. blockIdx (). y - 1 ) + CUDA. threadIdx (). z
12
-
13
- if v <= size (dest, 4 )
14
- I = CartesianIndex ((i, j, 1 , v, h))
15
- @inbounds dest[I] = src[I]
16
- end
17
- return nothing
18
- end
19
-
20
- function knl_copyto_field_array! (dest, src, us)
21
- @inbounds begin
22
- tidx = thread_index ()
23
- if tidx ≤ get_N (us)
24
- n = size (dest)
25
- I = kernel_indexes (tidx, n)
26
- dest[I] = src[I]
27
- end
28
- end
29
- return nothing
30
- end
31
-
32
- function Base. copyto! (
33
- dest:: IJFH{S, Nij, Nh} ,
34
- bc:: DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh} ,
35
- :: ToCUDA ,
36
- ) where {S, Nij, Nh}
37
- us = DataLayouts. UniversalSize (dest)
38
- if Nh > 0
39
- auto_launch! (
40
- knl_copyto_field_array!,
41
- (dest, bc, us),
42
- prod (DataLayouts. universal_size (us));
43
- auto = true ,
44
- )
45
- end
46
- return dest
47
- end
5
+ # function Base.copyto!(
6
+ # dest::VIJFH{S, Nv, Nij, Nh},
7
+ # bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
8
+ # ::ToCUDA,
9
+ # ) where {S, Nv, Nij, Nh}
10
+ # if Nv > 0 && Nh > 0
11
+ # us = DataLayouts.UniversalSize(dest)
12
+ # n = prod(DataLayouts.universal_size(us))
13
+ # if has_uniform_datalayouts(bc)
14
+ # bc′ = to_non_extruded_broadcasted(bc)
15
+ # auto_launch!(knl_copyto_linear!, (dest, bc′, us), n; auto = true)
16
+ # else
17
+ # auto_launch!(knl_copyto_cart!, (dest, bc, us), n; auto = true)
18
+ # end
19
+ # end
20
+ # return dest
21
+ # end
48
22
49
23
function knl_copyto_linear! (dest:: AbstractData , bc, us)
50
24
@inbounds begin
@@ -57,73 +31,30 @@ function knl_copyto_linear!(dest::AbstractData, bc, us)
57
31
end
58
32
59
33
# Copy kernel specialised for a `DataF` destination: stores `bc[tidx]`, where
# `tidx` is the value returned by `thread_index()`, into `dest[]` and returns
# `nothing`. The `us` argument is accepted but not used by this method.
# NOTE(review): unlike `knl_copyto_flat!` there is no `tidx ≤ get_N(us)` guard
# here — presumably the launch covers a single item; confirm against the caller.
function knl_copyto_linear! (dest:: DataF , bc, us)
34
+ tidx = thread_index ()
60
35
@inbounds dest[] = bc[tidx]
61
36
return nothing
62
37
end
63
38
64
# Copy kernel for a general `AbstractData` destination (renamed in this change
# from `knl_copyto_cart!`). For a thread index `tidx` that does not exceed
# `get_N(us)`, it maps `tidx` to an index `I` with
# `kernel_indexes(tidx, size(dest))` and assigns `dest[I] = bc[I]`.
# Threads with a larger `tidx` do nothing. Always returns `nothing`.
- function knl_copyto_cart ! (dest, src , us)
39
+ function knl_copyto_flat ! (dest:: AbstractData , bc , us)
65
40
@inbounds begin
66
41
tidx = thread_index ()
67
42
if tidx ≤ get_N (us)
68
43
n = size (dest)
69
44
I = kernel_indexes (tidx, n)
70
- dest[I] = src [I]
45
+ dest[I] = bc [I]
71
46
end
72
47
end
73
48
return nothing
74
49
end
75
50
76
- function Base. copyto! (
77
- dest:: VIJFH{S, Nv, Nij, Nh} ,
78
- bc:: DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh} ,
79
- :: ToCUDA ,
80
- ) where {S, Nv, Nij, Nh}
81
- if Nv > 0 && Nh > 0
82
- us = DataLayouts. UniversalSize (dest)
83
- n = prod (DataLayouts. universal_size (us))
84
- if has_uniform_datalayouts (bc)
85
- bc′ = to_non_extruded_broadcasted (bc)
86
- auto_launch! (knl_copyto_linear!, (dest, bc′, us), n; auto = true )
87
- else
88
- auto_launch! (knl_copyto_cart!, (dest, bc, us), n; auto = true )
89
- end
90
- end
91
- return dest
92
- end
93
-
94
- function Base. copyto! (
95
- dest:: VF{S, Nv} ,
96
- bc:: DataLayouts.BroadcastedUnionVF{S, Nv} ,
97
- :: ToCUDA ,
98
- ) where {S, Nv}
99
- if Nv > 0
100
- auto_launch! (
101
- knl_copyto!,
102
- (dest, bc);
103
- threads_s = (1 , 1 ),
104
- blocks_s = (1 , Nv),
105
- )
106
- end
107
- return dest
108
- end
109
-
110
- function Base. copyto! (
111
- dest:: DataF{S} ,
112
- bc:: DataLayouts.BroadcastedUnionDataF{S} ,
113
- :: ToCUDA ,
114
- ) where {S}
115
- auto_launch! (knl_copyto!, (dest, bc); threads_s = (1 , 1 ), blocks_s = (1 , 1 ))
116
- return dest
117
- end
118
-
119
- import ClimaCore. DataLayouts: isascalar
120
- function knl_copyto_flat! (dest:: AbstractData , bc, us)
51
+ function knl_copyto_flat! (dest:: DataF , bc, us)
121
52
@inbounds begin
122
53
tidx = thread_index ()
123
54
if tidx ≤ get_N (us)
124
55
n = size (dest)
125
56
I = kernel_indexes (tidx, n)
126
- dest[I ] = bc[I]
57
+ dest[] = bc[I]
127
58
end
128
59
end
129
60
return nothing
132
63
# Shared launcher behind the `copyto!(dest, bc, ::ToCUDA)` methods.
#
# Reads `Nv` and `Nh` from `DataLayouts.universal_size(dest)`; when either is
# zero nothing is launched. Otherwise `n = prod(universal_size(us))` items are
# handed to `auto_launch!`:
#   * if `has_uniform_datalayouts(bc)` holds, `bc` is converted with
#     `to_non_extruded_broadcasted` and `knl_copyto_linear!` is launched;
#   * otherwise `knl_copyto_flat!` is launched on the unmodified `bc`.
# Always returns `dest`.
#
# The item count passed to `auto_launch!` must be `n`: the former `nitems`
# local is deleted by this change, so referring to it would be undefined.
function cuda_copyto! (dest:: AbstractData , bc)
133
64
(_, _, Nv, _, Nh) = DataLayouts. universal_size (dest)
134
65
us = DataLayouts. UniversalSize (dest)
66
+ n = prod (DataLayouts. universal_size (us))
135
67
if Nv > 0 && Nh > 0
136
- nitems = prod (DataLayouts. universal_size (dest))
137
- auto_launch! (knl_copyto_flat!, (dest, bc, us), nitems; auto = true )
68
+ if has_uniform_datalayouts (bc)
69
+ bc′ = to_non_extruded_broadcasted (bc)
70
+ auto_launch! (knl_copyto_linear!, (dest, bc′, us), n; auto = true )
71
+ else
72
+ auto_launch! (knl_copyto_flat!, (dest, bc, us), n; auto = true )
73
+ end
138
74
end
139
75
return dest
140
76
end
141
77
142
78
# TODO : can we use CUDA's launch configuration for all data layouts?
143
79
# Currently, it seems to have a slight performance degradation.
144
80
# ! format: off
145
# Dispatch table: for the `ToCUDA` device, `copyto!` on each data layout below
# forwards `(dest, bc)` to the shared `cuda_copyto!` launcher.
# NOTE(review): the IJFH method constrains `dest` only by `{S, Nij}` while the
# other `*H` layouts also pin `Nh` on `dest` — confirm this is intentional.
- # Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
81
+ Base. copyto! (dest:: IJFH{S, Nij} , bc:: DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh} , :: ToCUDA ) where {S, Nij, Nh} = cuda_copyto! (dest, bc)
146
82
Base. copyto! (dest:: IFH{S, Ni, Nh} , bc:: DataLayouts.BroadcastedUnionIFH{S, Ni, Nh} , :: ToCUDA ) where {S, Ni, Nh} = cuda_copyto! (dest, bc)
147
83
Base. copyto! (dest:: IJF{S, Nij} , bc:: DataLayouts.BroadcastedUnionIJF{S, Nij} , :: ToCUDA ) where {S, Nij} = cuda_copyto! (dest, bc)
148
84
Base. copyto! (dest:: IF{S, Ni} , bc:: DataLayouts.BroadcastedUnionIF{S, Ni} , :: ToCUDA ) where {S, Ni} = cuda_copyto! (dest, bc)
149
85
Base. copyto! (dest:: VIFH{S, Nv, Ni, Nh} , bc:: DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh} , :: ToCUDA ) where {S, Nv, Ni, Nh} = cuda_copyto! (dest, bc)
150
- # Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
151
- # Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
152
- # Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
86
+ Base. copyto! (dest:: VIJFH{S, Nv, Nij, Nh} , bc:: DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh} , :: ToCUDA ) where {S, Nv, Nij, Nh} = cuda_copyto! (dest, bc)
87
+ Base. copyto! (dest:: VF{S, Nv} , bc:: DataLayouts.BroadcastedUnionVF{S, Nv} , :: ToCUDA ) where {S, Nv} = cuda_copyto! (dest, bc)
88
+ Base. copyto! (dest:: DataF{S} , bc:: DataLayouts.BroadcastedUnionDataF{S} , :: ToCUDA ) where {S} = cuda_copyto! (dest, bc)
153
89
# ! format: on
0 commit comments