@@ -114,58 +114,148 @@ RT_API_ATTRS void CheckIntegerKind(
114
114
}
115
115
}
116
116
117
+ template <typename P, int RANK>
117
118
RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous (
118
119
const Descriptor &to, const Descriptor &from) {
119
- SubscriptValue toAt[maxRank], fromAt[maxRank];
120
- to.GetLowerBounds (toAt);
121
- from.GetLowerBounds (fromAt);
120
+ DescriptorIterator<RANK> toIt{to};
121
+ DescriptorIterator<RANK> fromIt{from};
122
+ // Knowing the size at compile time can enable memcpy inlining optimisations
123
+ constexpr std::size_t typeElementBytes{sizeof (P)};
124
+ // We might still need to check the actual size as a fallback
122
125
std::size_t elementBytes{to.ElementBytes ()};
123
126
for (std::size_t n{to.Elements ()}; n-- > 0 ;
124
- to.IncrementSubscripts (toAt), from.IncrementSubscripts (fromAt)) {
125
- std::memcpy (
126
- to.Element <char >(toAt), from.Element <char >(fromAt), elementBytes);
127
+ toIt.Advance (), fromIt.Advance ()) {
128
+ // typeElementBytes == 1 when P is a char - the non-specialised case
129
+ if constexpr (typeElementBytes != 1 ) {
130
+ std::memcpy (
131
+ toIt.template Get <P>(), fromIt.template Get <P>(), typeElementBytes);
132
+ } else {
133
+ std::memcpy (
134
+ toIt.template Get <P>(), fromIt.template Get <P>(), elementBytes);
135
+ }
127
136
}
128
137
}
129
138
139
+ template <typename P, int RANK>
130
140
RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous (
131
141
const Descriptor &to, const Descriptor &from) {
132
142
char *toAt{to.OffsetElement ()};
133
- SubscriptValue fromAt[maxRank];
134
- from.GetLowerBounds (fromAt);
143
+ constexpr std::size_t typeElementBytes{sizeof (P)};
135
144
std::size_t elementBytes{to.ElementBytes ()};
145
+ DescriptorIterator<RANK> fromIt{from};
136
146
for (std::size_t n{to.Elements ()}; n-- > 0 ;
137
- toAt += elementBytes, from.IncrementSubscripts (fromAt)) {
138
- std::memcpy (toAt, from.Element <char >(fromAt), elementBytes);
147
+ toAt += elementBytes, fromIt.Advance ()) {
148
+ if constexpr (typeElementBytes != 1 ) {
149
+ std::memcpy (toAt, fromIt.template Get <P>(), typeElementBytes);
150
+ } else {
151
+ std::memcpy (toAt, fromIt.template Get <P>(), elementBytes);
152
+ }
139
153
}
140
154
}
141
155
156
+ template <typename P, int RANK>
142
157
RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous (
143
158
const Descriptor &to, const Descriptor &from) {
144
- SubscriptValue toAt[maxRank];
145
- to.GetLowerBounds (toAt);
146
159
char *fromAt{from.OffsetElement ()};
160
+ DescriptorIterator<RANK> toIt{to};
161
+ constexpr std::size_t typeElementBytes{sizeof (P)};
147
162
std::size_t elementBytes{to.ElementBytes ()};
148
163
for (std::size_t n{to.Elements ()}; n-- > 0 ;
149
- to.IncrementSubscripts (toAt), fromAt += elementBytes) {
150
- std::memcpy (to.Element <char >(toAt), fromAt, elementBytes);
164
+ toIt.Advance (), fromAt += elementBytes) {
165
+ if constexpr (typeElementBytes != 1 ) {
166
+ std::memcpy (toIt.template Get <P>(), fromAt, typeElementBytes);
167
+ } else {
168
+ std::memcpy (toIt.template Get <P>(), fromAt, elementBytes);
169
+ }
151
170
}
152
171
}
153
172
154
- RT_API_ATTRS void ShallowCopy (const Descriptor &to, const Descriptor &from,
173
+ // ShallowCopy helper for calling the correct specialised variant based on
174
+ // scenario
175
+ template <typename P, int RANK = -1 >
176
+ RT_API_ATTRS void ShallowCopyInner (const Descriptor &to, const Descriptor &from,
155
177
bool toIsContiguous, bool fromIsContiguous) {
156
178
if (toIsContiguous) {
157
179
if (fromIsContiguous) {
158
180
std::memcpy (to.OffsetElement (), from.OffsetElement (),
159
181
to.Elements () * to.ElementBytes ());
160
182
} else {
161
- ShallowCopyDiscontiguousToContiguous (to, from);
183
+ ShallowCopyDiscontiguousToContiguous<P, RANK> (to, from);
162
184
}
163
185
} else {
164
186
if (fromIsContiguous) {
165
- ShallowCopyContiguousToDiscontiguous (to, from);
187
+ ShallowCopyContiguousToDiscontiguous<P, RANK>(to, from);
188
+ } else {
189
+ ShallowCopyDiscontiguousToDiscontiguous<P, RANK>(to, from);
190
+ }
191
+ }
192
+ }
193
+
194
+ // Most arrays are much closer to rank-1 than to maxRank.
195
+ // Doing the recursion upwards instead of downwards puts the more common
196
+ // cases earlier in the if-chain and has a tangible impact on performance.
197
+ template <typename P, int RANK> struct ShallowCopyRankSpecialize {
198
+ static bool execute (const Descriptor &to, const Descriptor &from,
199
+ bool toIsContiguous, bool fromIsContiguous) {
200
+ if (to.rank () == RANK && from.rank () == RANK) {
201
+ ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
202
+ return true ;
203
+ }
204
+ return ShallowCopyRankSpecialize<P, RANK + 1 >::execute (
205
+ to, from, toIsContiguous, fromIsContiguous);
206
+ }
207
+ };
208
+
209
+ template <typename P> struct ShallowCopyRankSpecialize <P, maxRank + 1 > {
210
+ static bool execute (const Descriptor &to, const Descriptor &from,
211
+ bool toIsContiguous, bool fromIsContiguous) {
212
+ return false ;
213
+ }
214
+ };
215
+
216
+ // ShallowCopy helper for specialising the variants based on array rank
217
+ template <typename P>
218
+ RT_API_ATTRS void ShallowCopyRank (const Descriptor &to, const Descriptor &from,
219
+ bool toIsContiguous, bool fromIsContiguous) {
220
+ // Try to call a specialised ShallowCopy variant from rank-1 up to maxRank
221
+ bool specialized{ShallowCopyRankSpecialize<P, 1 >::execute (
222
+ to, from, toIsContiguous, fromIsContiguous)};
223
+ if (!specialized) {
224
+ ShallowCopyInner<P>(to, from, toIsContiguous, fromIsContiguous);
225
+ }
226
+ }
227
+
228
+ RT_API_ATTRS void ShallowCopy (const Descriptor &to, const Descriptor &from,
229
+ bool toIsContiguous, bool fromIsContiguous) {
230
+ std::size_t elementBytes{to.ElementBytes ()};
231
+ // Checking the type at runtime and making sure the pointer passed to memcpy
232
+ // has a type that matches the element type makes it possible for the compiler
233
+ // to optimise out the memcpy calls altogether and can substantially improve
234
+ // performance for some applications.
235
+ if (to.type ().IsInteger ()) {
236
+ if (elementBytes == sizeof (int64_t )) {
237
+ ShallowCopyRank<int64_t >(to, from, toIsContiguous, fromIsContiguous);
238
+ } else if (elementBytes == sizeof (int32_t )) {
239
+ ShallowCopyRank<int32_t >(to, from, toIsContiguous, fromIsContiguous);
240
+ } else if (elementBytes == sizeof (int16_t )) {
241
+ ShallowCopyRank<int16_t >(to, from, toIsContiguous, fromIsContiguous);
242
+ #if defined USING_NATIVE_INT128_T
243
+ } else if (elementBytes == sizeof (__int128_t )) {
244
+ ShallowCopyRank<__int128_t >(to, from, toIsContiguous, fromIsContiguous);
245
+ #endif
166
246
} else {
167
- ShallowCopyDiscontiguousToDiscontiguous (to, from);
247
+ ShallowCopyRank< char > (to, from, toIsContiguous, fromIsContiguous );
168
248
}
249
+ } else if (to.type ().IsReal ()) {
250
+ if (elementBytes == sizeof (double )) {
251
+ ShallowCopyRank<double >(to, from, toIsContiguous, fromIsContiguous);
252
+ } else if (elementBytes == sizeof (float )) {
253
+ ShallowCopyRank<float >(to, from, toIsContiguous, fromIsContiguous);
254
+ } else {
255
+ ShallowCopyRank<char >(to, from, toIsContiguous, fromIsContiguous);
256
+ }
257
+ } else {
258
+ ShallowCopyRank<char >(to, from, toIsContiguous, fromIsContiguous);
169
259
}
170
260
}
171
261
0 commit comments