Skip to content

Commit c2892b0

Browse files
authored
[flang-rt] Optimise ShallowCopy and use it in CopyInAssign (llvm#140569)
Using Descriptor.Element<>() when iterating through a rank-1 array is currently inefficient, because the generic implementation suitable for arrays of any rank makes the compiler unable to perform optimisations that would make the rank-1 case considerably faster. This is currently done inside ShallowCopy, as well as by CopyInAssign, where the implementation of elemental copies (inside Assign) is equivalent to ShallowCopyDiscontiguousToDiscontiguous. To address that, add a DescriptorIterator abstraction specialised for arrays of various ranks, and use that throughout ShallowCopy to iterate over the arrays. Furthermore, depending on the pointer type passed to memcpy, the optimiser can remove the memcpy calls from ShallowCopy altogether which can result in substantial performance improvements on its own. Specialise ShallowCopy for various element pointer types to make these optimisations possible. Finally, replace the call to Assign inside CopyInAssign with a call to newly optimised ShallowCopy. For the thornado-mini application, this reduces the runtime by 27.7%. --------- Signed-off-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
1 parent 6375a85 commit c2892b0

File tree

6 files changed

+232
-23
lines changed

6 files changed

+232
-23
lines changed

flang-rt/include/flang-rt/runtime/descriptor.h

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,64 @@ class Descriptor {
437437
};
438438
static_assert(sizeof(Descriptor) == sizeof(ISO::CFI_cdesc_t));
439439

440+
// Lightweight iterator-like API to simplify specialising Descriptor indexing
441+
// in cases where it can improve application performance. On account of the
442+
// purpose of this API being performance optimisation, it is up to the user to
443+
// do all the necessary checks to make sure the specialised variants can be used
444+
// safely and that Advance() is not called more times than the number of
445+
// elements in the Descriptor allows for.
446+
// Default RANK=-1 supports array descriptors of any rank up to maxRank.
447+
template <int RANK = -1> class DescriptorIterator {
448+
private:
449+
const Descriptor &descriptor;
450+
SubscriptValue subscripts[maxRank];
451+
std::size_t elementOffset{0};
452+
453+
public:
454+
RT_API_ATTRS DescriptorIterator(const Descriptor &descriptor)
455+
: descriptor(descriptor) {
456+
// We do not need the subscripts to iterate over a rank-1 array
457+
if constexpr (RANK != 1) {
458+
descriptor.GetLowerBounds(subscripts);
459+
}
460+
};
461+
462+
template <typename A> RT_API_ATTRS A *Get() {
463+
std::size_t offset{0};
464+
// The rank-1 case doesn't require looping at all
465+
if constexpr (RANK == 1) {
466+
offset = elementOffset;
467+
// The compiler might be able to optimise this better if we know the rank
468+
// at compile time
469+
} else if constexpr (RANK != -1) {
470+
for (int j{0}; j < RANK; ++j) {
471+
offset += descriptor.SubscriptByteOffset(j, subscripts[j]);
472+
}
473+
// General fallback
474+
} else {
475+
offset = descriptor.SubscriptsToByteOffset(subscripts);
476+
}
477+
478+
return descriptor.OffsetElement<A>(offset);
479+
}
480+
481+
RT_API_ATTRS void Advance() {
482+
if constexpr (RANK == 1) {
483+
elementOffset += descriptor.GetDimension(0).ByteStride();
484+
} else if constexpr (RANK != -1) {
485+
for (int j{0}; j < RANK; ++j) {
486+
const Dimension &dim{descriptor.GetDimension(j)};
487+
if (subscripts[j]++ < dim.UpperBound()) {
488+
break;
489+
}
490+
subscripts[j] = dim.LowerBound();
491+
}
492+
} else {
493+
descriptor.IncrementSubscripts(subscripts);
494+
}
495+
}
496+
};
497+
440498
// Properly configured instances of StaticDescriptor will occupy the
441499
// exact amount of storage required for the descriptor, its dimensional
442500
// information, and possible addendum. To build such a static descriptor,

flang-rt/include/flang-rt/runtime/tools.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,10 +511,13 @@ inline RT_API_ATTRS const char *FindCharacter(
511511
// Copy payload data from one allocated descriptor to another.
512512
// Assumes element counts and element sizes match, and that both
513513
// descriptors are allocated.
514+
template <typename P = char, int RANK = -1>
514515
RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
515516
const Descriptor &to, const Descriptor &from);
517+
template <typename P = char, int RANK = -1>
516518
RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
517519
const Descriptor &to, const Descriptor &from);
520+
template <typename P = char, int RANK = -1>
518521
RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
519522
const Descriptor &to, const Descriptor &from);
520523
RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,

flang-rt/lib/runtime/assign.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -497,7 +497,7 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
497497
}
498498
} else { // elemental copies, possibly with character truncation
499499
for (std::size_t n{toElements}; n-- > 0;
500-
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
500+
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
501501
memmoveFct(to.Element<char>(toAt), from.Element<const char>(fromAt),
502502
toElementBytes);
503503
}
@@ -591,7 +591,8 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var,
591591
temp = var;
592592
temp.set_base_addr(nullptr);
593593
temp.raw().attribute = CFI_attribute_allocatable;
594-
RTNAME(AssignTemporary)(temp, var, sourceFile, sourceLine);
594+
temp.Allocate(kNoAsyncId);
595+
ShallowCopy(temp, var);
595596
}
596597

597598
void RTDEF(CopyOutAssign)(
@@ -600,9 +601,10 @@ void RTDEF(CopyOutAssign)(
600601

601602
// Copyout from the temporary must not cause any finalizations
602603
// for LHS. The variable must be properly initialized already.
603-
if (var)
604-
Assign(*var, temp, terminator, NoAssignFlags);
605-
temp.Destroy(/*finalize=*/false, /*destroyPointers=*/false, &terminator);
604+
if (var) {
605+
ShallowCopy(*var, temp);
606+
}
607+
temp.Deallocate();
606608
}
607609

608610
void RTDEF(AssignExplicitLengthCharacter)(Descriptor &to,

flang-rt/lib/runtime/tools.cpp

Lines changed: 108 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -114,58 +114,148 @@ RT_API_ATTRS void CheckIntegerKind(
114114
}
115115
}
116116

117+
template <typename P, int RANK>
117118
RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
118119
const Descriptor &to, const Descriptor &from) {
119-
SubscriptValue toAt[maxRank], fromAt[maxRank];
120-
to.GetLowerBounds(toAt);
121-
from.GetLowerBounds(fromAt);
120+
DescriptorIterator<RANK> toIt{to};
121+
DescriptorIterator<RANK> fromIt{from};
122+
// Knowing the size at compile time can enable memcpy inlining optimisations
123+
constexpr std::size_t typeElementBytes{sizeof(P)};
124+
// We might still need to check the actual size as a fallback
122125
std::size_t elementBytes{to.ElementBytes()};
123126
for (std::size_t n{to.Elements()}; n-- > 0;
124-
to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
125-
std::memcpy(
126-
to.Element<char>(toAt), from.Element<char>(fromAt), elementBytes);
127+
toIt.Advance(), fromIt.Advance()) {
128+
// typeElementBytes == 1 when P is a char - the non-specialised case
129+
if constexpr (typeElementBytes != 1) {
130+
std::memcpy(
131+
toIt.template Get<P>(), fromIt.template Get<P>(), typeElementBytes);
132+
} else {
133+
std::memcpy(
134+
toIt.template Get<P>(), fromIt.template Get<P>(), elementBytes);
135+
}
127136
}
128137
}
129138

139+
template <typename P, int RANK>
130140
RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
131141
const Descriptor &to, const Descriptor &from) {
132142
char *toAt{to.OffsetElement()};
133-
SubscriptValue fromAt[maxRank];
134-
from.GetLowerBounds(fromAt);
143+
constexpr std::size_t typeElementBytes{sizeof(P)};
135144
std::size_t elementBytes{to.ElementBytes()};
145+
DescriptorIterator<RANK> fromIt{from};
136146
for (std::size_t n{to.Elements()}; n-- > 0;
137-
toAt += elementBytes, from.IncrementSubscripts(fromAt)) {
138-
std::memcpy(toAt, from.Element<char>(fromAt), elementBytes);
147+
toAt += elementBytes, fromIt.Advance()) {
148+
if constexpr (typeElementBytes != 1) {
149+
std::memcpy(toAt, fromIt.template Get<P>(), typeElementBytes);
150+
} else {
151+
std::memcpy(toAt, fromIt.template Get<P>(), elementBytes);
152+
}
139153
}
140154
}
141155

156+
template <typename P, int RANK>
142157
RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
143158
const Descriptor &to, const Descriptor &from) {
144-
SubscriptValue toAt[maxRank];
145-
to.GetLowerBounds(toAt);
146159
char *fromAt{from.OffsetElement()};
160+
DescriptorIterator<RANK> toIt{to};
161+
constexpr std::size_t typeElementBytes{sizeof(P)};
147162
std::size_t elementBytes{to.ElementBytes()};
148163
for (std::size_t n{to.Elements()}; n-- > 0;
149-
to.IncrementSubscripts(toAt), fromAt += elementBytes) {
150-
std::memcpy(to.Element<char>(toAt), fromAt, elementBytes);
164+
toIt.Advance(), fromAt += elementBytes) {
165+
if constexpr (typeElementBytes != 1) {
166+
std::memcpy(toIt.template Get<P>(), fromAt, typeElementBytes);
167+
} else {
168+
std::memcpy(toIt.template Get<P>(), fromAt, elementBytes);
169+
}
151170
}
152171
}
153172

154-
RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
173+
// ShallowCopy helper for calling the correct specialised variant based on
174+
// whether the source and destination descriptors are contiguous
175+
template <typename P, int RANK = -1>
176+
RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
155177
bool toIsContiguous, bool fromIsContiguous) {
156178
if (toIsContiguous) {
157179
if (fromIsContiguous) {
158180
std::memcpy(to.OffsetElement(), from.OffsetElement(),
159181
to.Elements() * to.ElementBytes());
160182
} else {
161-
ShallowCopyDiscontiguousToContiguous(to, from);
183+
ShallowCopyDiscontiguousToContiguous<P, RANK>(to, from);
162184
}
163185
} else {
164186
if (fromIsContiguous) {
165-
ShallowCopyContiguousToDiscontiguous(to, from);
187+
ShallowCopyContiguousToDiscontiguous<P, RANK>(to, from);
188+
} else {
189+
ShallowCopyDiscontiguousToDiscontiguous<P, RANK>(to, from);
190+
}
191+
}
192+
}
193+
194+
// Most arrays are much closer to rank-1 than to maxRank.
195+
// Doing the recursion upwards instead of downwards puts the more common
196+
// cases earlier in the if-chain and has a tangible impact on performance.
197+
template <typename P, int RANK> struct ShallowCopyRankSpecialize {
198+
static bool execute(const Descriptor &to, const Descriptor &from,
199+
bool toIsContiguous, bool fromIsContiguous) {
200+
if (to.rank() == RANK && from.rank() == RANK) {
201+
ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
202+
return true;
203+
}
204+
return ShallowCopyRankSpecialize<P, RANK + 1>::execute(
205+
to, from, toIsContiguous, fromIsContiguous);
206+
}
207+
};
208+
209+
template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
210+
static bool execute(const Descriptor &to, const Descriptor &from,
211+
bool toIsContiguous, bool fromIsContiguous) {
212+
return false;
213+
}
214+
};
215+
216+
// ShallowCopy helper for specialising the variants based on array rank
217+
template <typename P>
218+
RT_API_ATTRS void ShallowCopyRank(const Descriptor &to, const Descriptor &from,
219+
bool toIsContiguous, bool fromIsContiguous) {
220+
// Try to call a specialised ShallowCopy variant from rank-1 up to maxRank
221+
bool specialized{ShallowCopyRankSpecialize<P, 1>::execute(
222+
to, from, toIsContiguous, fromIsContiguous)};
223+
if (!specialized) {
224+
ShallowCopyInner<P>(to, from, toIsContiguous, fromIsContiguous);
225+
}
226+
}
227+
228+
RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
229+
bool toIsContiguous, bool fromIsContiguous) {
230+
std::size_t elementBytes{to.ElementBytes()};
231+
// Checking the type at runtime and making sure the pointer passed to memcpy
232+
// has a type that matches the element type makes it possible for the compiler
233+
// to optimise out the memcpy calls altogether and can substantially improve
234+
// performance for some applications.
235+
if (to.type().IsInteger()) {
236+
if (elementBytes == sizeof(int64_t)) {
237+
ShallowCopyRank<int64_t>(to, from, toIsContiguous, fromIsContiguous);
238+
} else if (elementBytes == sizeof(int32_t)) {
239+
ShallowCopyRank<int32_t>(to, from, toIsContiguous, fromIsContiguous);
240+
} else if (elementBytes == sizeof(int16_t)) {
241+
ShallowCopyRank<int16_t>(to, from, toIsContiguous, fromIsContiguous);
242+
#if defined USING_NATIVE_INT128_T
243+
} else if (elementBytes == sizeof(__int128_t)) {
244+
ShallowCopyRank<__int128_t>(to, from, toIsContiguous, fromIsContiguous);
245+
#endif
166246
} else {
167-
ShallowCopyDiscontiguousToDiscontiguous(to, from);
247+
ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
168248
}
249+
} else if (to.type().IsReal()) {
250+
if (elementBytes == sizeof(double)) {
251+
ShallowCopyRank<double>(to, from, toIsContiguous, fromIsContiguous);
252+
} else if (elementBytes == sizeof(float)) {
253+
ShallowCopyRank<float>(to, from, toIsContiguous, fromIsContiguous);
254+
} else {
255+
ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
256+
}
257+
} else {
258+
ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
169259
}
170260
}
171261

flang-rt/unittests/Runtime/Assign.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
//===-- unittests/Runtime/Assign.cpp ------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "flang/Runtime/assign.h"
10+
#include "tools.h"
11+
#include "gtest/gtest.h"
12+
#include <vector>
13+
14+
using namespace Fortran::runtime;
15+
using Fortran::common::TypeCategory;
16+
17+
TEST(Assign, RTNAME(CopyInAssign)) {
18+
// contiguous -> contiguous copy in
19+
auto intArray{MakeArray<TypeCategory::Integer, 1>(
20+
std::vector<int>{2, 3}, std::vector<int>{1, 2, 3, 4, 5, 6}, sizeof(int))};
21+
StaticDescriptor<2> staticIntResult;
22+
Descriptor &intResult{staticIntResult.descriptor()};
23+
24+
RTNAME(CopyInAssign(intResult, *intArray));
25+
ASSERT_TRUE(intResult.IsAllocated());
26+
ASSERT_TRUE(intResult.IsContiguous());
27+
ASSERT_EQ(intResult.type(), intArray->type());
28+
ASSERT_EQ(intResult.ElementBytes(), sizeof(int));
29+
EXPECT_EQ(intResult.GetDimension(0).LowerBound(), 1);
30+
EXPECT_EQ(intResult.GetDimension(0).Extent(), 2);
31+
EXPECT_EQ(intResult.GetDimension(1).LowerBound(), 1);
32+
EXPECT_EQ(intResult.GetDimension(1).Extent(), 3);
33+
int expected[6] = {1, 2, 3, 4, 5, 6};
34+
EXPECT_EQ(
35+
std::memcmp(intResult.OffsetElement<int>(0), expected, 6 * sizeof(int)),
36+
0);
37+
intResult.Destroy();
38+
39+
// discontiguous -> contiguous rank-1 copy in
40+
intArray = MakeArray<TypeCategory::Integer, 1>(std::vector<int>{8},
41+
std::vector<int>{1, 2, 3, 4, 5, 6, 7, 8}, sizeof(int));
42+
StaticDescriptor<1> staticIntResultStrided;
43+
Descriptor &intResultStrided{staticIntResultStrided.descriptor()};
44+
// Treat the descriptor as a strided array of 4
45+
intArray->GetDimension(0).SetByteStride(sizeof(int) * 2);
46+
intArray->GetDimension(0).SetExtent(4);
47+
RTNAME(CopyInAssign(intResultStrided, *intArray));
48+
49+
int expectedStrided[4] = {1, 3, 5, 7};
50+
EXPECT_EQ(std::memcmp(intResultStrided.OffsetElement<int>(0), expectedStrided,
51+
4 * sizeof(int)),
52+
0);
53+
54+
intResultStrided.Destroy();
55+
}

flang-rt/unittests/Runtime/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ add_flangrt_unittest(RuntimeTests
1010
AccessTest.cpp
1111
Allocatable.cpp
1212
ArrayConstructor.cpp
13+
Assign.cpp
1314
BufferTest.cpp
1415
CharacterTest.cpp
1516
CommandTest.cpp

0 commit comments

Comments
 (0)