Commit c22a5d3
[SYCL][Reduction] Optimize reduCGFuncForRangeFastAtomics for discrete GPU (#6489)
The main idea is that host->GPU memory transfers have a significant cost on discrete GPUs, unlike integrated GPUs with shared memory. First, make sure sycl::no_init is used for the partial sums buffer (a free change). Second, initialize the group counter buffer with an extra kernel, as that is cheaper than transferring the memory host->device. This latter change is guarded by a runtime check of whether the device has host_unified_memory. By itself, the change would have caused incompatibilities in PartialSums's accessor type. To deal with this, the optimization for a single work-group was moved from host to device code. That also made it possible to always write the user's final variable inside the main kernel, avoiding the reduSaveFinalResultToUserMem call for this scenario. Accordingly, the reduCGFuncForRange* helpers now return a boolean indicating whether such post-processing is needed, since that is now a property of the particular implementation.

The second part changes how the number of work-groups is selected. Before this change one work-group was created for each physical EU thread, but that assumption does not seem to be universally true. In the discrete case, create one work-group per EU (not per thread) instead.
1 parent 1e89466 commit c22a5d3
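For illustration, here is a minimal standalone sketch (illustrative names, not the in-tree code) of the two transfer savings described above: declaring an accessor with sycl::no_init so the runtime skips copying the buffer's initial contents to the device, and zeroing a one-element group counter with a tiny device kernel instead of a host-side write.

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;

  // Partial sums: the old contents are never needed, so sycl::no_init lets the
  // runtime skip the host->device transfer before the kernel runs.
  sycl::buffer<int, 1> PartialSums{sycl::range<1>(64)};

  // Group counter: zero it with a tiny kernel on the device instead of copying
  // a zero-initialized buffer from the host, which is the slow path on
  // discrete GPUs.
  sycl::buffer<int, 1> Counter{sycl::range<1>(1)};
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor CounterAcc{Counter, CGH, sycl::write_only, sycl::no_init};
    CGH.single_task([=] { CounterAcc[0] = 0; });
  });

  // The main kernel then sees an initialized counter and uninitialized
  // partial sums; no host->device copy was required for either buffer.
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor Sums{PartialSums, CGH, sycl::write_only, sycl::no_init};
    sycl::accessor CounterAcc{Counter, CGH, sycl::read_write};
    CGH.parallel_for(sycl::range<1>(64), [=](sycl::id<1> I) {
      Sums[I] = static_cast<int>(I[0]);
      if (I[0] == 0)
        CounterAcc[0] += 1; // placeholder work touching the counter
    });
  });
  Q.wait();
}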

File tree

3 files changed: +91 −52 lines

sycl/include/sycl/ext/oneapi/reduction.hpp

Lines changed: 85 additions & 42 deletions
@@ -649,6 +649,13 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
     }
   }
 
+  template <class _T = T, int D = buffer_dim>
+  auto &getTempBuffer(size_t Size, handler &CGH) {
+    auto Buffer = std::make_shared<buffer<_T, D>>(range<1>(Size));
+    CGH.addReduction(Buffer);
+    return *Buffer;
+  }
+
   /// Returns an accessor accessing the memory that will hold the reduction
   /// partial sums.
   /// If \p Size is equal to one, then the reduction result is the final and
@@ -708,15 +715,28 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
     return {*CounterBuf, CGH};
   }
 
-  RedOutVar &getUserRedVar() { return MRedOut; }
-
-  static inline result_type *getOutPointer(const rw_accessor_type &OutAcc) {
-    return OutAcc.get_pointer().get();
+  // On discrete (vs. integrated) GPUs it's faster to initialize memory with an
+  // extra kernel than copy it from the host.
+  template <typename Name> auto getGroupsCounterAccDiscrete(handler &CGH) {
+    auto &Buf = getTempBuffer<int, 1>(1, CGH);
+    std::shared_ptr<detail::queue_impl> QueueCopy = CGH.MQueue;
+    auto Event = CGH.withAuxHandler(QueueCopy, [&](handler &InitHandler) {
+      auto Acc = accessor{Buf, InitHandler, sycl::write_only, sycl::no_init};
+      InitHandler.single_task<Name>([=]() { Acc[0] = 0; });
+    });
+    CGH.depends_on(Event);
+    return accessor{Buf, CGH};
   }
 
+  RedOutVar &getUserRedVar() { return MRedOut; }
+
   static inline result_type *getOutPointer(result_type *OutPtr) {
     return OutPtr;
   }
+  template <class AccessorType>
+  static inline result_type *getOutPointer(const AccessorType &OutAcc) {
+    return OutAcc.get_pointer().get();
+  }
 
 private:
   template <typename BufferT>
@@ -892,7 +912,7 @@ template <class KernelName> struct RangeFastAtomics;
 } // namespace main_krn
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
                                    const range<Dims> &Range,
                                    const nd_range<1> &NDRange,
                                    Reduction &Redu) {
@@ -927,29 +947,43 @@ void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
       Reducer.template atomic_combine(Reduction::getOutPointer(Out));
     }
   });
+  return Reduction::is_usm || Redu.initializeToIdentity();
 }
 
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct RangeFastReduce;
 } // namespace main_krn
+namespace init_krn {
+template <class KernelName> struct GroupCounter;
+}
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
                                   const range<Dims> &Range,
                                   const nd_range<1> &NDRange, Reduction &Redu) {
   constexpr size_t NElements = Reduction::num_elements;
   size_t WGSize = NDRange.get_local_range().size();
   size_t NWorkGroups = NDRange.get_group_range().size();
 
+  auto &Out = Redu.getUserRedVar();
+  if constexpr (Reduction::is_acc)
+    associateWithHandler(CGH, &Out, access::target::device);
+
+  auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
+  accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);
+
   bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
-  auto PartialSums =
-      Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
-  auto Out = (NWorkGroups == 1)
-                 ? PartialSums
-                 : Redu.getWriteAccForPartialReds(NElements, CGH);
+  using InitName =
+      __sycl_reduction_kernel<reduction::init_krn::GroupCounter, KernelName>;
+
+  // Integrated/discrete GPUs have different faster path.
   auto NWorkGroupsFinished =
-      Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
+      sycl::detail::getDeviceFromHandler(CGH)
+              .get_info<info::device::host_unified_memory>()
+          ? Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH)
+          : Redu.template getGroupsCounterAccDiscrete<InitName>(CGH);
+
   auto DoReducePartialSumsInLastWG =
       Reduction::template getReadWriteLocalAcc<int>(1, CGH);
 
@@ -967,50 +1001,57 @@ void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
       // reduce_over_group is only defined for each T, not for span<T, ...>
       size_t LID = NDId.get_local_id(0);
       for (int E = 0; E < NElements; ++E) {
-        Reducer.getElement(E) =
-            reduce_over_group(Group, Reducer.getElement(E), BOp);
-
+        auto &RedElem = Reducer.getElement(E);
+        RedElem = reduce_over_group(Group, RedElem, BOp);
         if (LID == 0) {
-          if (NWorkGroups == 1 && IsUpdateOfUserVar)
-            Reducer.getElement(E) =
-                BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
-
-          // if NWorkGroups == 1, then PartialsSum and Out point to same memory.
-          Reduction::getOutPointer(
-              PartialSums)[NDId.get_group_linear_id() * NElements + E] =
-              Reducer.getElement(E);
+          if (NWorkGroups == 1) {
+            auto &OutElem = Reduction::getOutPointer(Out)[E];
+            // Can avoid using partial sum and write the final result immediately.
+            if (IsUpdateOfUserVar)
+              RedElem = BOp(RedElem, OutElem);
+            OutElem = RedElem;
+          } else {
+            PartialSums[NDId.get_group_linear_id() * NElements + E] =
+                Reducer.getElement(E);
+          }
         }
       }
 
+      if (NWorkGroups == 1)
+        // We're done.
+        return;
+
       // Signal this work-group has finished after all values are reduced
       if (LID == 0) {
         auto NFinished =
             sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
                              access::address_space::global_space>(
                 NWorkGroupsFinished[0]);
-        DoReducePartialSumsInLastWG[0] =
-            ++NFinished == NWorkGroups && NWorkGroups > 1;
+        DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
       }
 
       sycl::detail::workGroupBarrier();
       if (DoReducePartialSumsInLastWG[0]) {
         // Reduce each result separately
-        // TODO: Opportunity to parallelize across elements
+        // TODO: Opportunity to parallelize across elements.
         for (int E = 0; E < NElements; ++E) {
+          auto &OutElem = Reduction::getOutPointer(Out)[E];
           auto LocalSum = Reducer.getIdentity();
           for (size_t I = LID; I < NWorkGroups; I += WGSize)
            LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
-          Reducer.getElement(E) = reduce_over_group(Group, LocalSum, BOp);
+          auto Result = reduce_over_group(Group, LocalSum, BOp);
 
           if (LID == 0) {
             if (IsUpdateOfUserVar)
-              Reducer.getElement(E) =
-                  BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
-            Reduction::getOutPointer(Out)[E] = Reducer.getElement(E);
+              Result = BOp(Result, OutElem);
+            OutElem = Result;
           }
         }
       }
     });
+
+  // We've updated user's variable, no extra work needed.
+  return false;
 }
 
 namespace reduction {
@@ -1019,7 +1060,7 @@ template <class KernelName> struct RangeBasic;
10191060
} // namespace main_krn
10201061
} // namespace reduction
10211062
template <typename KernelName, typename KernelType, int Dims, class Reduction>
1022-
void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
1063+
bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
10231064
const range<Dims> &Range,
10241065
const nd_range<1> &NDRange, Reduction &Redu) {
10251066
constexpr size_t NElements = Reduction::num_elements;
@@ -1125,10 +1166,13 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
11251166
}
11261167
}
11271168
});
1169+
return Reduction::is_usm || Reduction::is_dw_acc;
11281170
}
11291171

1172+
/// Returns "true" if the result has to be saved to user's variable by
1173+
/// reduSaveFinalResultToUserMem.
11301174
template <typename KernelName, typename KernelType, int Dims, class Reduction>
1131-
void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
1175+
bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
11321176
const range<Dims> &Range, size_t MaxWGSize,
11331177
uint32_t NumConcurrentWorkGroups, Reduction &Redu) {
11341178
size_t NWorkItems = Range.size();
@@ -1141,16 +1185,15 @@ void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
11411185
size_t NDRItems = NWorkGroups * WGSize;
11421186
nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
11431187

1144-
if constexpr (Reduction::has_fast_atomics) {
1145-
reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange,
1146-
Redu);
1147-
1148-
} else if constexpr (Reduction::has_fast_reduce) {
1149-
reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange,
1150-
Redu);
1151-
} else {
1152-
reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange, Redu);
1153-
}
1188+
if constexpr (Reduction::has_fast_atomics)
1189+
return reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range,
1190+
NDRange, Redu);
1191+
else if constexpr (Reduction::has_fast_reduce)
1192+
return reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range,
1193+
NDRange, Redu);
1194+
else
1195+
return reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
1196+
Redu);
11541197
}
11551198

11561199
namespace reduction {
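To make the control flow of the fast-reduce path easier to follow outside the reduction machinery, here is a self-contained sketch (assumed names and sizes, not the in-tree kernel) of the same "last work-group reduces the partial sums" technique: each group writes its partial sum, atomically bumps a global counter that was initialized on the device, and the group that increments it last combines all partial sums and writes the final result on the device, so no host-side post-processing pass is needed.

#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>

int main() {
  constexpr size_t N = 1024, WGSize = 64, NWorkGroups = N / WGSize;
  sycl::queue Q;
  std::vector<int> Data(N, 1);
  int Result = 0;

  sycl::buffer<int, 1> DataBuf{Data.data(), sycl::range<1>(N)};
  sycl::buffer<int, 1> OutBuf{&Result, sycl::range<1>(1)};
  sycl::buffer<int, 1> PartialSumsBuf{sycl::range<1>(NWorkGroups)};
  sycl::buffer<int, 1> CounterBuf{sycl::range<1>(1)};

  // Initialize the group counter on the device (cheap even on discrete GPUs).
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor C{CounterBuf, CGH, sycl::write_only, sycl::no_init};
    CGH.single_task([=] { C[0] = 0; });
  });

  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor In{DataBuf, CGH, sycl::read_only};
    sycl::accessor Out{OutBuf, CGH, sycl::read_write};
    sycl::accessor Partial{PartialSumsBuf, CGH, sycl::read_write, sycl::no_init};
    sycl::accessor Counter{CounterBuf, CGH, sycl::read_write};
    sycl::local_accessor<int, 1> IsLastWG{sycl::range<1>(1), CGH};

    CGH.parallel_for(sycl::nd_range<1>{N, WGSize}, [=](sycl::nd_item<1> It) {
      auto Group = It.get_group();
      size_t LID = It.get_local_id(0);

      // Per-group partial reduction.
      int Sum = sycl::reduce_over_group(Group, In[It.get_global_id(0)],
                                        sycl::plus<int>{});
      if (LID == 0) {
        Partial[It.get_group_linear_id()] = Sum;
        sycl::atomic_ref<int, sycl::memory_order::acq_rel,
                         sycl::memory_scope::device,
                         sycl::access::address_space::global_space>
            NFinished(Counter[0]);
        // The group that finishes last reduces everyone's partial sums.
        IsLastWG[0] = (NFinished.fetch_add(1) + 1 == NWorkGroups);
      }
      sycl::group_barrier(Group, sycl::memory_scope::device);

      if (IsLastWG[0]) {
        int Local = 0;
        for (size_t I = LID; I < NWorkGroups; I += WGSize)
          Local += Partial[I];
        int Total = sycl::reduce_over_group(Group, Local, sycl::plus<int>{});
        if (LID == 0)
          Out[0] = Total; // final result written on the device
      }
    });
  });

  sycl::host_accessor Res{OutBuf, sycl::read_only};
  std::cout << "sum = " << Res[0] << "\n"; // expected: 1024
}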

sycl/include/sycl/handler.hpp

Lines changed: 4 additions & 6 deletions
@@ -251,7 +251,7 @@ using sycl::detail::queue_impl;
 /// If we are given sycl::range and not sycl::nd_range we have more freedom in
 /// how to split the iteration space.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
                         const range<Dims> &Range, size_t MaxWGSize,
                         uint32_t NumConcurrentWorkGroups, Reduction &Redu);
 
@@ -1649,11 +1649,9 @@ class __SYCL_EXPORT handler {
     // for the device.
     size_t MaxWGSize =
         ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
-    ext::oneapi::detail::reduCGFuncForRange<KernelName>(
-        *this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
-    if (Reduction::is_usm ||
-        (Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
-        (!Reduction::has_fast_atomics && Reduction::is_dw_acc)) {
+    if (ext::oneapi::detail::reduCGFuncForRange<KernelName>(
+            *this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups,
+            Redu)) {
       this->finalize();
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
         ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(

sycl/source/detail/reduction.cpp

Lines changed: 2 additions & 4 deletions
@@ -56,10 +56,8 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
   device Dev = Queue->get_device();
   uint32_t NumThreads = Dev.get_info<info::device::max_compute_units>();
   // TODO: The heuristics here require additional tuning for various devices
-  // and vendors. For now this code assumes that execution units have about
-  // 8 working threads, which gives good results on some known/supported
-  // GPU devices.
-  if (Dev.is_gpu())
+  // and vendors. Also, it would be better to check vendor/generation/etc.
+  if (Dev.is_gpu() && Dev.get_info<info::device::host_unified_memory>())
     NumThreads *= 8;
   return NumThreads;
 }
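The effect of this heuristic change can be summarized in a small standalone helper (a sketch with an assumed name, mirroring the modified reduGetMaxNumConcurrentWorkGroups logic): integrated GPUs, detected via host_unified_memory, still get roughly one work-group per EU thread (max_compute_units * 8), while discrete GPUs get one work-group per EU.

#include <sycl/sycl.hpp>
#include <cstdint>
#include <iostream>

// Sketch of the heuristic after this change (assumed function name).
uint32_t maxConcurrentWorkGroups(const sycl::device &Dev) {
  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
  // Integrated GPUs (host-unified memory): assume ~8 threads per EU, i.e. one
  // work-group per EU thread. Discrete GPUs: one work-group per EU.
  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
    NumThreads *= 8;
  return NumThreads;
}

int main() {
  sycl::queue Q;
  std::cout << maxConcurrentWorkGroups(Q.get_device()) << "\n";
}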
