
Commit 9670beb
[NFC][SYCL] Range reduction refactoring (#7180)

To make it look like it delegates to the NDRange version. I expect some future change to completely unify the two paths, but this looks like a self-contained, atomic improvement.
1 parent 35a1ce9 commit 9670beb

File tree

1 file changed (+59, −76)


sycl/include/sycl/reduction.hpp

Lines changed: 59 additions & 76 deletions
@@ -837,55 +837,25 @@ using __sycl_reduction_kernel =
     std::conditional_t<std::is_same<KernelName, auto_name>::value, auto_name,
                        Namer<KernelName, Ts...>>;
 
-/// Called in device code. This function iterates through the index space
-/// by assigning contiguous chunks to each work-group, then iterating
-/// through each chunk using a stride equal to the work-group's local range,
-/// which gives much better performance than using stride equal to 1.
-/// For each of the index the given \p F function/functor is called and
-/// the reduction value hold in \p Reducer is accumulated in those calls.
-template <typename KernelFunc, int Dims, typename ReducerT>
-void reductionLoop(const range<Dims> &Range, const size_t PerGroup,
-                   ReducerT &Reducer, const nd_item<1> &NdId, KernelFunc &F) {
-  // Divide into contiguous chunks and assign each chunk to a Group
-  // Rely on precomputed division to avoid repeating expensive operations
-  // TODO: Some devices may prefer alternative remainder handling
-  auto Group = NdId.get_group();
-  size_t GroupId = Group.get_group_linear_id();
-  size_t NumGroups = Group.get_group_linear_range();
-  bool LastGroup = (GroupId == NumGroups - 1);
-  size_t GroupStart = GroupId * PerGroup;
-  size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);
-
-  // Loop over the contiguous chunk
-  size_t Start = GroupStart + NdId.get_local_id(0);
-  size_t End = GroupEnd;
-  size_t Stride = NdId.get_local_range(0);
-  for (size_t I = Start; I < End; I += Stride)
-    F(getDelinearizedId(Range, I), Reducer);
-}
-
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct RangeFastAtomics;
 } // namespace main_krn
 } // namespace reduction
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
+template <typename KernelName, typename KernelType, typename PropertiesT,
+          class Reduction>
 void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
-                                   const range<Dims> &Range,
                                    const nd_range<1> &NDRange,
                                    PropertiesT Properties, Reduction &Redu) {
   size_t NElements = Reduction::num_elements;
   auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
   local_accessor<typename Reduction::result_type, 1> GroupSum{NElements, CGH};
   using Name = __sycl_reduction_kernel<reduction::main_krn::RangeFastAtomics,
                                        KernelName>;
-  size_t NWorkGroups = NDRange.get_group_range().size();
-  size_t PerGroup = Range.size() / NWorkGroups;
   CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
     // Call user's functions. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
-    reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
+    KernelFunc(NDId, Reducer);
 
     // Work-group cooperates to initialize multiple reduction variables
     auto LID = NDId.get_local_id(0);
@@ -920,10 +890,9 @@ namespace main_krn {
 template <class KernelName, class NWorkGroupsFinished> struct RangeFastReduce;
 } // namespace main_krn
 } // namespace reduction
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
+template <typename KernelName, typename KernelType, typename PropertiesT,
+          class Reduction>
 void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
-                                  const range<Dims> &Range,
                                   const nd_range<1> &NDRange,
                                   PropertiesT Properties, Reduction &Redu) {
   size_t NElements = Reduction::num_elements;
@@ -941,13 +910,13 @@ void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
   auto Rest = [&](auto NWorkGroupsFinished) {
     local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};
 
-    using Name = __sycl_reduction_kernel<reduction::main_krn::RangeFastReduce,
-                                         KernelName, decltype(NWorkGroupsFinished)>;
-    size_t PerGroup = Range.size() / NWorkGroups;
+    using Name =
+        __sycl_reduction_kernel<reduction::main_krn::RangeFastReduce,
+                                KernelName, decltype(NWorkGroupsFinished)>;
     CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
       // Call user's functions. Reducer.MValue gets initialized there.
       typename Reduction::reducer_type Reducer;
-      reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
+      KernelFunc(NDId, Reducer);
 
       typename Reduction::binary_operation BOp;
       auto Group = NDId.get_group();
@@ -1020,10 +989,9 @@ namespace main_krn {
 template <class KernelName> struct RangeBasic;
 } // namespace main_krn
 } // namespace reduction
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
+template <typename KernelName, typename KernelType, typename PropertiesT,
+          class Reduction>
 void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
-                             const range<Dims> &Range,
                              const nd_range<1> &NDRange, PropertiesT Properties,
                              Reduction &Redu) {
   size_t NElements = Reduction::num_elements;
@@ -1045,11 +1013,10 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
   auto BOp = Redu.getBinaryOperation();
   using Name =
       __sycl_reduction_kernel<reduction::main_krn::RangeBasic, KernelName>;
-  size_t PerGroup = Range.size() / NWorkGroups;
   CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
     // Call user's functions. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer(Identity, BOp);
-    reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
+    KernelFunc(NDId, Reducer);
 
     // If there are multiple values, reduce each separately
     // This prevents local memory from scaling with elements
@@ -1133,35 +1100,6 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
   });
 }
 
-/// Returns "true" if the result has to be saved to user's variable by
-/// reduSaveFinalResultToUserMem.
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
-void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
-                        const range<Dims> &Range, size_t MaxWGSize,
-                        uint32_t NumConcurrentWorkGroups,
-                        PropertiesT Properties, Reduction &Redu) {
-  size_t NWorkItems = Range.size();
-  size_t WGSize = std::min(NWorkItems, MaxWGSize);
-  size_t NWorkGroups = NWorkItems / WGSize;
-  if (NWorkItems % WGSize)
-    NWorkGroups++;
-  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
-  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
-  size_t NDRItems = NWorkGroups * WGSize;
-  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
-
-  if constexpr (Reduction::has_fast_reduce)
-    reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                             Properties, Redu);
-  else if constexpr (Reduction::has_fast_atomics)
-    reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                              Properties, Redu);
-  else
-    reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                        Properties, Redu);
-}
-
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct NDRangeBothFastReduceAndAtomics;
@@ -2233,12 +2171,57 @@ void reduction_parallel_for(handler &CGH,
 #else
       reduGetMaxNumConcurrentWorkGroups(Queue);
 #endif
+
   // TODO: currently the preferred work group size is determined for the given
   // queue/device, while it is safer to use queries to the kernel pre-compiled
   // for the device.
   size_t PrefWGSize = reduGetPreferredWGSize(Queue, OneElemSize);
-  reduCGFuncForRange<KernelName>(CGH, KernelFunc, Range, PrefWGSize,
-                                 NumConcurrentWorkGroups, Properties, Redu);
+
+  size_t NWorkItems = Range.size();
+  size_t WGSize = std::min(NWorkItems, PrefWGSize);
+  size_t NWorkGroups = NWorkItems / WGSize;
+  if (NWorkItems % WGSize)
+    NWorkGroups++;
+  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
+  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
+  size_t NDRItems = NWorkGroups * WGSize;
+  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
+
+  size_t PerGroup = Range.size() / NWorkGroups;
+  // Iterate through the index space by assigning contiguous chunks to each
+  // work-group, then iterating through each chunk using a stride equal to the
+  // work-group's local range, which gives much better performance than using
+  // stride equal to 1. For each such index the original KernelFunc is called
+  // and the reduction value held in \p Reducer is accumulated across those
+  // calls.
+  auto UpdatedKernelFunc = [=](auto NDId, auto &Reducer) {
+    // Divide into contiguous chunks and assign each chunk to a Group
+    // Rely on precomputed division to avoid repeating expensive operations
+    // TODO: Some devices may prefer alternative remainder handling
+    auto Group = NDId.get_group();
+    size_t GroupId = Group.get_group_linear_id();
+    size_t NumGroups = Group.get_group_linear_range();
+    bool LastGroup = (GroupId == NumGroups - 1);
+    size_t GroupStart = GroupId * PerGroup;
+    size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);
+
+    // Loop over the contiguous chunk
+    size_t Start = GroupStart + NDId.get_local_id(0);
+    size_t End = GroupEnd;
+    size_t Stride = NDId.get_local_range(0);
+    for (size_t I = Start; I < End; I += Stride)
+      KernelFunc(getDelinearizedId(Range, I), Reducer);
+  };
+
+  if constexpr (Reduction::has_fast_reduce)
+    reduCGFuncForRangeFastReduce<KernelName>(CGH, UpdatedKernelFunc, NDRange,
+                                             Properties, Redu);
+  else if constexpr (Reduction::has_fast_atomics)
+    reduCGFuncForRangeFastAtomics<KernelName>(CGH, UpdatedKernelFunc, NDRange,
+                                              Properties, Redu);
+  else
+    reduCGFuncForRangeBasic<KernelName>(CGH, UpdatedKernelFunc, NDRange,
+                                        Properties, Redu);
 }
 
 template <typename KernelName, int Dims, typename PropertiesT,
