@@ -837,55 +837,25 @@ using __sycl_reduction_kernel =
     std::conditional_t<std::is_same<KernelName, auto_name>::value, auto_name,
                        Namer<KernelName, Ts...>>;
 
-/// Called in device code. This function iterates through the index space
-/// by assigning contiguous chunks to each work-group, then iterating
-/// through each chunk using a stride equal to the work-group's local range,
-/// which gives much better performance than using stride equal to 1.
-/// For each of the indices the given \p F function/functor is called and
-/// the reduction value held in \p Reducer is accumulated in those calls.
-template <typename KernelFunc, int Dims, typename ReducerT>
-void reductionLoop(const range<Dims> &Range, const size_t PerGroup,
-                   ReducerT &Reducer, const nd_item<1> &NdId, KernelFunc &F) {
-  // Divide into contiguous chunks and assign each chunk to a Group
-  // Rely on precomputed division to avoid repeating expensive operations
-  // TODO: Some devices may prefer alternative remainder handling
-  auto Group = NdId.get_group();
-  size_t GroupId = Group.get_group_linear_id();
-  size_t NumGroups = Group.get_group_linear_range();
-  bool LastGroup = (GroupId == NumGroups - 1);
-  size_t GroupStart = GroupId * PerGroup;
-  size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);
-
-  // Loop over the contiguous chunk
-  size_t Start = GroupStart + NdId.get_local_id(0);
-  size_t End = GroupEnd;
-  size_t Stride = NdId.get_local_range(0);
-  for (size_t I = Start; I < End; I += Stride)
-    F(getDelinearizedId(Range, I), Reducer);
-}
-
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct RangeFastAtomics;
 } // namespace main_krn
 } // namespace reduction
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
+template <typename KernelName, typename KernelType, typename PropertiesT,
+          class Reduction>
 void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
-                                   const range<Dims> &Range,
                                    const nd_range<1> &NDRange,
                                    PropertiesT Properties, Reduction &Redu) {
   size_t NElements = Reduction::num_elements;
   auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
   local_accessor<typename Reduction::result_type, 1> GroupSum{NElements, CGH};
   using Name = __sycl_reduction_kernel<reduction::main_krn::RangeFastAtomics,
                                        KernelName>;
-  size_t NWorkGroups = NDRange.get_group_range().size();
-  size_t PerGroup = Range.size() / NWorkGroups;
   CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
     // Call user's functions. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
-    reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
+    KernelFunc(NDId, Reducer);
 
     // Work-group cooperates to initialize multiple reduction variables
     auto LID = NDId.get_local_id(0);
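(Editorial aside, not part of the patch: the chunked iteration removed in this hunk, and re-introduced as a lambda in the last hunk below, is easy to mis-read inside a diff. Here is a minimal host-side C++ sketch of the same index distribution; the names Visited and LocalRange and all constants are illustrative only. Each group takes a contiguous chunk of PerGroup indices, the last group also absorbs the remainder, and work-items inside a group stride through their chunk by the local range.)

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  const std::size_t N = 1000, NumGroups = 7, LocalRange = 32;
  const std::size_t PerGroup = N / NumGroups; // precomputed division
  std::vector<int> Visited(N, 0);

  for (std::size_t GroupId = 0; GroupId < NumGroups; ++GroupId) {
    const bool LastGroup = (GroupId == NumGroups - 1);
    const std::size_t GroupStart = GroupId * PerGroup;
    // The last group also covers the remainder N % NumGroups.
    const std::size_t GroupEnd = LastGroup ? N : GroupStart + PerGroup;
    // Each "work-item" LocalId walks its chunk with stride == LocalRange.
    for (std::size_t LocalId = 0; LocalId < LocalRange; ++LocalId)
      for (std::size_t I = GroupStart + LocalId; I < GroupEnd; I += LocalRange)
        ++Visited[I]; // real kernel: F(getDelinearizedId(Range, I), Reducer)
  }

  // Every index is visited exactly once, so the reduction sees each element
  // exactly once regardless of how many groups are launched.
  assert(std::count(Visited.begin(), Visited.end(), 1) ==
         static_cast<std::ptrdiff_t>(N));
}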
@@ -920,10 +890,9 @@ namespace main_krn {
 template <class KernelName, class NWorkGroupsFinished> struct RangeFastReduce;
 } // namespace main_krn
 } // namespace reduction
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
+template <typename KernelName, typename KernelType, typename PropertiesT,
+          class Reduction>
 void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
-                                  const range<Dims> &Range,
                                   const nd_range<1> &NDRange,
                                   PropertiesT Properties, Reduction &Redu) {
   size_t NElements = Reduction::num_elements;
@@ -941,13 +910,13 @@ void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
   auto Rest = [&](auto NWorkGroupsFinished) {
     local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};
 
-    using Name = __sycl_reduction_kernel<reduction::main_krn::RangeFastReduce,
-                                         KernelName, decltype(NWorkGroupsFinished)>;
-    size_t PerGroup = Range.size() / NWorkGroups;
+    using Name =
+        __sycl_reduction_kernel<reduction::main_krn::RangeFastReduce,
+                                KernelName, decltype(NWorkGroupsFinished)>;
     CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
       // Call user's functions. Reducer.MValue gets initialized there.
       typename Reduction::reducer_type Reducer;
-      reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
+      KernelFunc(NDId, Reducer);
 
       typename Reduction::binary_operation BOp;
       auto Group = NDId.get_group();
@@ -1020,10 +989,9 @@ namespace main_krn {
 template <class KernelName> struct RangeBasic;
 } // namespace main_krn
 } // namespace reduction
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
+template <typename KernelName, typename KernelType, typename PropertiesT,
+          class Reduction>
 void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
-                             const range<Dims> &Range,
                              const nd_range<1> &NDRange, PropertiesT Properties,
                              Reduction &Redu) {
   size_t NElements = Reduction::num_elements;
@@ -1045,11 +1013,10 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
   auto BOp = Redu.getBinaryOperation();
   using Name =
       __sycl_reduction_kernel<reduction::main_krn::RangeBasic, KernelName>;
-  size_t PerGroup = Range.size() / NWorkGroups;
   CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
     // Call user's functions. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer(Identity, BOp);
-    reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
+    KernelFunc(NDId, Reducer);
 
     // If there are multiple values, reduce each separately
     // This prevents local memory from scaling with elements
@@ -1133,35 +1100,6 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
   });
 }
 
-/// Returns "true" if the result has to be saved to user's variable by
-/// reduSaveFinalResultToUserMem.
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
-void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
-                        const range<Dims> &Range, size_t MaxWGSize,
-                        uint32_t NumConcurrentWorkGroups,
-                        PropertiesT Properties, Reduction &Redu) {
-  size_t NWorkItems = Range.size();
-  size_t WGSize = std::min(NWorkItems, MaxWGSize);
-  size_t NWorkGroups = NWorkItems / WGSize;
-  if (NWorkItems % WGSize)
-    NWorkGroups++;
-  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
-  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
-  size_t NDRItems = NWorkGroups * WGSize;
-  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
-
-  if constexpr (Reduction::has_fast_reduce)
-    reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                             Properties, Redu);
-  else if constexpr (Reduction::has_fast_atomics)
-    reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                              Properties, Redu);
-  else
-    reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                        Properties, Redu);
-}
-
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct NDRangeBothFastReduceAndAtomics;
@@ -2233,12 +2171,57 @@ void reduction_parallel_for(handler &CGH,
 #else
       reduGetMaxNumConcurrentWorkGroups(Queue);
 #endif
+
   // TODO: currently the preferred work group size is determined for the given
   // queue/device, while it is safer to use queries to the kernel pre-compiled
   // for the device.
   size_t PrefWGSize = reduGetPreferredWGSize(Queue, OneElemSize);
-  reduCGFuncForRange<KernelName>(CGH, KernelFunc, Range, PrefWGSize,
-                                 NumConcurrentWorkGroups, Properties, Redu);
+
+  size_t NWorkItems = Range.size();
+  size_t WGSize = std::min(NWorkItems, PrefWGSize);
+  size_t NWorkGroups = NWorkItems / WGSize;
+  if (NWorkItems % WGSize)
+    NWorkGroups++;
+  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
+  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
+  size_t NDRItems = NWorkGroups * WGSize;
+  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
+
+  size_t PerGroup = Range.size() / NWorkGroups;
+  // Iterate through the index space by assigning contiguous chunks to each
+  // work-group, then iterating through each chunk using a stride equal to the
+  // work-group's local range, which gives much better performance than using
+  // stride equal to 1. For each of the indices the original KernelFunc is
+  // called and the reduction value held in \p Reducer is accumulated in those
+  // calls.
+  auto UpdatedKernelFunc = [=](auto NDId, auto &Reducer) {
+    // Divide into contiguous chunks and assign each chunk to a Group
+    // Rely on precomputed division to avoid repeating expensive operations
+    // TODO: Some devices may prefer alternative remainder handling
+    auto Group = NDId.get_group();
+    size_t GroupId = Group.get_group_linear_id();
+    size_t NumGroups = Group.get_group_linear_range();
+    bool LastGroup = (GroupId == NumGroups - 1);
+    size_t GroupStart = GroupId * PerGroup;
+    size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);
+
+    // Loop over the contiguous chunk
+    size_t Start = GroupStart + NDId.get_local_id(0);
+    size_t End = GroupEnd;
+    size_t Stride = NDId.get_local_range(0);
+    for (size_t I = Start; I < End; I += Stride)
+      KernelFunc(getDelinearizedId(Range, I), Reducer);
+  };
+
+  if constexpr (Reduction::has_fast_reduce)
+    reduCGFuncForRangeFastReduce<KernelName>(CGH, UpdatedKernelFunc, NDRange,
+                                             Properties, Redu);
+  else if constexpr (Reduction::has_fast_atomics)
+    reduCGFuncForRangeFastAtomics<KernelName>(CGH, UpdatedKernelFunc, NDRange,
+                                              Properties, Redu);
+  else
+    reduCGFuncForRangeBasic<KernelName>(CGH, UpdatedKernelFunc, NDRange,
+                                        Properties, Redu);
 }
 
 template <typename KernelName, int Dims, typename PropertiesT,
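(Editorial aside: the sizing arithmetic that used to live in the removed reduCGFuncForRange is now inlined above. The work-item count is divided into work-groups by ceiling division and then clamped to NumConcurrentWorkGroups, which is why each launched work-item covers several indices through UpdatedKernelFunc. A small standalone sketch with made-up numbers follows; PrefWGSize and MaxConcurrentGroups merely stand in for the values queried from the queue/device.)

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t NWorkItems = 100000, PrefWGSize = 256,
                    MaxConcurrentGroups = 16;
  std::size_t WGSize = std::min(NWorkItems, PrefWGSize);    // 256
  std::size_t NWorkGroups = NWorkItems / WGSize;            // 390
  if (NWorkItems % WGSize)                                  // remainder 160
    NWorkGroups++;                                          // 391 (ceiling division)
  NWorkGroups = std::min(NWorkGroups, MaxConcurrentGroups); // clamped to 16
  std::size_t NDRItems = NWorkGroups * WGSize;              // 4096 work-items launched
  std::size_t PerGroup = NWorkItems / NWorkGroups;          // 6250 indices per group
  std::printf("launch %zu items in %zu groups, %zu indices per group\n",
              NDRItems, NWorkGroups, PerGroup);
}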