
Commit ea4f773

[NFC][SYCL][Reduction] Call reduSaveFinalResultToUserMem once we know it's needed (#7276)
Previously we had to use a boolean return value so that it could be called from handler.hpp. Now private access to the sycl::handler is limited to a few helpers, and we can call it immediately once we realise the action is necessary.
1 parent 96dab08 commit ea4f773
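
For readers outside the codebase, here is a minimal standalone sketch of the refactoring pattern described above, using hypothetical names (Handler, buildKernelOld/buildKernelNew, saveResultToUserMem) rather than the actual DPC++ types: instead of returning a flag and leaving the follow-up action to the caller, the helper performs the action itself as soon as it knows it is needed.

#include <iostream>

// Hypothetical stand-ins for the command-group handler and the copy-back step.
struct Handler {};
void saveResultToUserMem(Handler &) { std::cout << "copy-back submitted\n"; }

// Before: the helper reports whether a copy-back is needed; the caller acts on it.
bool buildKernelOld(Handler & /*cgh*/, bool needsCopyBack) {
  // ... build the main reduction kernel here ...
  return needsCopyBack;
}

// After: the helper schedules the copy-back itself once it knows it is needed.
void buildKernelNew(Handler &cgh, bool needsCopyBack) {
  // ... build the main reduction kernel here ...
  if (needsCopyBack)
    saveResultToUserMem(cgh); // no boolean plumbed back to the caller
}

int main() {
  Handler cgh;
  if (buildKernelOld(cgh, true)) // old call site: the caller interprets the flag
    saveResultToUserMem(cgh);
  buildKernelNew(cgh, true);     // new call site: nothing left to decide
}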


sycl/include/sycl/reduction.hpp

Lines changed: 81 additions & 88 deletions
@@ -793,6 +793,43 @@ template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func) {
 }
 } // namespace reduction
 
+// This method is used for implementation of parallel_for accepting 1 reduction.
+// TODO: remove this method when everything is switched to general algorithm
+// implementing arbitrary number of reductions in parallel_for().
+/// Copies the final reduction result kept in read-write accessor to user's
+/// accessor. This method is not called for user's read-write accessors
+/// requiring update-write to it.
+template <typename KernelName, class Reduction>
+std::enable_if_t<!Reduction::is_usm>
+reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
+  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
+  associateWithHandler(CGH, &Redu.getUserRedVar(), access::target::device);
+  CGH.copy(InAcc, Redu.getUserRedVar());
+}
+
+// This method is used for implementation of parallel_for accepting 1 reduction.
+// TODO: remove this method when everything is switched to general algorithm
+// implementing arbitrary number of reductions in parallel_for().
+/// Copies the final reduction result kept in read-write accessor to user's
+/// USM memory.
+template <typename KernelName, class Reduction>
+std::enable_if_t<Reduction::is_usm>
+reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
+  size_t NElements = Reduction::num_elements;
+  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
+  auto UserVarPtr = Redu.getUserRedVar();
+  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
+  auto BOp = Redu.getBinaryOperation();
+  CGH.single_task<KernelName>([=] {
+    for (int i = 0; i < NElements; ++i) {
+      if (IsUpdateOfUserVar)
+        UserVarPtr[i] = BOp(UserVarPtr[i], InAcc.get_pointer()[i]);
+      else
+        UserVarPtr[i] = InAcc.get_pointer()[i];
+    }
+  });
+}
+
 /// A helper to pass undefined (sycl::detail::auto_name) names unmodified. We
 /// must do that to avoid name collisions.
 template <template <typename...> class Namer, class KernelName, class... Ts>
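
The two overloads added in this hunk are selected at compile time via std::enable_if_t on Reduction::is_usm. Below is a self-contained sketch of that dispatch technique only, with hypothetical BufferReduction/UsmReduction types standing in for the real reduction descriptors:

#include <iostream>
#include <type_traits>

// Hypothetical reduction descriptors; only the static trait matters here.
struct BufferReduction { static constexpr bool is_usm = false; };
struct UsmReduction    { static constexpr bool is_usm = true;  };

// Exactly one overload participates in overload resolution for a given R:
// enable_if_t<false> has no member type, so that candidate is discarded (SFINAE).
template <class R>
std::enable_if_t<!R::is_usm> saveFinalResult() {
  std::cout << "copy partial-result accessor into the user's accessor\n";
}

template <class R>
std::enable_if_t<R::is_usm> saveFinalResult() {
  std::cout << "launch a small kernel that writes into the user's USM pointer\n";
}

int main() {
  saveFinalResult<BufferReduction>();
  saveFinalResult<UsmReduction>();
}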
@@ -834,7 +871,7 @@ template <class KernelName> struct RangeFastAtomics;
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims,
           typename PropertiesT, class Reduction>
-bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
+void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
                                    const range<Dims> &Range,
                                    const nd_range<1> &NDRange,
                                    PropertiesT Properties, Reduction &Redu) {
@@ -871,7 +908,11 @@ bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
       Reducer.template atomic_combine(&Out[0]);
     }
   });
-  return Reduction::is_usm || Redu.initializeToIdentity();
+
+  if (Reduction::is_usm || Redu.initializeToIdentity())
+    reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
+      reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
+    });
 }
 
 namespace reduction {
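
The hunk above now schedules the copy-back through reduction::withAuxHandler instead of returning a flag. The sketch below shows the general shape of such a callback-based helper; MiniHandler, MiniQueue and withAuxHelper are hypothetical stand-ins, not the actual SYCL implementation:

#include <iostream>
#include <string>
#include <vector>

// Hypothetical command-group handler and queue, standing in for the SYCL ones.
struct MiniHandler { std::string task; };
struct MiniQueue { std::vector<std::string> submitted; };

// Shape of an "auxiliary handler" helper: open a fresh command group, let the
// callback record work into it, then submit it after the current one.
template <class Func> void withAuxHelper(MiniQueue &q, Func &&func) {
  MiniHandler aux;
  func(aux);                       // callback fills the auxiliary command group
  q.submitted.push_back(aux.task); // submitted as a follow-up command group
}

int main() {
  MiniQueue q;
  q.submitted.push_back("main reduction kernel");
  bool needsCopyBack = true; // e.g. USM result or initialize_to_identity
  if (needsCopyBack)
    withAuxHelper(q, [&](MiniHandler &copyHandler) {
      copyHandler.task = "copy final result to user memory";
    });
  for (const auto &t : q.submitted)
    std::cout << t << '\n';
}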
@@ -881,7 +922,7 @@ template <class KernelName, class NWorkGroupsFinished> struct RangeFastReduce;
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims,
           typename PropertiesT, class Reduction>
-bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
+void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
                                   const range<Dims> &Range,
                                   const nd_range<1> &NDRange,
                                   PropertiesT Properties, Reduction &Redu) {
@@ -972,9 +1013,6 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
       Rest(Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH));
     else
       Rest(Redu.getGroupsCounterAccDiscrete(CGH));
-
-  // We've updated user's variable, no extra work needed.
-  return false;
 }
 
 namespace reduction {
@@ -984,7 +1022,7 @@ template <class KernelName> struct RangeBasic;
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims,
           typename PropertiesT, class Reduction>
-bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
+void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
                              const range<Dims> &Range,
                              const nd_range<1> &NDRange, PropertiesT Properties,
                              Reduction &Redu) {
@@ -1088,14 +1126,18 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
       }
     }
   });
-  return Reduction::is_usm;
+
+  if (Reduction::is_usm)
+    reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
+      reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
+    });
 }
 
 /// Returns "true" if the result has to be saved to user's variable by
 /// reduSaveFinalResultToUserMem.
 template <typename KernelName, typename KernelType, int Dims,
           typename PropertiesT, class Reduction>
-bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
+void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
                         const range<Dims> &Range, size_t MaxWGSize,
                         uint32_t NumConcurrentWorkGroups,
                         PropertiesT Properties, Reduction &Redu) {
@@ -1110,14 +1152,14 @@ bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
   nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
 
   if constexpr (Reduction::has_fast_reduce)
-    return reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range,
-                                                    NDRange, Properties, Redu);
+    reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange,
+                                             Properties, Redu);
   else if constexpr (Reduction::has_fast_atomics)
-    return reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range,
-                                                     NDRange, Properties, Redu);
+    reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange,
+                                              Properties, Redu);
   else
-    return reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                               Properties, Redu);
+    reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
+                                        Properties, Redu);
 }
 
 namespace reduction {
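
In the hunk above, reduCGFuncForRange now just forwards to one of three strategies through if constexpr; only the selected branch is instantiated for a given Reduction type. A minimal sketch of this compile-time dispatch, with hypothetical trait-bearing types:

#include <iostream>

// Hypothetical reduction descriptors exposing the same kind of traits.
struct FastReduceRed {
  static constexpr bool has_fast_reduce = true, has_fast_atomics = true;
};
struct FastAtomicsRed {
  static constexpr bool has_fast_reduce = false, has_fast_atomics = true;
};
struct BasicRed {
  static constexpr bool has_fast_reduce = false, has_fast_atomics = false;
};

template <class Reduction> void buildRangeKernel() {
  // Only the chosen branch is instantiated; the others are discarded statements.
  if constexpr (Reduction::has_fast_reduce)
    std::cout << "group-reduce based strategy\n";
  else if constexpr (Reduction::has_fast_atomics)
    std::cout << "atomic-combine based strategy\n";
  else
    std::cout << "generic tree-reduction strategy\n";
}

int main() {
  buildRangeKernel<FastReduceRed>();
  buildRangeKernel<FastAtomicsRed>();
  buildRangeKernel<BasicRed>();
}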
@@ -1158,6 +1200,12 @@ void reduCGFuncForNDRangeBothFastReduceAndAtomics(handler &CGH,
     if (NDIt.get_local_linear_id() == 0)
       Reducer.atomic_combine(&Out[0]);
   });
+
+  if (Reduction::is_usm || Redu.initializeToIdentity()) {
+    reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
+      reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
+    });
+  }
 }
 
 namespace reduction {
@@ -1242,6 +1290,12 @@ void reduCGFuncForNDRangeFastAtomicsOnly(handler &CGH, KernelType KernelFunc,
       Reducer.atomic_combine(&Out[0]);
     }
   });
+
+  if (Reduction::is_usm || Redu.initializeToIdentity()) {
+    reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
+      reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
+    });
+  }
 }
 
 namespace reduction {
@@ -1544,43 +1598,6 @@ size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
   return NWorkGroups;
 }
 
-// This method is used for implementation of parallel_for accepting 1 reduction.
-// TODO: remove this method when everything is switched to general algorithm
-// implementing arbitrary number of reductions in parallel_for().
-/// Copies the final reduction result kept in read-write accessor to user's
-/// accessor. This method is not called for user's read-write accessors
-/// requiring update-write to it.
-template <typename KernelName, class Reduction>
-std::enable_if_t<!Reduction::is_usm>
-reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
-  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
-  associateWithHandler(CGH, &Redu.getUserRedVar(), access::target::device);
-  CGH.copy(InAcc, Redu.getUserRedVar());
-}
-
-// This method is used for implementation of parallel_for accepting 1 reduction.
-// TODO: remove this method when everything is switched to general algorithm
-// implementing arbitrary number of reductions in parallel_for().
-/// Copies the final reduction result kept in read-write accessor to user's
-/// USM memory.
-template <typename KernelName, class Reduction>
-std::enable_if_t<Reduction::is_usm>
-reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
-  size_t NElements = Reduction::num_elements;
-  auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
-  auto UserVarPtr = Redu.getUserRedVar();
-  bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
-  auto BOp = Redu.getBinaryOperation();
-  CGH.single_task<KernelName>([=] {
-    for (int i = 0; i < NElements; ++i) {
-      if (IsUpdateOfUserVar)
-        UserVarPtr[i] = BOp(UserVarPtr[i], InAcc.get_pointer()[i]);
-      else
-        UserVarPtr[i] = InAcc.get_pointer()[i];
-    }
-  });
-}
-
 /// For the given 'Reductions' types pack and indices enumerating them this
 /// function either creates new temporary accessors for partial sums (if IsOneWG
 /// is false) or returns user's accessor/USM-pointer if (IsOneWG is true).
@@ -2220,13 +2237,8 @@ void reduction_parallel_for(handler &CGH,
   // queue/device, while it is safer to use queries to the kernel pre-compiled
   // for the device.
   size_t PrefWGSize = reduGetPreferredWGSize(Queue, OneElemSize);
-  if (reduCGFuncForRange<KernelName>(CGH, KernelFunc, Range, PrefWGSize,
-                                     NumConcurrentWorkGroups, Properties,
-                                     Redu)) {
-    reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
-      reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
-    });
-  }
+  reduCGFuncForRange<KernelName>(CGH, KernelFunc, Range, PrefWGSize,
+                                 NumConcurrentWorkGroups, Properties, Redu);
 }
 
 template <typename KernelName, int Dims, typename PropertiesT,
@@ -2317,43 +2329,24 @@ void reduction_parallel_for(handler &CGH,
                             nd_range<Dims> Range, PropertiesT Properties,
                             Reduction Redu, KernelType KernelFunc) {
   if constexpr (Reduction::has_float64_atomics) {
-    device D = detail::getDeviceFromHandler(CGH);
-
-    if (D.has(aspect::atomic64)) {
-      reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
+    if (detail::getDeviceFromHandler(CGH).has(aspect::atomic64))
+      return reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
           CGH, KernelFunc, Range, Properties, Redu);
-    } else {
-      reduction_parallel_for_basic_impl<KernelName>(
-          CGH, Queue, Range, Properties, Redu, KernelFunc);
-      return;
-    }
+
+    return reduction_parallel_for_basic_impl<KernelName>(
+        CGH, Queue, Range, Properties, Redu, KernelFunc);
   } else if constexpr (Reduction::has_fast_atomics) {
     if constexpr (Reduction::has_fast_reduce) {
-      reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName, KernelType>(
+      return reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName,
+                                                          KernelType>(
          CGH, KernelFunc, Range, Properties, Redu);
     } else {
-      reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
+      return reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
          CGH, KernelFunc, Range, Properties, Redu);
     }
   } else {
-    reduction_parallel_for_basic_impl<KernelName>(CGH, Queue, Range, Properties,
-                                                  Redu, KernelFunc);
-    return;
-  }
-
-  // If the reduction variable must be initialized with the identity value
-  // before the kernel run, then an additional working accessor is created,
-  // initialized with the identity value and used in the kernel. That
-  // working accessor is then copied to user's accessor or USM pointer after
-  // the kernel run.
-  // For USM pointers without initialize_to_identity properties the same
-  // scheme with working accessor is used as re-using user's USM pointer in
-  // the kernel would require creation of another variant of user's kernel,
-  // which does not seem efficient.
-  if (Reduction::is_usm || Redu.initializeToIdentity()) {
-    reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
-      reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
-    });
+    return reduction_parallel_for_basic_impl<KernelName>(
+        CGH, Queue, Range, Properties, Redu, KernelFunc);
   }
 }
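
One small C++ detail visible in this last hunk: in a function that returns void, `return f();` is well-formed when f() itself also returns void, which is what lets each branch call its strategy and exit in a single statement. A tiny illustration with hypothetical strategy functions:

#include <iostream>

void strategyA() { std::cout << "strategy A\n"; }
void strategyB() { std::cout << "strategy B\n"; }

// Returning a void expression from a void function is legal ([stmt.return]),
// so each branch can invoke its helper and leave the function in one statement.
void dispatch(bool useA) {
  if (useA)
    return strategyA();
  return strategyB();
}

int main() {
  dispatch(true);
  dispatch(false);
}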
