@@ -1131,12 +1131,13 @@ template <class KernelName> struct NDRangeBothFastReduceAndAtomics;
1131
1131
/// Briefly: calls user's lambda, reduce() + atomic, INT +
1132
1132
/// ADD/MIN/MAX.
1133
1133
template <typename KernelName, typename KernelType, int Dims,
1134
- typename PropertiesT, class Reduction , class AccTy >
1134
+ typename PropertiesT, class Reduction >
1135
1135
void reduCGFuncForNDRangeBothFastReduceAndAtomics (handler &CGH,
1136
1136
KernelType KernelFunc,
1137
1137
const nd_range<Dims> &Range,
1138
1138
PropertiesT Properties,
1139
- Reduction &, AccTy Out) {
1139
+ Reduction &Redu) {
1140
+ auto Out = Redu.getReadWriteAccessorToInitializedMem (CGH);
1140
1141
size_t NElements = Reduction::num_elements;
1141
1142
using Name = __sycl_reduction_kernel<
1142
1143
reduction::main_krn::NDRangeBothFastReduceAndAtomics, KernelName>;
@@ -1169,14 +1170,15 @@ template <class KernelName> struct NDRangeFastAtomicsOnly;
1169
1170
///
1170
1171
/// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR.
1171
1172
template <typename KernelName, typename KernelType, int Dims,
1172
- typename PropertiesT, class Reduction , class AccTy >
1173
- void reduCGFuncForNDRangeFastAtomicsOnly (handler &CGH, bool IsPow2WG,
1174
- KernelType KernelFunc,
1173
+ typename PropertiesT, class Reduction >
1174
+ void reduCGFuncForNDRangeFastAtomicsOnly (handler &CGH, KernelType KernelFunc,
1175
1175
const nd_range<Dims> &Range,
1176
- PropertiesT Properties, Reduction &,
1177
- AccTy Out) {
1176
+ PropertiesT Properties,
1177
+ Reduction &Redu) {
1178
+ auto Out = Redu.getReadWriteAccessorToInitializedMem (CGH);
1178
1179
size_t NElements = Reduction::num_elements;
1179
1180
size_t WGSize = Range.get_local_range ().size ();
1181
+ bool IsPow2WG = (WGSize & (WGSize - 1 )) == 0 ;
1180
1182
1181
1183
// Use local memory to reduce elements in work-groups into zero-th element.
1182
1184
// If WGSize is not power of two, then WGSize+1 elements are allocated.
@@ -1252,13 +1254,15 @@ template <class KernelName> struct NDRangeFastReduceOnly;
1252
1254
///
1253
1255
/// Briefly: user's lambda, reduce(), FP + ADD/MIN/MAX.
1254
1256
template <typename KernelName, typename KernelType, int Dims,
1255
- typename PropertiesT, class Reduction , class AccTy >
1257
+ typename PropertiesT, class Reduction >
1256
1258
void reduCGFuncForNDRangeFastReduceOnly (handler &CGH, KernelType KernelFunc,
1257
1259
const nd_range<Dims> &Range,
1258
- PropertiesT Properties, Reduction &Redu,
1259
- AccTy Out ) {
1260
+ PropertiesT Properties,
1261
+ Reduction &Redu ) {
1260
1262
size_t NElements = Reduction::num_elements;
1261
1263
size_t NWorkGroups = Range.get_group_range ().size ();
1264
+ auto Out = Redu.getWriteAccForPartialReds (NWorkGroups * NElements, CGH);
1265
+
1262
1266
bool IsUpdateOfUserVar =
1263
1267
!Reduction::is_usm && !Redu.initializeToIdentity () && NWorkGroups == 1 ;
1264
1268
@@ -1300,15 +1304,15 @@ template <class KernelName> struct NDRangeBasic;
1300
1304
///
1301
1305
/// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
1302
1306
template <typename KernelName, typename KernelType, int Dims,
1303
- typename PropertiesT, class Reduction , class AccTy >
1304
- void reduCGFuncForNDRangeBasic (handler &CGH, bool IsPow2WG,
1305
- KernelType KernelFunc,
1307
+ typename PropertiesT, class Reduction >
1308
+ void reduCGFuncForNDRangeBasic (handler &CGH, KernelType KernelFunc,
1306
1309
const nd_range<Dims> &Range,
1307
- PropertiesT Properties, Reduction &Redu,
1308
- AccTy Out) {
1310
+ PropertiesT Properties, Reduction &Redu) {
1309
1311
size_t NElements = Reduction::num_elements;
1310
1312
size_t WGSize = Range.get_local_range ().size ();
1313
+ bool IsPow2WG = (WGSize & (WGSize - 1 )) == 0 ;
1311
1314
size_t NWorkGroups = Range.get_group_range ().size ();
1315
+ auto Out = Redu.getWriteAccForPartialReds (NWorkGroups * NElements, CGH);
1312
1316
1313
1317
bool IsUpdateOfUserVar =
1314
1318
!Reduction::is_usm && !Redu.initializeToIdentity () && NWorkGroups == 1 ;
@@ -2204,44 +2208,21 @@ template <typename KernelName, typename KernelType, int Dims,
2204
2208
void reduCGFunc (handler &CGH, KernelType KernelFunc,
2205
2209
const nd_range<Dims> &Range, PropertiesT Properties,
2206
2210
Reduction &Redu) {
2207
- size_t WGSize = Range.get_local_range ().size ();
2208
- auto Out = [&]() {
2209
- if constexpr (Reduction::has_fast_atomics) {
2210
-
2211
- // User's initialized read-write accessor is re-used here if
2212
- // initialize_to_identity is not set (i.e. if user's variable is
2213
- // initialized). Otherwise, a new buffer is initialized with identity
2214
- // value and a new read-write accessor to that buffer is created. That is
2215
- // done because atomic operations update some initialized memory. User's
2216
- // USM pointer is not re-used even when initialize_to_identity is not set
2217
- // because it does not worth the creation of an additional variant of a
2218
- // user's kernel for that case.
2219
- return Redu.getReadWriteAccessorToInitializedMem (CGH);
2220
-
2221
- } else {
2222
- constexpr size_t NElements = Reduction::num_elements;
2223
- size_t NWorkGroups = Range.get_group_range ().size ();
2224
-
2225
- return Redu.getWriteAccForPartialReds (NWorkGroups * NElements, CGH);
2226
- }
2227
- }();
2228
-
2229
2211
if constexpr (Reduction::has_fast_reduce) {
2230
2212
if constexpr (Reduction::has_fast_atomics) {
2231
2213
reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName, KernelType>(
2232
- CGH, KernelFunc, Range, Properties, Redu, Out );
2214
+ CGH, KernelFunc, Range, Properties, Redu);
2233
2215
} else {
2234
2216
reduCGFuncForNDRangeFastReduceOnly<KernelName, KernelType>(
2235
- CGH, KernelFunc, Range, Properties, Redu, Out );
2217
+ CGH, KernelFunc, Range, Properties, Redu);
2236
2218
}
2237
2219
} else {
2238
- bool IsPow2WG = (WGSize & (WGSize - 1 )) == 0 ;
2239
2220
if constexpr (Reduction::has_fast_atomics) {
2240
2221
reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
2241
- CGH, IsPow2WG, KernelFunc, Range, Properties, Redu, Out );
2222
+ CGH, KernelFunc, Range, Properties, Redu);
2242
2223
} else {
2243
2224
reduCGFuncForNDRangeBasic<KernelName, KernelType>(
2244
- CGH, IsPow2WG, KernelFunc, Range, Properties, Redu, Out );
2225
+ CGH, KernelFunc, Range, Properties, Redu);
2245
2226
}
2246
2227
}
2247
2228
}
0 commit comments