@@ -793,6 +793,43 @@ template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func) {
793
793
}
794
794
} // namespace reduction
795
795
796
+ // This method is used for implementation of parallel_for accepting 1 reduction.
797
+ // TODO: remove this method when everything is switched to general algorithm
798
+ // implementing arbitrary number of reductions in parallel_for().
799
+ // / Copies the final reduction result kept in read-write accessor to user's
800
+ // / accessor. This method is not called for user's read-write accessors
801
+ // / requiring update-write to it.
802
+ template <typename KernelName, class Reduction >
803
+ std::enable_if_t <!Reduction::is_usm>
804
+ reduSaveFinalResultToUserMem (handler &CGH, Reduction &Redu) {
805
+ auto InAcc = Redu.getReadAccToPreviousPartialReds (CGH);
806
+ associateWithHandler (CGH, &Redu.getUserRedVar (), access::target::device);
807
+ CGH.copy (InAcc, Redu.getUserRedVar ());
808
+ }
809
+
810
+ // This method is used for implementation of parallel_for accepting 1 reduction.
811
+ // TODO: remove this method when everything is switched to general algorithm
812
+ // implementing arbitrary number of reductions in parallel_for().
813
+ // / Copies the final reduction result kept in read-write accessor to user's
814
+ // / USM memory.
815
+ template <typename KernelName, class Reduction >
816
+ std::enable_if_t <Reduction::is_usm>
817
+ reduSaveFinalResultToUserMem (handler &CGH, Reduction &Redu) {
818
+ size_t NElements = Reduction::num_elements;
819
+ auto InAcc = Redu.getReadAccToPreviousPartialReds (CGH);
820
+ auto UserVarPtr = Redu.getUserRedVar ();
821
+ bool IsUpdateOfUserVar = !Redu.initializeToIdentity ();
822
+ auto BOp = Redu.getBinaryOperation ();
823
+ CGH.single_task <KernelName>([=] {
824
+ for (int i = 0 ; i < NElements; ++i) {
825
+ if (IsUpdateOfUserVar)
826
+ UserVarPtr[i] = BOp (UserVarPtr[i], InAcc.get_pointer ()[i]);
827
+ else
828
+ UserVarPtr[i] = InAcc.get_pointer ()[i];
829
+ }
830
+ });
831
+ }
832
+
796
833
// / A helper to pass undefined (sycl::detail::auto_name) names unmodified. We
797
834
// / must do that to avoid name collisions.
798
835
template <template <typename ...> class Namer , class KernelName , class ... Ts>
@@ -834,7 +871,7 @@ template <class KernelName> struct RangeFastAtomics;
834
871
} // namespace reduction
835
872
template <typename KernelName, typename KernelType, int Dims,
836
873
typename PropertiesT, class Reduction >
837
- bool reduCGFuncForRangeFastAtomics (handler &CGH, KernelType KernelFunc,
874
+ void reduCGFuncForRangeFastAtomics (handler &CGH, KernelType KernelFunc,
838
875
const range<Dims> &Range,
839
876
const nd_range<1 > &NDRange,
840
877
PropertiesT Properties, Reduction &Redu) {
@@ -871,7 +908,11 @@ bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
871
908
Reducer.template atomic_combine (&Out[0 ]);
872
909
}
873
910
});
874
- return Reduction::is_usm || Redu.initializeToIdentity ();
911
+
912
+ if (Reduction::is_usm || Redu.initializeToIdentity ())
913
+ reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
914
+ reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
915
+ });
875
916
}
876
917
877
918
namespace reduction {
@@ -881,7 +922,7 @@ template <class KernelName, class NWorkGroupsFinished> struct RangeFastReduce;
881
922
} // namespace reduction
882
923
template <typename KernelName, typename KernelType, int Dims,
883
924
typename PropertiesT, class Reduction >
884
- bool reduCGFuncForRangeFastReduce (handler &CGH, KernelType KernelFunc,
925
+ void reduCGFuncForRangeFastReduce (handler &CGH, KernelType KernelFunc,
885
926
const range<Dims> &Range,
886
927
const nd_range<1 > &NDRange,
887
928
PropertiesT Properties, Reduction &Redu) {
@@ -972,9 +1013,6 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
972
1013
Rest (Redu.getReadWriteAccessorToInitializedGroupsCounter (CGH));
973
1014
else
974
1015
Rest (Redu.getGroupsCounterAccDiscrete (CGH));
975
-
976
- // We've updated user's variable, no extra work needed.
977
- return false ;
978
1016
}
979
1017
980
1018
namespace reduction {
@@ -984,7 +1022,7 @@ template <class KernelName> struct RangeBasic;
984
1022
} // namespace reduction
985
1023
template <typename KernelName, typename KernelType, int Dims,
986
1024
typename PropertiesT, class Reduction >
987
- bool reduCGFuncForRangeBasic (handler &CGH, KernelType KernelFunc,
1025
+ void reduCGFuncForRangeBasic (handler &CGH, KernelType KernelFunc,
988
1026
const range<Dims> &Range,
989
1027
const nd_range<1 > &NDRange, PropertiesT Properties,
990
1028
Reduction &Redu) {
@@ -1088,14 +1126,18 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
1088
1126
}
1089
1127
}
1090
1128
});
1091
- return Reduction::is_usm;
1129
+
1130
+ if (Reduction::is_usm)
1131
+ reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
1132
+ reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1133
+ });
1092
1134
}
1093
1135
1094
1136
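/// Selects the range-reduction implementation at compile time (fast-reduce,
1095
1137
/// fast-atomics or basic) and invokes it. Returns nothing: the selected
/// implementation itself saves the result to the user's variable when needed.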
1096
1138
template <typename KernelName, typename KernelType, int Dims,
1097
1139
typename PropertiesT, class Reduction >
1098
- bool reduCGFuncForRange (handler &CGH, KernelType KernelFunc,
1140
+ void reduCGFuncForRange (handler &CGH, KernelType KernelFunc,
1099
1141
const range<Dims> &Range, size_t MaxWGSize,
1100
1142
uint32_t NumConcurrentWorkGroups,
1101
1143
PropertiesT Properties, Reduction &Redu) {
@@ -1110,14 +1152,14 @@ bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
1110
1152
nd_range<1 > NDRange{range<1 >{NDRItems}, range<1 >{WGSize}};
1111
1153
1112
1154
if constexpr (Reduction::has_fast_reduce)
1113
- return reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range,
1114
- NDRange, Properties, Redu);
1155
+ reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange ,
1156
+ Properties, Redu);
1115
1157
else if constexpr (Reduction::has_fast_atomics)
1116
- return reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range,
1117
- NDRange, Properties, Redu);
1158
+ reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange ,
1159
+ Properties, Redu);
1118
1160
else
1119
- return reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
1120
- Properties, Redu);
1161
+ reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
1162
+ Properties, Redu);
1121
1163
}
1122
1164
1123
1165
namespace reduction {
@@ -1158,6 +1200,12 @@ void reduCGFuncForNDRangeBothFastReduceAndAtomics(handler &CGH,
1158
1200
if (NDIt.get_local_linear_id () == 0 )
1159
1201
Reducer.atomic_combine (&Out[0 ]);
1160
1202
});
1203
+
1204
+ if (Reduction::is_usm || Redu.initializeToIdentity ()) {
1205
+ reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
1206
+ reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1207
+ });
1208
+ }
1161
1209
}
1162
1210
1163
1211
namespace reduction {
@@ -1242,6 +1290,12 @@ void reduCGFuncForNDRangeFastAtomicsOnly(handler &CGH, KernelType KernelFunc,
1242
1290
Reducer.atomic_combine (&Out[0 ]);
1243
1291
}
1244
1292
});
1293
+
1294
+ if (Reduction::is_usm || Redu.initializeToIdentity ()) {
1295
+ reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
1296
+ reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1297
+ });
1298
+ }
1245
1299
}
1246
1300
1247
1301
namespace reduction {
@@ -1544,43 +1598,6 @@ size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
1544
1598
return NWorkGroups;
1545
1599
}
1546
1600
1547
- // This method is used for implementation of parallel_for accepting 1 reduction.
1548
- // TODO: remove this method when everything is switched to general algorithm
1549
- // implementing arbitrary number of reductions in parallel_for().
1550
- // / Copies the final reduction result kept in read-write accessor to user's
1551
- // / accessor. This method is not called for user's read-write accessors
1552
- // / requiring update-write to it.
1553
- template <typename KernelName, class Reduction >
1554
- std::enable_if_t <!Reduction::is_usm>
1555
- reduSaveFinalResultToUserMem (handler &CGH, Reduction &Redu) {
1556
- auto InAcc = Redu.getReadAccToPreviousPartialReds (CGH);
1557
- associateWithHandler (CGH, &Redu.getUserRedVar (), access::target::device);
1558
- CGH.copy (InAcc, Redu.getUserRedVar ());
1559
- }
1560
-
1561
- // This method is used for implementation of parallel_for accepting 1 reduction.
1562
- // TODO: remove this method when everything is switched to general algorithm
1563
- // implementing arbitrary number of reductions in parallel_for().
1564
- // / Copies the final reduction result kept in read-write accessor to user's
1565
- // / USM memory.
1566
- template <typename KernelName, class Reduction >
1567
- std::enable_if_t <Reduction::is_usm>
1568
- reduSaveFinalResultToUserMem (handler &CGH, Reduction &Redu) {
1569
- size_t NElements = Reduction::num_elements;
1570
- auto InAcc = Redu.getReadAccToPreviousPartialReds (CGH);
1571
- auto UserVarPtr = Redu.getUserRedVar ();
1572
- bool IsUpdateOfUserVar = !Redu.initializeToIdentity ();
1573
- auto BOp = Redu.getBinaryOperation ();
1574
- CGH.single_task <KernelName>([=] {
1575
- for (int i = 0 ; i < NElements; ++i) {
1576
- if (IsUpdateOfUserVar)
1577
- UserVarPtr[i] = BOp (UserVarPtr[i], InAcc.get_pointer ()[i]);
1578
- else
1579
- UserVarPtr[i] = InAcc.get_pointer ()[i];
1580
- }
1581
- });
1582
- }
1583
-
1584
1601
/// For the given 'Reductions' types pack and indices enumerating them this
1585
1602
/// function either creates new temporary accessors for partial sums (if IsOneWG
1586
1603
/// is false) or returns user's accessor/USM-pointer (if IsOneWG is true).
@@ -2220,13 +2237,8 @@ void reduction_parallel_for(handler &CGH,
2220
2237
// queue/device, while it is safer to use queries to the kernel pre-compiled
2221
2238
// for the device.
2222
2239
size_t PrefWGSize = reduGetPreferredWGSize (Queue, OneElemSize);
2223
- if (reduCGFuncForRange<KernelName>(CGH, KernelFunc, Range, PrefWGSize,
2224
- NumConcurrentWorkGroups, Properties,
2225
- Redu)) {
2226
- reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
2227
- reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
2228
- });
2229
- }
2240
+ reduCGFuncForRange<KernelName>(CGH, KernelFunc, Range, PrefWGSize,
2241
+ NumConcurrentWorkGroups, Properties, Redu);
2230
2242
}
2231
2243
2232
2244
template <typename KernelName, int Dims, typename PropertiesT,
@@ -2317,43 +2329,24 @@ void reduction_parallel_for(handler &CGH,
2317
2329
nd_range<Dims> Range, PropertiesT Properties,
2318
2330
Reduction Redu, KernelType KernelFunc) {
2319
2331
if constexpr (Reduction::has_float64_atomics) {
2320
- device D = detail::getDeviceFromHandler (CGH);
2321
-
2322
- if (D.has (aspect::atomic64)) {
2323
- reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
2332
+ if (detail::getDeviceFromHandler (CGH).has (aspect::atomic64))
2333
+ return reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
2324
2334
CGH, KernelFunc, Range, Properties, Redu);
2325
- } else {
2326
- reduction_parallel_for_basic_impl<KernelName>(
2327
- CGH, Queue, Range, Properties, Redu, KernelFunc);
2328
- return ;
2329
- }
2335
+
2336
+ return reduction_parallel_for_basic_impl<KernelName>(
2337
+ CGH, Queue, Range, Properties, Redu, KernelFunc);
2330
2338
} else if constexpr (Reduction::has_fast_atomics) {
2331
2339
if constexpr (Reduction::has_fast_reduce) {
2332
- reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName, KernelType>(
2340
+ return reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName,
2341
+ KernelType>(
2333
2342
CGH, KernelFunc, Range, Properties, Redu);
2334
2343
} else {
2335
- reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
2344
+ return reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
2336
2345
CGH, KernelFunc, Range, Properties, Redu);
2337
2346
}
2338
2347
} else {
2339
- reduction_parallel_for_basic_impl<KernelName>(CGH, Queue, Range, Properties,
2340
- Redu, KernelFunc);
2341
- return ;
2342
- }
2343
-
2344
- // If the reduction variable must be initialized with the identity value
2345
- // before the kernel run, then an additional working accessor is created,
2346
- // initialized with the identity value and used in the kernel. That
2347
- // working accessor is then copied to user's accessor or USM pointer after
2348
- // the kernel run.
2349
- // For USM pointers without initialize_to_identity properties the same
2350
- // scheme with working accessor is used as re-using user's USM pointer in
2351
- // the kernel would require creation of another variant of user's kernel,
2352
- // which does not seem efficient.
2353
- if (Reduction::is_usm || Redu.initializeToIdentity ()) {
2354
- reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
2355
- reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
2356
- });
2348
+ return reduction_parallel_for_basic_impl<KernelName>(
2349
+ CGH, Queue, Range, Properties, Redu, KernelFunc);
2357
2350
}
2358
2351
}
2359
2352
0 commit comments