
Commit 78e2599

[SYCL] Fix big and non-uniform work-groups handling in reduction kernels (#2859)
This patch also does a minor optimization in the main kernels created for reductions. The previous code tried to handle non-uniform work-group sizes and did it the wrong way; that code was removed, as it is the user's responsibility to provide an nd_range that the devices handle well, at least for the main kernels.

The patch conservatively limits the maximum work-group size handled by the reduction implementation, to avoid various runtime errors caused by selecting a too-optimistic work-group size for reductions. This solution is temporary until the reduction-kernel precompilation/query approach is implemented.

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
1 parent 0ffbf4b commit 78e2599
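As the commit message notes, after this change it is the caller's job to pass an nd_range whose work-group size the device and the reduction implementation can actually handle. Below is a minimal sketch of one conservative way to build such a range; the helper name, the 256-item cap, and the round-up scheme are illustrative choices, not part of this patch.

#include <CL/sycl.hpp>
#include <algorithm>
#include <cstddef>

// Hypothetical helper: pick a work-group size within the device limit and
// round the global size up to a whole number of work-groups.
sycl::nd_range<1> makeSafeReductionRange(const sycl::queue &Q,
                                         size_t NWorkItems) {
  size_t MaxWG =
      Q.get_device().get_info<sycl::info::device::max_work_group_size>();
  // Stay well below the device maximum: reduction kernels may need local
  // memory and other per-work-item resources.
  size_t WGSize = std::min<size_t>(MaxWG, 256);
  size_t NGroups = (NWorkItems + WGSize - 1) / WGSize; // round up
  return sycl::nd_range<1>{sycl::range<1>{NGroups * WGSize},
                           sycl::range<1>{WGSize}};
}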

File tree

4 files changed: +181, -60 lines changed


sycl/include/CL/sycl/ONEAPI/reduction.hpp

Lines changed: 38 additions & 55 deletions
@@ -619,24 +619,19 @@ struct get_reduction_aux_kernel_name_t {
 ///
 /// Briefly: calls user's lambda, ONEAPI::reduce() + atomic, INT + ADD/MIN/MAX.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformWG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's function. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
     KernelFunc(NDIt, Reducer);
 
     typename Reduction::binary_operation BOp;
-    typename Reduction::result_type Val =
-        (UniformWG || NDIt.get_global_linear_id() < NWorkItems)
-            ? Reducer.MValue
-            : Reducer.getIdentity();
-    Reducer.MValue = ONEAPI::reduce(NDIt.get_group(), Val, BOp);
+    Reducer.MValue = ONEAPI::reduce(NDIt.get_group(), Reducer.MValue, BOp);
     if (NDIt.get_local_linear_id() == 0)
       Reducer.atomic_combine(Reduction::getOutPointer(Out));
   });
@@ -651,22 +646,21 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 ///
 /// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformPow2WG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &Redu, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
 
   // Use local memory to reduce elements in work-groups into zero-th element.
   // If WGSize is not power of two, then WGSize+1 elements are allocated.
   // The additional last element is used to catch reduce elements that could
   // otherwise be lost in the tree-reduction algorithm used in the kernel.
-  size_t NLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
+  size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1);
   auto LocalReds = Redu.getReadWriteLocalAcc(NLocalElements, CGH);
 
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's functions. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
@@ -676,12 +670,9 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     size_t LID = NDIt.get_local_linear_id();
 
     // Copy the element to local memory to prepare it for tree-reduction.
-    typename Reduction::result_type ReduIdentity = Reducer.getIdentity();
-    LocalReds[LID] = (UniformPow2WG || NDIt.get_global_linear_id() < NWorkItems)
-                         ? Reducer.MValue
-                         : ReduIdentity;
-    if (!UniformPow2WG)
-      LocalReds[WGSize] = ReduIdentity;
+    LocalReds[LID] = Reducer.MValue;
+    if (!IsPow2WG)
+      LocalReds[WGSize] = Reducer.getIdentity();
     NDIt.barrier();
 
     // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
@@ -692,15 +683,15 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
       if (LID < CurStep)
         LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-      else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1))
+      else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
         LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
       NDIt.barrier();
       PrevStep = CurStep;
     }
 
     if (LID == 0) {
       Reducer.MValue =
-          UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
+          IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
       Reducer.atomic_combine(Reduction::getOutPointer(Out));
     }
   });
@@ -712,14 +703,14 @@ enable_if_t<Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
            Reduction &Redu, OutputT Out) {
 
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
-  size_t NWorkGroups = Range.get_group_range().size();
 
-  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
-  if (!Reduction::has_fast_reduce)
-    HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
-  if (HasUniformWG)
+  // If the work group size is not pow of 2, then the kernel runs some
+  // additional code and checks in it.
+  // If the reduction has fast reduce then the kernel does not care if the work
+  // group size is pow of 2 or not, assume true for such cases.
+  bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
+  if (IsPow2WG)
     reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
         CGH, KernelFunc, Range, Redu, Out);
   else
@@ -736,33 +727,28 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 ///
 /// Briefly: user's lambda, ONEAPI:reduce(), FP + ADD/MIN/MAX.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformWG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &, OutputT Out) {
 
-  size_t NWorkItems = Range.get_global_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
-
   // This additional check is needed for 'read_write' accessor case only.
   // It does not slow-down the kernel writing to 'discard_write' accessor as
   // the condition seems to be resolved at compile time for 'discard_write'.
   bool IsUpdateOfUserVar =
       Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
 
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's functions. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
     KernelFunc(NDIt, Reducer);
 
     // Compute the partial sum/reduction for the work-group.
     size_t WGID = NDIt.get_group_linear_id();
-    typename Reduction::result_type PSum =
-        (UniformWG || (NDIt.get_group_linear_id() < NWorkItems))
-            ? Reducer.MValue
-            : Reducer.getIdentity();
+    typename Reduction::result_type PSum = Reducer.MValue;
     typename Reduction::binary_operation BOp;
     PSum = ONEAPI::reduce(NDIt.get_group(), PSum, BOp);
     if (NDIt.get_local_linear_id() == 0) {
@@ -782,11 +768,10 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 ///
 /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformPow2WG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &Redu, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
 
@@ -797,11 +782,11 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
   // If WGSize is not power of two, then WGSize+1 elements are allocated.
   // The additional last element is used to catch elements that could
   // otherwise be lost in the tree-reduction algorithm.
-  size_t NumLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
+  size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1);
   auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
   typename Reduction::result_type ReduIdentity = Redu.getIdentity();
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   auto BOp = Redu.getBinaryOperation();
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's functions. Reducer.MValue gets initialized there.
@@ -810,10 +795,9 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 
     size_t WGSize = NDIt.get_local_range().size();
     size_t LID = NDIt.get_local_linear_id();
-    size_t GID = NDIt.get_global_linear_id();
     // Copy the element to local memory to prepare it for tree-reduction.
-    LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
-    if (!UniformPow2WG)
+    LocalReds[LID] = Reducer.MValue;
+    if (!IsPow2WG)
       LocalReds[WGSize] = ReduIdentity;
     NDIt.barrier();
 
@@ -824,7 +808,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
       if (LID < CurStep)
         LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-      else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1))
+      else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
         LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
       NDIt.barrier();
       PrevStep = CurStep;
@@ -834,7 +818,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     if (LID == 0) {
       size_t GrID = NDIt.get_group_linear_id();
       typename Reduction::result_type PSum =
-          UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
+          IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
       if (IsUpdateOfUserVar)
         PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
       Reduction::getOutPointer(Out)[GrID] = PSum;
@@ -846,27 +830,25 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction>
 enable_if_t<!Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
            Reduction &Redu) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
 
-  // The last work-group may be not fully loaded with work, or the work group
-  // size may be not power of two. Those two cases considered inefficient
-  // as they require additional code and checks in the kernel.
-  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
-  if (!Reduction::has_fast_reduce)
-    HasUniformWG = HasUniformWG && ((WGSize & (WGSize - 1)) == 0);
+  // If the work group size is not pow of 2, then the kernel runs some
+  // additional code and checks in it.
+  // If the reduction has fast reduce then the kernel does not care if the work
+  // group size is pow of 2 or not, assume true for such cases.
+  bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
 
   if (Reduction::is_usm && NWorkGroups == 1) {
-    if (HasUniformWG)
+    if (IsPow2WG)
       reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
           CGH, KernelFunc, Range, Redu, Redu.getUSMPointer());
     else
       reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
          CGH, KernelFunc, Range, Redu, Redu.getUSMPointer());
   } else {
     auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
-    if (HasUniformWG)
+    if (IsPow2WG)
       reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
          CGH, KernelFunc, Range, Redu, Out);
     else
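The main kernels above keep one extra local slot, LocalReds[WGSize], so that when the work-group size is not a power of two the element left unpaired by an odd-sized halving step is not dropped. The following standalone sketch models the same tree-reduction scheme on the host, with std::vector standing in for the local accessor and hypothetical names throughout.

#include <cstddef>
#include <functional>
#include <vector>

// Host-side model of the in-kernel tree reduction for a "work-group" of
// size WGSize that may not be a power of two. Identity is the reduction
// identity; BOp is the binary operation (e.g. std::plus<int>{}).
template <typename T, typename BinOp>
T treeReduceNonPow2(std::vector<T> Vals, T Identity, BinOp BOp) {
  size_t WGSize = Vals.size();
  Vals.push_back(Identity); // the extra LocalReds[WGSize] slot

  size_t PrevStep = WGSize;
  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
    for (size_t LID = 0; LID < CurStep; ++LID)
      Vals[LID] = BOp(Vals[LID], Vals[LID + CurStep]);
    // When PrevStep is odd, element PrevStep - 1 has no partner; fold it
    // into the extra slot so it is not lost.
    if (PrevStep & 0x1)
      Vals[WGSize] = BOp(Vals[WGSize], Vals[PrevStep - 1]);
    PrevStep = CurStep;
  }
  return BOp(Vals[0], Vals[WGSize]);
}

For example, treeReduceNonPow2<int>({1, 2, 3, 4, 5}, 0, std::plus<int>{}) folds the unpaired value 5 into the spare slot on the first step and still returns 15.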
@@ -889,10 +871,10 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
                   size_t WGSize, Reduction &, InputT In, OutputT Out) {
   using Name = typename get_reduction_aux_kernel_name_t<
       KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
-
   bool IsUpdateOfUserVar =
       Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
-  nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
+  range<1> GlobalRange = {UniformWG ? NWorkItems : NWorkGroups * WGSize};
+  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
   CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
     typename Reduction::binary_operation BOp;
     size_t WGID = NDIt.get_group_linear_id();
@@ -936,7 +918,8 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
   auto BOp = Redu.getBinaryOperation();
   using Name = typename get_reduction_aux_kernel_name_t<
       KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
-  nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
+  range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
+  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
   CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
     size_t WGSize = NDIt.get_local_range().size();
     size_t LID = NDIt.get_local_linear_id();

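In the two aux-kernel hunks above, the 1-D range is no longer built directly from NWorkItems, which may not be a multiple of WGSize; instead the global size is rounded up to whole work-groups, and the UniformWG / UniformPow2WG template parameter tells the kernel whether tail items exist. A small host-side sketch of that computation; deriving NWorkGroups by a ceiling division here is purely for illustration.

#include <CL/sycl.hpp>
#include <cstddef>

// Illustrative model of how the aux kernels now size their nd_range.
sycl::nd_range<1> auxKernelRange(size_t NWorkItems, size_t WGSize) {
  size_t NWorkGroups = (NWorkItems + WGSize - 1) / WGSize; // ceiling division
  bool UniformWG = (NWorkGroups * WGSize == NWorkItems);
  // Launch whole work-groups; in the non-uniform case the kernel is
  // instantiated with UniformWG = false so the tail items can be ignored.
  sycl::range<1> GlobalRange{UniformWG ? NWorkItems : NWorkGroups * WGSize};
  return sycl::nd_range<1>{GlobalRange, sycl::range<1>{WGSize}};
}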
sycl/include/CL/sycl/handler.hpp

Lines changed: 3 additions & 1 deletion
@@ -1193,7 +1193,9 @@ class __SYCL_EXPORT handler {
     size_t MaxWGSize = ONEAPI::detail::reduGetMaxWGSize(MQueue, OneElemSize);
     if (Range.get_local_range().size() > MaxWGSize)
       throw sycl::runtime_error("The implementation handling parallel_for with"
-                                " reduction requires smaller work group size.",
+                                " reduction requires work group size not bigger"
+                                " than " +
+                                    std::to_string(MaxWGSize),
                                 PI_INVALID_WORK_GROUP_SIZE);
 
     // 1. Call the kernel that includes user's lambda function.

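With the handler.hpp change above, an oversized work-group request is still rejected with PI_INVALID_WORK_GROUP_SIZE, but the message now names the actual limit. A hypothetical call-site pattern for surfacing that message; the submission body is elided.

#include <CL/sycl.hpp>
#include <iostream>

// Hypothetical call site: report the limit carried in the new error message.
void runReductionOrReport(sycl::queue &Q) {
  try {
    // ... submit a parallel_for with a reduction on Q here, using an
    // nd_range whose local size might exceed the implementation limit ...
    (void)Q;
  } catch (const sycl::runtime_error &E) {
    // e.g. "... reduction requires work group size not bigger than 256"
    std::cerr << E.what() << '\n';
  }
}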
sycl/source/detail/reduction.cpp

Lines changed: 31 additions & 4 deletions
@@ -52,18 +52,45 @@ __SYCL_EXPORT size_t
 reduGetMaxWGSize(shared_ptr_class<sycl::detail::queue_impl> Queue,
                  size_t LocalMemBytesPerWorkItem) {
   device Dev = Queue->get_device();
-  size_t WGSize = Dev.get_info<info::device::max_work_group_size>();
+  size_t MaxWGSize = Dev.get_info<info::device::max_work_group_size>();
+  size_t WGSizePerMem = MaxWGSize * 2;
+  size_t WGSize = MaxWGSize;
   if (LocalMemBytesPerWorkItem != 0) {
     size_t MemSize = Dev.get_info<info::device::local_mem_size>();
-    size_t WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
+    WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
 
-    // If the work group size is not pow of two, then an additional element
+    // If the work group size is NOT power of two, then an additional element
     // in local memory is needed for the reduction algorithm and thus the real
     // work-group size requirement per available memory is stricter.
-    if ((WGSize & (WGSize - 1)) == 0)
+    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
       WGSizePerMem--;
     WGSize = (std::min)(WGSizePerMem, WGSize);
   }
+  // TODO: This is a temporary workaround for a big problem of detecting
+  // the maximal usable work-group size. The detection method used above,
+  // based on the maximal work-group size possible on the device, is too
+  // risky as it may return too big a value. Even though it also takes the
+  // memory factor into consideration, it is too rough an estimation. For
+  // example, if (WGSize * LocalMemBytesPerWorkItem) equals local_mem_size,
+  // then the reduction local accessor takes all available local memory for
+  // its needs, leaving none for other kernel needs (barriers, builtin calls,
+  // etc), which often leads to crashes with CL_OUT_OF_RESOURCES errors, or
+  // in even worse cases may cause silent writes/clobbers of the local memory
+  // assigned to one work-group by code in another work-group.
+  // It seems the only good solution for this work-group detection problem is
+  // kernel precompilation and querying the kernel properties.
+  if (WGSize >= 4) {
+    // Let's return a twice smaller number, but... do that only if the kernel
+    // is limited by memory, or the kernel uses the opencl:cpu backend, which
+    // surprisingly uses lots of resources to run kernels with reductions
+    // and often causes CL_OUT_OF_RESOURCES errors even when the reduction
+    // does not use local accessors.
+    if (WGSizePerMem < MaxWGSize * 2 ||
+        (Queue->get_device().is_cpu() &&
+         Queue->get_device().get_platform().get_backend() == backend::opencl))
+      WGSize /= 2;
+  }
+
   return WGSize;
 }

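The reworked reduGetMaxWGSize above can be read as: start from the device maximum, tighten it by the available local memory (reserving one extra element when that bound is not a power of two), then halve once more in the risky cases. Below is a standalone sketch of the same logic with the device queries replaced by plain parameters; the function and parameter names are illustrative.

#include <algorithm>
#include <cstddef>

// Host-side model of the heuristic in reduGetMaxWGSize after this patch.
// MaxWGSize and LocalMemSize stand in for the device info queries;
// IsOpenCLCpu stands in for the backend/device check.
size_t maxReductionWGSize(size_t MaxWGSize, size_t LocalMemSize,
                          size_t LocalMemBytesPerWorkItem, bool IsOpenCLCpu) {
  size_t WGSizePerMem = MaxWGSize * 2; // sentinel: "not limited by memory"
  size_t WGSize = MaxWGSize;
  if (LocalMemBytesPerWorkItem != 0) {
    WGSizePerMem = LocalMemSize / LocalMemBytesPerWorkItem;
    // A non-power-of-two work-group needs one extra local element, so the
    // memory-derived bound is one element stricter in that case.
    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
      WGSizePerMem--;
    WGSize = std::min(WGSizePerMem, WGSize);
  }
  // Conservative halving: only when local memory really is the limiting
  // factor, or on the OpenCL CPU backend.
  if (WGSize >= 4 && (WGSizePerMem < MaxWGSize * 2 || IsOpenCLCpu))
    WGSize /= 2;
  return WGSize;
}

For example, with MaxWGSize = 256, 64 KB of local memory, and 512 bytes of local memory per work-item, the memory bound is 128 (already a power of two), and the final result is 64 after the halving.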