@@ -619,24 +619,19 @@ struct get_reduction_aux_kernel_name_t {
619
619
///
620
620
/// Briefly: calls user's lambda, ONEAPI::reduce() + atomic, INT + ADD/MIN/MAX.
621
621
template <typename KernelName, typename KernelType, int Dims, class Reduction ,
622
- bool UniformWG , typename OutputT>
622
+ bool IsPow2WG , typename OutputT>
623
623
enable_if_t <Reduction::has_fast_reduce && Reduction::has_fast_atomics>
624
624
reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
625
625
Reduction &, OutputT Out) {
626
- size_t NWorkItems = Range.get_global_range ().size ();
627
626
using Name = typename get_reduction_main_kernel_name_t <
628
- KernelName, KernelType, Reduction::is_usm, UniformWG , OutputT>::name;
627
+ KernelName, KernelType, Reduction::is_usm, IsPow2WG , OutputT>::name;
629
628
CGH.parallel_for <Name>(Range, [=](nd_item<Dims> NDIt) {
630
629
// Call user's function. Reducer.MValue gets initialized there.
631
630
typename Reduction::reducer_type Reducer;
632
631
KernelFunc (NDIt, Reducer);
633
632
634
633
typename Reduction::binary_operation BOp;
635
- typename Reduction::result_type Val =
636
- (UniformWG || NDIt.get_global_linear_id () < NWorkItems)
637
- ? Reducer.MValue
638
- : Reducer.getIdentity ();
639
- Reducer.MValue = ONEAPI::reduce (NDIt.get_group (), Val, BOp);
634
+ Reducer.MValue = ONEAPI::reduce (NDIt.get_group (), Reducer.MValue , BOp);
640
635
if (NDIt.get_local_linear_id () == 0 )
641
636
Reducer.atomic_combine (Reduction::getOutPointer (Out));
642
637
});
@@ -651,22 +646,21 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
651
646
///
652
647
/// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR.
653
648
template <typename KernelName, typename KernelType, int Dims, class Reduction ,
654
- bool UniformPow2WG , typename OutputT>
649
+ bool IsPow2WG , typename OutputT>
655
650
enable_if_t <!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
656
651
reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
657
652
Reduction &Redu, OutputT Out) {
658
- size_t NWorkItems = Range.get_global_range ().size ();
659
653
size_t WGSize = Range.get_local_range ().size ();
660
654
661
655
// Use local memory to reduce elements in work-groups into zero-th element.
662
656
// If WGSize is not power of two, then WGSize+1 elements are allocated.
663
657
// The additional last element is used to catch reduce elements that could
664
658
// otherwise be lost in the tree-reduction algorithm used in the kernel.
665
- size_t NLocalElements = WGSize + (UniformPow2WG ? 0 : 1 );
659
+ size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1 );
666
660
auto LocalReds = Redu.getReadWriteLocalAcc (NLocalElements, CGH);
667
661
668
662
using Name = typename get_reduction_main_kernel_name_t <
669
- KernelName, KernelType, Reduction::is_usm, UniformPow2WG , OutputT>::name;
663
+ KernelName, KernelType, Reduction::is_usm, IsPow2WG , OutputT>::name;
670
664
CGH.parallel_for <Name>(Range, [=](nd_item<Dims> NDIt) {
671
665
// Call user's functions. Reducer.MValue gets initialized there.
672
666
typename Reduction::reducer_type Reducer;
@@ -676,12 +670,9 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
676
670
size_t LID = NDIt.get_local_linear_id ();
677
671
678
672
// Copy the element to local memory to prepare it for tree-reduction.
679
- typename Reduction::result_type ReduIdentity = Reducer.getIdentity ();
680
- LocalReds[LID] = (UniformPow2WG || NDIt.get_global_linear_id () < NWorkItems)
681
- ? Reducer.MValue
682
- : ReduIdentity;
683
- if (!UniformPow2WG)
684
- LocalReds[WGSize] = ReduIdentity;
673
+ LocalReds[LID] = Reducer.MValue ;
674
+ if (!IsPow2WG)
675
+ LocalReds[WGSize] = Reducer.getIdentity ();
685
676
NDIt.barrier ();
686
677
687
678
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
@@ -692,15 +683,15 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
692
683
for (size_t CurStep = PrevStep >> 1 ; CurStep > 0 ; CurStep >>= 1 ) {
693
684
if (LID < CurStep)
694
685
LocalReds[LID] = BOp (LocalReds[LID], LocalReds[LID + CurStep]);
695
- else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1 ))
686
+ else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1 ))
696
687
LocalReds[WGSize] = BOp (LocalReds[WGSize], LocalReds[PrevStep - 1 ]);
697
688
NDIt.barrier ();
698
689
PrevStep = CurStep;
699
690
}
700
691
701
692
if (LID == 0 ) {
702
693
Reducer.MValue =
703
- UniformPow2WG ? LocalReds[0 ] : BOp (LocalReds[0 ], LocalReds[WGSize]);
694
+ IsPow2WG ? LocalReds[0 ] : BOp (LocalReds[0 ], LocalReds[WGSize]);
704
695
Reducer.atomic_combine (Reduction::getOutPointer (Out));
705
696
}
706
697
});
@@ -712,14 +703,14 @@ enable_if_t<Reduction::has_fast_atomics>
712
703
reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
713
704
Reduction &Redu, OutputT Out) {
714
705
715
- size_t NWorkItems = Range.get_global_range ().size ();
716
706
size_t WGSize = Range.get_local_range ().size ();
717
- size_t NWorkGroups = Range.get_group_range ().size ();
718
707
719
- bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
720
- if (!Reduction::has_fast_reduce)
721
- HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1 )) == 0 ;
722
- if (HasUniformWG)
708
+ // If the work-group size is not a power of two, the kernel runs some
709
+ // additional code and checks.
710
+ // If the reduction has fast reduce, the kernel does not depend on whether
711
+ // the work-group size is a power of two, so assume true in that case.
712
+ bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1 )) == 0 );
713
+ if (IsPow2WG)
723
714
reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true >(
724
715
CGH, KernelFunc, Range, Redu, Out);
725
716
else
@@ -736,33 +727,28 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
736
727
///
737
728
/// Briefly: user's lambda, ONEAPI::reduce(), FP + ADD/MIN/MAX.
738
729
template <typename KernelName, typename KernelType, int Dims, class Reduction ,
739
- bool UniformWG , typename OutputT>
730
+ bool IsPow2WG , typename OutputT>
740
731
enable_if_t <Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
741
732
reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
742
733
Reduction &, OutputT Out) {
743
734
744
- size_t NWorkItems = Range.get_global_range ().size ();
745
735
size_t NWorkGroups = Range.get_group_range ().size ();
746
-
747
736
// This additional check is needed for 'read_write' accessor case only.
748
737
// It does not slow down the kernel writing to 'discard_write' accessor as
749
738
// the condition seems to be resolved at compile time for 'discard_write'.
750
739
bool IsUpdateOfUserVar =
751
740
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1 ;
752
741
753
742
using Name = typename get_reduction_main_kernel_name_t <
754
- KernelName, KernelType, Reduction::is_usm, UniformWG , OutputT>::name;
743
+ KernelName, KernelType, Reduction::is_usm, IsPow2WG , OutputT>::name;
755
744
CGH.parallel_for <Name>(Range, [=](nd_item<Dims> NDIt) {
756
745
// Call user's functions. Reducer.MValue gets initialized there.
757
746
typename Reduction::reducer_type Reducer;
758
747
KernelFunc (NDIt, Reducer);
759
748
760
749
// Compute the partial sum/reduction for the work-group.
761
750
size_t WGID = NDIt.get_group_linear_id ();
762
- typename Reduction::result_type PSum =
763
- (UniformWG || (NDIt.get_group_linear_id () < NWorkItems))
764
- ? Reducer.MValue
765
- : Reducer.getIdentity ();
751
+ typename Reduction::result_type PSum = Reducer.MValue ;
766
752
typename Reduction::binary_operation BOp;
767
753
PSum = ONEAPI::reduce (NDIt.get_group (), PSum, BOp);
768
754
if (NDIt.get_local_linear_id () == 0 ) {
@@ -782,11 +768,10 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
782
768
///
783
769
/// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
784
770
template <typename KernelName, typename KernelType, int Dims, class Reduction ,
785
- bool UniformPow2WG , typename OutputT>
771
+ bool IsPow2WG , typename OutputT>
786
772
enable_if_t <!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
787
773
reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
788
774
Reduction &Redu, OutputT Out) {
789
- size_t NWorkItems = Range.get_global_range ().size ();
790
775
size_t WGSize = Range.get_local_range ().size ();
791
776
size_t NWorkGroups = Range.get_group_range ().size ();
792
777
@@ -797,11 +782,11 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
797
782
// If WGSize is not power of two, then WGSize+1 elements are allocated.
798
783
// The additional last element is used to catch elements that could
799
784
// otherwise be lost in the tree-reduction algorithm.
800
- size_t NumLocalElements = WGSize + (UniformPow2WG ? 0 : 1 );
785
+ size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1 );
801
786
auto LocalReds = Redu.getReadWriteLocalAcc (NumLocalElements, CGH);
802
787
typename Reduction::result_type ReduIdentity = Redu.getIdentity ();
803
788
using Name = typename get_reduction_main_kernel_name_t <
804
- KernelName, KernelType, Reduction::is_usm, UniformPow2WG , OutputT>::name;
789
+ KernelName, KernelType, Reduction::is_usm, IsPow2WG , OutputT>::name;
805
790
auto BOp = Redu.getBinaryOperation ();
806
791
CGH.parallel_for <Name>(Range, [=](nd_item<Dims> NDIt) {
807
792
// Call user's functions. Reducer.MValue gets initialized there.
@@ -810,10 +795,9 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
810
795
811
796
size_t WGSize = NDIt.get_local_range ().size ();
812
797
size_t LID = NDIt.get_local_linear_id ();
813
- size_t GID = NDIt.get_global_linear_id ();
814
798
// Copy the element to local memory to prepare it for tree-reduction.
815
- LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity ;
816
- if (!UniformPow2WG )
799
+ LocalReds[LID] = Reducer.MValue ;
800
+ if (!IsPow2WG )
817
801
LocalReds[WGSize] = ReduIdentity;
818
802
NDIt.barrier ();
819
803
@@ -824,7 +808,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
824
808
for (size_t CurStep = PrevStep >> 1 ; CurStep > 0 ; CurStep >>= 1 ) {
825
809
if (LID < CurStep)
826
810
LocalReds[LID] = BOp (LocalReds[LID], LocalReds[LID + CurStep]);
827
- else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1 ))
811
+ else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1 ))
828
812
LocalReds[WGSize] = BOp (LocalReds[WGSize], LocalReds[PrevStep - 1 ]);
829
813
NDIt.barrier ();
830
814
PrevStep = CurStep;
@@ -834,7 +818,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
834
818
if (LID == 0 ) {
835
819
size_t GrID = NDIt.get_group_linear_id ();
836
820
typename Reduction::result_type PSum =
837
- UniformPow2WG ? LocalReds[0 ] : BOp (LocalReds[0 ], LocalReds[WGSize]);
821
+ IsPow2WG ? LocalReds[0 ] : BOp (LocalReds[0 ], LocalReds[WGSize]);
838
822
if (IsUpdateOfUserVar)
839
823
PSum = BOp (*(Reduction::getOutPointer (Out)), PSum);
840
824
Reduction::getOutPointer (Out)[GrID] = PSum;
@@ -846,27 +830,25 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction>
846
830
enable_if_t <!Reduction::has_fast_atomics>
847
831
reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
848
832
Reduction &Redu) {
849
- size_t NWorkItems = Range.get_global_range ().size ();
850
833
size_t WGSize = Range.get_local_range ().size ();
851
834
size_t NWorkGroups = Range.get_group_range ().size ();
852
835
853
- // The last work-group may be not fully loaded with work, or the work group
854
- // size may be not power of two. Those two cases considered inefficient
855
- // as they require additional code and checks in the kernel.
856
- bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
857
- if (!Reduction::has_fast_reduce)
858
- HasUniformWG = HasUniformWG && ((WGSize & (WGSize - 1 )) == 0 );
836
+ // If the work-group size is not a power of two, the kernel runs some
837
+ // additional code and checks.
838
+ // If the reduction has fast reduce, the kernel does not depend on whether
839
+ // the work-group size is a power of two, so assume true in that case.
840
+ bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1 )) == 0 );
859
841
860
842
if (Reduction::is_usm && NWorkGroups == 1 ) {
861
- if (HasUniformWG )
843
+ if (IsPow2WG )
862
844
reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true >(
863
845
CGH, KernelFunc, Range, Redu, Redu.getUSMPointer ());
864
846
else
865
847
reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, false >(
866
848
CGH, KernelFunc, Range, Redu, Redu.getUSMPointer ());
867
849
} else {
868
850
auto Out = Redu.getWriteAccForPartialReds (NWorkGroups, CGH);
869
- if (HasUniformWG )
851
+ if (IsPow2WG )
870
852
reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true >(
871
853
CGH, KernelFunc, Range, Redu, Out);
872
854
else
@@ -889,10 +871,10 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
889
871
size_t WGSize, Reduction &, InputT In, OutputT Out) {
890
872
using Name = typename get_reduction_aux_kernel_name_t <
891
873
KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
892
-
893
874
bool IsUpdateOfUserVar =
894
875
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1 ;
895
- nd_range<1 > Range{range<1 >(NWorkItems), range<1 >(WGSize)};
876
+ range<1 > GlobalRange = {UniformWG ? NWorkItems : NWorkGroups * WGSize};
877
+ nd_range<1 > Range{GlobalRange, range<1 >(WGSize)};
896
878
CGH.parallel_for <Name>(Range, [=](nd_item<1 > NDIt) {
897
879
typename Reduction::binary_operation BOp;
898
880
size_t WGID = NDIt.get_group_linear_id ();
@@ -936,7 +918,8 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
936
918
auto BOp = Redu.getBinaryOperation ();
937
919
using Name = typename get_reduction_aux_kernel_name_t <
938
920
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
939
- nd_range<1 > Range{range<1 >(NWorkItems), range<1 >(WGSize)};
921
+ range<1 > GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
922
+ nd_range<1 > Range{GlobalRange, range<1 >(WGSize)};
940
923
CGH.parallel_for <Name>(Range, [=](nd_item<1 > NDIt) {
941
924
size_t WGSize = NDIt.get_local_range ().size ();
942
925
size_t LID = NDIt.get_local_linear_id ();
0 commit comments