Commit c22a5d3
[SYCL][Reduction] Optimize reduCGFuncForRangeFastAtomics for discrete GPU (#6489)
The main idea is that host->GPU memory transfers have a significant cost on discrete GPUs, unlike integrated GPUs with shared memory. First, make sure sycl::no_init is used for the partial sums buffer (a free change). Second, initialize the group counter buffer with an extra kernel, as that is cheaper than transferring the memory host->device. This latter change is guarded by a runtime check of whether the device has host_unified_memory. By itself, the change would have caused incompatibilities in PartialSums's accessor type. To deal with this, the optimization for a single work-group was moved from host to device code. That also made it possible to always write the user's final variable inside the main kernel, avoiding the reduSaveFinalResultToUserMem call for this scenario. Accordingly, the reduCGFuncForRange* helpers now return a boolean indicating whether such post-processing is needed, since that is now a property of the particular implementation.

The second part changes how the number of work-groups is selected. Before this change one work-group was created for each physical EU thread, but that assumption does not seem to be universally true. In the discrete case, create one work-group per EU (not per thread) instead.
1 parent 1e89466 commit c22a5d3
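For illustration, here is a minimal standalone sketch (illustrative names, not the in-tree code) of the two transfer savings described above: declaring an accessor with sycl::no_init so the runtime skips copying the buffer's initial contents to the device, and zeroing a one-element group counter with a tiny device kernel instead of a host-side write.

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;

  // Partial sums: the old contents are never needed, so sycl::no_init lets the
  // runtime skip the host->device transfer before the kernel runs.
  sycl::buffer<int, 1> PartialSums{sycl::range<1>(64)};

  // Group counter: zero it with a tiny kernel on the device instead of copying
  // a zero-initialized buffer from the host, which is the slow path on
  // discrete GPUs.
  sycl::buffer<int, 1> Counter{sycl::range<1>(1)};
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor CounterAcc{Counter, CGH, sycl::write_only, sycl::no_init};
    CGH.single_task([=] { CounterAcc[0] = 0; });
  });

  // The main kernel then sees an initialized counter and uninitialized
  // partial sums; no host->device copy was required for either buffer.
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor Sums{PartialSums, CGH, sycl::write_only, sycl::no_init};
    sycl::accessor CounterAcc{Counter, CGH, sycl::read_write};
    CGH.parallel_for(sycl::range<1>(64), [=](sycl::id<1> I) {
      Sums[I] = static_cast<int>(I[0]);
      if (I[0] == 0)
        CounterAcc[0] += 1; // placeholder work touching the counter
    });
  });
  Q.wait();
}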

File tree

3 files changed: +91 −52 lines

sycl/include/sycl/ext/oneapi/reduction.hpp

Lines changed: 85 additions & 42 deletions
@@ -649,6 +649,13 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
     }
   }
 
+  template <class _T = T, int D = buffer_dim>
+  auto &getTempBuffer(size_t Size, handler &CGH) {
+    auto Buffer = std::make_shared<buffer<_T, D>>(range<1>(Size));
+    CGH.addReduction(Buffer);
+    return *Buffer;
+  }
+
   /// Returns an accessor accessing the memory that will hold the reduction
   /// partial sums.
   /// If \p Size is equal to one, then the reduction result is the final and
@@ -708,15 +715,28 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
     return {*CounterBuf, CGH};
   }
 
-  RedOutVar &getUserRedVar() { return MRedOut; }
-
-  static inline result_type *getOutPointer(const rw_accessor_type &OutAcc) {
-    return OutAcc.get_pointer().get();
+  // On discrete (vs. integrated) GPUs it's faster to initialize memory with an
+  // extra kernel than copy it from the host.
+  template <typename Name> auto getGroupsCounterAccDiscrete(handler &CGH) {
+    auto &Buf = getTempBuffer<int, 1>(1, CGH);
+    std::shared_ptr<detail::queue_impl> QueueCopy = CGH.MQueue;
+    auto Event = CGH.withAuxHandler(QueueCopy, [&](handler &InitHandler) {
+      auto Acc = accessor{Buf, InitHandler, sycl::write_only, sycl::no_init};
+      InitHandler.single_task<Name>([=]() { Acc[0] = 0; });
+    });
+    CGH.depends_on(Event);
+    return accessor{Buf, CGH};
   }
 
+  RedOutVar &getUserRedVar() { return MRedOut; }
+
   static inline result_type *getOutPointer(result_type *OutPtr) {
     return OutPtr;
   }
+  template <class AccessorType>
+  static inline result_type *getOutPointer(const AccessorType &OutAcc) {
+    return OutAcc.get_pointer().get();
+  }
 
 private:
   template <typename BufferT>
@@ -892,7 +912,7 @@ template <class KernelName> struct RangeFastAtomics;
 } // namespace main_krn
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
                                    const range<Dims> &Range,
                                    const nd_range<1> &NDRange,
                                    Reduction &Redu) {
@@ -927,29 +947,43 @@ void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
       Reducer.template atomic_combine(Reduction::getOutPointer(Out));
     }
   });
+  return Reduction::is_usm || Redu.initializeToIdentity();
 }
 
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct RangeFastReduce;
 } // namespace main_krn
+namespace init_krn {
+template <class KernelName> struct GroupCounter;
+}
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
                                   const range<Dims> &Range,
                                   const nd_range<1> &NDRange, Reduction &Redu) {
   constexpr size_t NElements = Reduction::num_elements;
   size_t WGSize = NDRange.get_local_range().size();
   size_t NWorkGroups = NDRange.get_group_range().size();
 
+  auto &Out = Redu.getUserRedVar();
+  if constexpr (Reduction::is_acc)
+    associateWithHandler(CGH, &Out, access::target::device);
+
+  auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
+  accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);
+
   bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
-  auto PartialSums =
-      Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
-  auto Out = (NWorkGroups == 1)
-                 ? PartialSums
-                 : Redu.getWriteAccForPartialReds(NElements, CGH);
+  using InitName =
+      __sycl_reduction_kernel<reduction::init_krn::GroupCounter, KernelName>;
+
+  // Integrated/discrete GPUs have different faster path.
   auto NWorkGroupsFinished =
-      Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
+      sycl::detail::getDeviceFromHandler(CGH)
+              .get_info<info::device::host_unified_memory>()
+          ? Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH)
+          : Redu.template getGroupsCounterAccDiscrete<InitName>(CGH);
+
   auto DoReducePartialSumsInLastWG =
       Reduction::template getReadWriteLocalAcc<int>(1, CGH);
 
@@ -967,50 +1001,57 @@ void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
       // reduce_over_group is only defined for each T, not for span<T, ...>
       size_t LID = NDId.get_local_id(0);
       for (int E = 0; E < NElements; ++E) {
-        Reducer.getElement(E) =
-            reduce_over_group(Group, Reducer.getElement(E), BOp);
-
+        auto &RedElem = Reducer.getElement(E);
+        RedElem = reduce_over_group(Group, RedElem, BOp);
         if (LID == 0) {
-          if (NWorkGroups == 1 && IsUpdateOfUserVar)
-            Reducer.getElement(E) =
-                BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
-
-          // if NWorkGroups == 1, then PartialsSum and Out point to same memory.
-          Reduction::getOutPointer(
-              PartialSums)[NDId.get_group_linear_id() * NElements + E] =
-              Reducer.getElement(E);
+          if (NWorkGroups == 1) {
+            auto &OutElem = Reduction::getOutPointer(Out)[E];
+            // Can avoid using partial sum and write the final result immediately.
+            if (IsUpdateOfUserVar)
+              RedElem = BOp(RedElem, OutElem);
+            OutElem = RedElem;
+          } else {
+            PartialSums[NDId.get_group_linear_id() * NElements + E] =
+                Reducer.getElement(E);
+          }
         }
       }
 
+      if (NWorkGroups == 1)
+        // We're done.
+        return;
+
       // Signal this work-group has finished after all values are reduced
       if (LID == 0) {
         auto NFinished =
             sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
                              access::address_space::global_space>(
                 NWorkGroupsFinished[0]);
-        DoReducePartialSumsInLastWG[0] =
-            ++NFinished == NWorkGroups && NWorkGroups > 1;
+        DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
       }
 
       sycl::detail::workGroupBarrier();
       if (DoReducePartialSumsInLastWG[0]) {
         // Reduce each result separately
-        // TODO: Opportunity to parallelize across elements
+        // TODO: Opportunity to parallelize across elements.
         for (int E = 0; E < NElements; ++E) {
+          auto &OutElem = Reduction::getOutPointer(Out)[E];
           auto LocalSum = Reducer.getIdentity();
           for (size_t I = LID; I < NWorkGroups; I += WGSize)
            LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
-          Reducer.getElement(E) = reduce_over_group(Group, LocalSum, BOp);
+          auto Result = reduce_over_group(Group, LocalSum, BOp);
 
           if (LID == 0) {
             if (IsUpdateOfUserVar)
-              Reducer.getElement(E) =
-                  BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
-            Reduction::getOutPointer(Out)[E] = Reducer.getElement(E);
+              Result = BOp(Result, OutElem);
+            OutElem = Result;
           }
         }
       }
     });
+
+  // We've updated user's variable, no extra work needed.
+  return false;
 }
 
 namespace reduction {
@@ -1019,7 +1060,7 @@ template <class KernelName> struct RangeBasic;
10191060
} // namespace main_krn
10201061
} // namespace reduction
10211062
template <typename KernelName, typename KernelType, int Dims, class Reduction>
1022-
void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
1063+
bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
10231064
const range<Dims> &Range,
10241065
const nd_range<1> &NDRange, Reduction &Redu) {
10251066
constexpr size_t NElements = Reduction::num_elements;
@@ -1125,10 +1166,13 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
11251166
}
11261167
}
11271168
});
1169+
return Reduction::is_usm || Reduction::is_dw_acc;
11281170
}
11291171

1172+
/// Returns "true" if the result has to be saved to user's variable by
1173+
/// reduSaveFinalResultToUserMem.
11301174
template <typename KernelName, typename KernelType, int Dims, class Reduction>
1131-
void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
1175+
bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
11321176
const range<Dims> &Range, size_t MaxWGSize,
11331177
uint32_t NumConcurrentWorkGroups, Reduction &Redu) {
11341178
size_t NWorkItems = Range.size();
@@ -1141,16 +1185,15 @@ void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
11411185
size_t NDRItems = NWorkGroups * WGSize;
11421186
nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
11431187

1144-
if constexpr (Reduction::has_fast_atomics) {
1145-
reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange,
1146-
Redu);
1147-
1148-
} else if constexpr (Reduction::has_fast_reduce) {
1149-
reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange,
1150-
Redu);
1151-
} else {
1152-
reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange, Redu);
1153-
}
1188+
if constexpr (Reduction::has_fast_atomics)
1189+
return reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range,
1190+
NDRange, Redu);
1191+
else if constexpr (Reduction::has_fast_reduce)
1192+
return reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range,
1193+
NDRange, Redu);
1194+
else
1195+
return reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
1196+
Redu);
11541197
}
11551198

11561199
namespace reduction {
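To make the control flow of the fast-reduce path easier to follow outside the reduction machinery, here is a self-contained sketch (assumed names and sizes, not the in-tree kernel) of the same "last work-group reduces the partial sums" technique: each group writes its partial sum, atomically bumps a global counter that was initialized on the device, and the group that increments it last combines all partial sums and writes the final result on the device, so no host-side post-processing pass is needed.

#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>

int main() {
  constexpr size_t N = 1024, WGSize = 64, NWorkGroups = N / WGSize;
  sycl::queue Q;
  std::vector<int> Data(N, 1);
  int Result = 0;

  sycl::buffer<int, 1> DataBuf{Data.data(), sycl::range<1>(N)};
  sycl::buffer<int, 1> OutBuf{&Result, sycl::range<1>(1)};
  sycl::buffer<int, 1> PartialSumsBuf{sycl::range<1>(NWorkGroups)};
  sycl::buffer<int, 1> CounterBuf{sycl::range<1>(1)};

  // Initialize the group counter on the device (cheap even on discrete GPUs).
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor C{CounterBuf, CGH, sycl::write_only, sycl::no_init};
    CGH.single_task([=] { C[0] = 0; });
  });

  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor In{DataBuf, CGH, sycl::read_only};
    sycl::accessor Out{OutBuf, CGH, sycl::read_write};
    sycl::accessor Partial{PartialSumsBuf, CGH, sycl::read_write, sycl::no_init};
    sycl::accessor Counter{CounterBuf, CGH, sycl::read_write};
    sycl::local_accessor<int, 1> IsLastWG{sycl::range<1>(1), CGH};

    CGH.parallel_for(sycl::nd_range<1>{N, WGSize}, [=](sycl::nd_item<1> It) {
      auto Group = It.get_group();
      size_t LID = It.get_local_id(0);

      // Per-group partial reduction.
      int Sum = sycl::reduce_over_group(Group, In[It.get_global_id(0)],
                                        sycl::plus<int>{});
      if (LID == 0) {
        Partial[It.get_group_linear_id()] = Sum;
        sycl::atomic_ref<int, sycl::memory_order::acq_rel,
                         sycl::memory_scope::device,
                         sycl::access::address_space::global_space>
            NFinished(Counter[0]);
        // The group that finishes last reduces everyone's partial sums.
        IsLastWG[0] = (NFinished.fetch_add(1) + 1 == NWorkGroups);
      }
      sycl::group_barrier(Group, sycl::memory_scope::device);

      if (IsLastWG[0]) {
        int Local = 0;
        for (size_t I = LID; I < NWorkGroups; I += WGSize)
          Local += Partial[I];
        int Total = sycl::reduce_over_group(Group, Local, sycl::plus<int>{});
        if (LID == 0)
          Out[0] = Total; // final result written on the device
      }
    });
  });

  sycl::host_accessor Res{OutBuf, sycl::read_only};
  std::cout << "sum = " << Res[0] << "\n"; // expected: 1024
}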

sycl/include/sycl/handler.hpp

Lines changed: 4 additions & 6 deletions
@@ -251,7 +251,7 @@ using sycl::detail::queue_impl;
 /// If we are given sycl::range and not sycl::nd_range we have more freedom in
 /// how to split the iteration space.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
                         const range<Dims> &Range, size_t MaxWGSize,
                         uint32_t NumConcurrentWorkGroups, Reduction &Redu);
 
@@ -1649,11 +1649,9 @@ class __SYCL_EXPORT handler {
     // for the device.
     size_t MaxWGSize =
         ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
-    ext::oneapi::detail::reduCGFuncForRange<KernelName>(
-        *this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
-    if (Reduction::is_usm ||
-        (Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
-        (!Reduction::has_fast_atomics && Reduction::is_dw_acc)) {
+    if (ext::oneapi::detail::reduCGFuncForRange<KernelName>(
+            *this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups,
+            Redu)) {
       this->finalize();
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
         ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(

sycl/source/detail/reduction.cpp

Lines changed: 2 additions & 4 deletions
@@ -56,10 +56,8 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
   device Dev = Queue->get_device();
   uint32_t NumThreads = Dev.get_info<info::device::max_compute_units>();
   // TODO: The heuristics here require additional tuning for various devices
-  // and vendors. For now this code assumes that execution units have about
-  // 8 working threads, which gives good results on some known/supported
-  // GPU devices.
-  if (Dev.is_gpu())
+  // and vendors. Also, it would be better to check vendor/generation/etc.
+  if (Dev.is_gpu() && Dev.get_info<info::device::host_unified_memory>())
     NumThreads *= 8;
   return NumThreads;
 }
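The effect of this heuristic change can be summarized in a small standalone helper (a sketch with an assumed name, mirroring the modified reduGetMaxNumConcurrentWorkGroups logic): integrated GPUs, detected via host_unified_memory, still get roughly one work-group per EU thread (max_compute_units * 8), while discrete GPUs get one work-group per EU.

#include <sycl/sycl.hpp>
#include <cstdint>
#include <iostream>

// Sketch of the heuristic after this change (assumed function name).
uint32_t maxConcurrentWorkGroups(const sycl::device &Dev) {
  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
  // Integrated GPUs (host-unified memory): assume ~8 threads per EU, i.e. one
  // work-group per EU thread. Discrete GPUs: one work-group per EU.
  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
    NumThreads *= 8;
  return NumThreads;
}

int main() {
  sycl::queue Q;
  std::cout << maxConcurrentWorkGroups(Q.get_device()) << "\n";
}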
