Skip to content

Commit 94c4b80

Browse files
[SYCL] Make reduction compatible with MSVC host compiler (#6601)
This PR addresses two problems:

1) MSVC has a bug handling the pattern below in its default mode (it is handled fine under /permissive-, though). The issue affected compilation when using MSVC as a host compiler. Simplified description of the problem:

    template <class Derived> class Base {
      using T = int;
    };
    template <class T> class A : public Base<A<T>> {
      // That's what we had in the codebase prior to this change. MSVC
      // complains here by default, accepts in "/permissive-".
      using T2 = T;
    };

    class Base2 {
      using T = int;
    };
    template <class T> class A2 : public Base2 {
      using T2 = T; // That's where the error has to be emitted.
    };

    int main() {
      A<int> a;
      A2<int> a2;
      return 0;
    }

2) constexpr variables are part of the lambda capture and result in incompatibilities between the clang device compiler and the MSVC host compiler. As such, don't use them when they appear inside kernel lambdas. Instead, make them regular variables and pay the price of an increased number of arguments to the kernel.
1 parent 36e7587 commit 94c4b80

File tree

1 file changed

+53
-58
lines changed

1 file changed

+53
-58
lines changed

sycl/include/sycl/ext/oneapi/reduction.hpp

Lines changed: 53 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -192,57 +192,52 @@ struct ReducerTraits<reducer<T, BinaryOperation, Dims, Extent, View, Subst>> {
192192
/// Also, for int32/64 types the atomic_combine() is lowered to
193193
/// sycl::atomic::fetch_add().
194194
template <class Reducer> class combiner {
195-
using T = typename ReducerTraits<Reducer>::type;
196-
using BinaryOperation = typename ReducerTraits<Reducer>::op;
195+
using Ty = typename ReducerTraits<Reducer>::type;
196+
using BinaryOp = typename ReducerTraits<Reducer>::op;
197197
static constexpr int Dims = ReducerTraits<Reducer>::dims;
198198
static constexpr size_t Extent = ReducerTraits<Reducer>::extent;
199199

200200
public:
201-
template <typename _T = T, int _Dims = Dims>
202-
enable_if_t<(_Dims == 0) &&
203-
sycl::detail::IsPlus<_T, BinaryOperation>::value &&
201+
template <typename _T = Ty, int _Dims = Dims>
202+
enable_if_t<(_Dims == 0) && sycl::detail::IsPlus<_T, BinaryOp>::value &&
204203
sycl::detail::is_geninteger<_T>::value>
205204
operator++() {
206-
static_cast<Reducer *>(this)->combine(static_cast<T>(1));
205+
static_cast<Reducer *>(this)->combine(static_cast<_T>(1));
207206
}
208207

209-
template <typename _T = T, int _Dims = Dims>
210-
enable_if_t<(_Dims == 0) &&
211-
sycl::detail::IsPlus<_T, BinaryOperation>::value &&
208+
template <typename _T = Ty, int _Dims = Dims>
209+
enable_if_t<(_Dims == 0) && sycl::detail::IsPlus<_T, BinaryOp>::value &&
212210
sycl::detail::is_geninteger<_T>::value>
213211
operator++(int) {
214-
static_cast<Reducer *>(this)->combine(static_cast<T>(1));
212+
static_cast<Reducer *>(this)->combine(static_cast<_T>(1));
215213
}
216214

217-
template <typename _T = T, int _Dims = Dims>
218-
enable_if_t<(_Dims == 0) && sycl::detail::IsPlus<_T, BinaryOperation>::value>
215+
template <typename _T = Ty, int _Dims = Dims>
216+
enable_if_t<(_Dims == 0) && sycl::detail::IsPlus<_T, BinaryOp>::value>
219217
operator+=(const _T &Partial) {
220218
static_cast<Reducer *>(this)->combine(Partial);
221219
}
222220

223-
template <typename _T = T, int _Dims = Dims>
224-
enable_if_t<(_Dims == 0) &&
225-
sycl::detail::IsMultiplies<_T, BinaryOperation>::value>
221+
template <typename _T = Ty, int _Dims = Dims>
222+
enable_if_t<(_Dims == 0) && sycl::detail::IsMultiplies<_T, BinaryOp>::value>
226223
operator*=(const _T &Partial) {
227224
static_cast<Reducer *>(this)->combine(Partial);
228225
}
229226

230-
template <typename _T = T, int _Dims = Dims>
231-
enable_if_t<(_Dims == 0) && sycl::detail::IsBitOR<_T, BinaryOperation>::value>
227+
template <typename _T = Ty, int _Dims = Dims>
228+
enable_if_t<(_Dims == 0) && sycl::detail::IsBitOR<_T, BinaryOp>::value>
232229
operator|=(const _T &Partial) {
233230
static_cast<Reducer *>(this)->combine(Partial);
234231
}
235232

236-
template <typename _T = T, int _Dims = Dims>
237-
enable_if_t<(_Dims == 0) &&
238-
sycl::detail::IsBitXOR<_T, BinaryOperation>::value>
233+
template <typename _T = Ty, int _Dims = Dims>
234+
enable_if_t<(_Dims == 0) && sycl::detail::IsBitXOR<_T, BinaryOp>::value>
239235
operator^=(const _T &Partial) {
240236
static_cast<Reducer *>(this)->combine(Partial);
241237
}
242238

243-
template <typename _T = T, int _Dims = Dims>
244-
enable_if_t<(_Dims == 0) &&
245-
sycl::detail::IsBitAND<_T, BinaryOperation>::value>
239+
template <typename _T = Ty, int _Dims = Dims>
240+
enable_if_t<(_Dims == 0) && sycl::detail::IsBitAND<_T, BinaryOp>::value>
246241
operator&=(const _T &Partial) {
247242
static_cast<Reducer *>(this)->combine(Partial);
248243
}
@@ -266,53 +261,53 @@ template <class Reducer> class combiner {
266261
}
267262
}
268263

269-
template <class _T, access::address_space Space, class BinaryOperation>
264+
template <class _T, access::address_space Space, class BinaryOp>
270265
static constexpr bool BasicCheck =
271-
std::is_same<typename remove_AS<_T>::type, T>::value &&
266+
std::is_same<typename remove_AS<_T>::type, Ty>::value &&
272267
(Space == access::address_space::global_space ||
273268
Space == access::address_space::local_space);
274269

275270
public:
276271
/// Atomic ADD operation: *ReduVarPtr += MValue;
277272
template <access::address_space Space = access::address_space::global_space,
278-
typename _T = T, class _BinaryOperation = BinaryOperation>
273+
typename _T = Ty, class _BinaryOperation = BinaryOp>
279274
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
280-
(IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
281-
IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
282-
sycl::detail::IsPlus<T, _BinaryOperation>::value>
275+
(IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
276+
IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
277+
sycl::detail::IsPlus<_T, _BinaryOperation>::value>
283278
atomic_combine(_T *ReduVarPtr) const {
284279
atomic_combine_impl<Space>(
285280
ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_add(Val); });
286281
}
287282

288283
/// Atomic BITWISE OR operation: *ReduVarPtr |= MValue;
289284
template <access::address_space Space = access::address_space::global_space,
290-
typename _T = T, class _BinaryOperation = BinaryOperation>
285+
typename _T = Ty, class _BinaryOperation = BinaryOp>
291286
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
292-
IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
293-
sycl::detail::IsBitOR<T, _BinaryOperation>::value>
287+
IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
288+
sycl::detail::IsBitOR<_T, _BinaryOperation>::value>
294289
atomic_combine(_T *ReduVarPtr) const {
295290
atomic_combine_impl<Space>(
296291
ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_or(Val); });
297292
}
298293

299294
/// Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;
300295
template <access::address_space Space = access::address_space::global_space,
301-
typename _T = T, class _BinaryOperation = BinaryOperation>
296+
typename _T = Ty, class _BinaryOperation = BinaryOp>
302297
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
303-
IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
304-
sycl::detail::IsBitXOR<T, _BinaryOperation>::value>
298+
IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
299+
sycl::detail::IsBitXOR<_T, _BinaryOperation>::value>
305300
atomic_combine(_T *ReduVarPtr) const {
306301
atomic_combine_impl<Space>(
307302
ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_xor(Val); });
308303
}
309304

310305
/// Atomic BITWISE AND operation: *ReduVarPtr &= MValue;
311306
template <access::address_space Space = access::address_space::global_space,
312-
typename _T = T, class _BinaryOperation = BinaryOperation>
313-
enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
314-
IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
315-
sycl::detail::IsBitAND<T, _BinaryOperation>::value &&
307+
typename _T = Ty, class _BinaryOperation = BinaryOp>
308+
enable_if_t<std::is_same<typename remove_AS<_T>::type, _T>::value &&
309+
IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value &&
310+
sycl::detail::IsBitAND<_T, _BinaryOperation>::value &&
316311
(Space == access::address_space::global_space ||
317312
Space == access::address_space::local_space)>
318313
atomic_combine(_T *ReduVarPtr) const {
@@ -322,23 +317,23 @@ template <class Reducer> class combiner {
322317

323318
/// Atomic MIN operation: *ReduVarPtr = sycl::minimum(*ReduVarPtr, MValue);
324319
template <access::address_space Space = access::address_space::global_space,
325-
typename _T = T, class _BinaryOperation = BinaryOperation>
320+
typename _T = Ty, class _BinaryOperation = BinaryOp>
326321
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
327-
(IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
328-
IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
329-
sycl::detail::IsMinimum<T, _BinaryOperation>::value>
322+
(IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
323+
IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
324+
sycl::detail::IsMinimum<_T, _BinaryOperation>::value>
330325
atomic_combine(_T *ReduVarPtr) const {
331326
atomic_combine_impl<Space>(
332327
ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_min(Val); });
333328
}
334329

335330
/// Atomic MAX operation: *ReduVarPtr = sycl::maximum(*ReduVarPtr, MValue);
336331
template <access::address_space Space = access::address_space::global_space,
337-
typename _T = T, class _BinaryOperation = BinaryOperation>
332+
typename _T = Ty, class _BinaryOperation = BinaryOp>
338333
enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
339-
(IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
340-
IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
341-
sycl::detail::IsMaximum<T, _BinaryOperation>::value>
334+
(IsReduOptForFastAtomicFetch<_T, _BinaryOperation>::value ||
335+
IsReduOptForAtomic64Op<_T, _BinaryOperation>::value) &&
336+
sycl::detail::IsMaximum<_T, _BinaryOperation>::value>
342337
atomic_combine(_T *ReduVarPtr) const {
343338
atomic_combine_impl<Space>(
344339
ReduVarPtr, [](auto Ref, auto Val) { return Ref.fetch_max(Val); });
@@ -928,7 +923,7 @@ bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
928923
const range<Dims> &Range,
929924
const nd_range<1> &NDRange,
930925
Reduction &Redu) {
931-
constexpr size_t NElements = Reduction::num_elements;
926+
size_t NElements = Reduction::num_elements;
932927
auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
933928
auto GroupSum = Reduction::getReadWriteLocalAcc(NElements, CGH);
934929
using Name = __sycl_reduction_kernel<reduction::main_krn::RangeFastAtomics,
@@ -976,7 +971,7 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction>
976971
bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
977972
const range<Dims> &Range,
978973
const nd_range<1> &NDRange, Reduction &Redu) {
979-
constexpr size_t NElements = Reduction::num_elements;
974+
size_t NElements = Reduction::num_elements;
980975
size_t WGSize = NDRange.get_local_range().size();
981976
size_t NWorkGroups = NDRange.get_group_range().size();
982977

@@ -1078,7 +1073,7 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction>
10781073
bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
10791074
const range<Dims> &Range,
10801075
const nd_range<1> &NDRange, Reduction &Redu) {
1081-
constexpr size_t NElements = Reduction::num_elements;
1076+
size_t NElements = Reduction::num_elements;
10821077
size_t WGSize = NDRange.get_local_range().size();
10831078
size_t NWorkGroups = NDRange.get_group_range().size();
10841079

@@ -1230,7 +1225,7 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction>
12301225
void reduCGFuncForNDRangeBothFastReduceAndAtomics(
12311226
handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
12321227
Reduction &, typename Reduction::rw_accessor_type Out) {
1233-
constexpr size_t NElements = Reduction::num_elements;
1228+
size_t NElements = Reduction::num_elements;
12341229
using Name = __sycl_reduction_kernel<
12351230
reduction::main_krn::NDRangeBothFastReduceAndAtomics, KernelName>;
12361231
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
@@ -1266,7 +1261,7 @@ void reduCGFuncForNDRangeFastAtomicsOnly(
12661261
handler &CGH, bool IsPow2WG, KernelType KernelFunc,
12671262
const nd_range<Dims> &Range, Reduction &,
12681263
typename Reduction::rw_accessor_type Out) {
1269-
constexpr size_t NElements = Reduction::num_elements;
1264+
size_t NElements = Reduction::num_elements;
12701265
size_t WGSize = Range.get_local_range().size();
12711266

12721267
// Use local memory to reduce elements in work-groups into zero-th element.
@@ -1345,7 +1340,7 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction>
13451340
void reduCGFuncForNDRangeFastReduceOnly(
13461341
handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
13471342
Reduction &Redu, typename Reduction::rw_accessor_type Out) {
1348-
constexpr size_t NElements = Reduction::num_elements;
1343+
size_t NElements = Reduction::num_elements;
13491344
size_t NWorkGroups = Range.get_group_range().size();
13501345
bool IsUpdateOfUserVar =
13511346
!Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
@@ -1392,7 +1387,7 @@ void reduCGFuncForNDRangeBasic(handler &CGH, bool IsPow2WG,
13921387
KernelType KernelFunc,
13931388
const nd_range<Dims> &Range, Reduction &Redu,
13941389
typename Reduction::rw_accessor_type Out) {
1395-
constexpr size_t NElements = Reduction::num_elements;
1390+
size_t NElements = Reduction::num_elements;
13961391
size_t WGSize = Range.get_local_range().size();
13971392
size_t NWorkGroups = Range.get_group_range().size();
13981393

@@ -1477,7 +1472,7 @@ void reduAuxCGFuncFastReduceImpl(handler &CGH, bool UniformWG,
14771472
size_t NWorkItems, size_t NWorkGroups,
14781473
size_t WGSize, Reduction &Redu, InputT In,
14791474
OutputT Out) {
1480-
constexpr size_t NElements = Reduction::num_elements;
1475+
size_t NElements = Reduction::num_elements;
14811476
using Name =
14821477
__sycl_reduction_kernel<reduction::aux_krn::FastReduce, KernelName>;
14831478
bool IsUpdateOfUserVar =
@@ -1523,7 +1518,7 @@ void reduAuxCGFuncNoFastReduceNorAtomicImpl(handler &CGH, bool UniformPow2WG,
15231518
size_t NWorkGroups, size_t WGSize,
15241519
Reduction &Redu, InputT In,
15251520
OutputT Out) {
1526-
constexpr size_t NElements = Reduction::num_elements;
1521+
size_t NElements = Reduction::num_elements;
15271522
bool IsUpdateOfUserVar =
15281523
!Reduction::is_usm && !Redu.initializeToIdentity() && NWorkGroups == 1;
15291524

@@ -1642,7 +1637,7 @@ reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
16421637
template <typename KernelName, class Reduction>
16431638
std::enable_if_t<Reduction::is_usm>
16441639
reduSaveFinalResultToUserMem(handler &CGH, Reduction &Redu) {
1645-
constexpr size_t NElements = Reduction::num_elements;
1640+
size_t NElements = Reduction::num_elements;
16461641
auto InAcc = Redu.getReadAccToPreviousPartialReds(CGH);
16471642
auto UserVarPtr = Redu.getUserRedVar();
16481643
bool IsUpdateOfUserVar = !Redu.initializeToIdentity();
@@ -2120,7 +2115,7 @@ void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
21202115
static_assert(
21212116
Reduction::has_float64_atomics,
21222117
"Only suitable for reductions that have FP64 atomic operations.");
2123-
constexpr size_t NElements = Reduction::num_elements;
2118+
size_t NElements = Reduction::num_elements;
21242119
using Name =
21252120
__sycl_reduction_kernel<reduction::main_krn::NDRangeAtomic64, KernelName>;
21262121
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {

0 commit comments

Comments
 (0)