Merge from 'sycl' to 'sycl-web' (3 commits)

iclsrc · iclsrc · commit 712f0a23b0fa · 2022-11-15T02:25:11.000-08:00
diff --git a/.github/workflows/sycl_containers.yaml b/.github/workflows/sycl_containers.yaml
@@ -190,11 +190,10 @@ jobs:
         id: deps
         run: |
           DEPS=`cat devops/dependencies.json`
-          DEPS="${DEPS//'%'/'%25'}"
-          DEPS="${DEPS//$'\n'/'%0A'}"
-          DEPS="${DEPS//$'\r'/'%0D'}"
+          DEPS="${DEPS//$'\r'/''}"
+          DEPS="${DEPS//$'\n'/' '}"
           echo $DEPS
-          echo "::set-output name=deps::$DEPS"
+          echo "deps=$DEPS" >>$GITHUB_OUTPUT
       - name: Build and Push Container
         uses: ./devops/actions/build_container
         with:
@@ -228,11 +227,10 @@ jobs:
         id: deps
         run: |
           DEPS=`cat devops/dependencies.json`
-          DEPS="${DEPS//'%'/'%25'}"
-          DEPS="${DEPS//$'\n'/'%0A'}"
-          DEPS="${DEPS//$'\r'/'%0D'}"
+          DEPS="${DEPS//$'\r'/''}"
+          DEPS="${DEPS//$'\n'/' '}"
           echo $DEPS
-          echo "::set-output name=deps::$DEPS"
+          echo "deps=$DEPS" >>$GITHUB_OUTPUT
       - name: Build and Push Container
         uses: ./devops/actions/build_container
         with:
diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl_nightly.yml
@@ -7,6 +7,7 @@ on:
   pull_request:
     paths:
       - 'devops/containers/ubuntu2004_preinstalled.Dockerfile'
+      - 'devops/containers/ubuntu2204_preinstalled.Dockerfile'
       - '.github/workflows/sycl_nightly.yml'
 
 jobs:
@@ -26,6 +27,17 @@ jobs:
       build_artifact_suffix: default
       build_configure_extra_args: ''
 
+  ubuntu2204_build_test:  # Build-and-test run on the ubuntu2204_build image, no extra configure args.
+    if: github.repository == 'intel/llvm'  # Runs only in the intel/llvm repository.
+    uses: ./.github/workflows/sycl_linux_build_and_test.yml  # Reusable workflow from this repository.
+    needs: test_matrix  # Starts after the test_matrix job.
+    secrets: inherit  # The called workflow receives this workflow's secrets.
+    with:
+      build_cache_root: "/__w/"
+      build_artifact_suffix: default  # NOTE(review): the job above also uses 'default'; confirm the artifact names cannot collide.
+      build_configure_extra_args: ''
+      build_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest"
+
   ubuntu2004_opaque_pointers_build_test:
     if: github.repository == 'intel/llvm'
     uses: ./.github/workflows/sycl_linux_build_and_test.yml
@@ -37,6 +49,18 @@ jobs:
       build_artifact_suffix: opaque_pointers
       build_configure_extra_args: "--hip --cuda --enable-esimd-emulator --cmake-opt=-DDPCPP_ENABLE_OPAQUE_POINTERS=TRUE"
 
+  ubuntu2204_opaque_pointers_build_test:  # Opaque-pointers build-and-test run on the ubuntu2204_build image.
+    if: github.repository == 'intel/llvm'  # Runs only in the intel/llvm repository.
+    uses: ./.github/workflows/sycl_linux_build_and_test.yml  # Reusable workflow from this repository.
+    needs: test_matrix  # Starts after the test_matrix job.
+    secrets: inherit  # The called workflow receives this workflow's secrets.
+    with:
+      build_cache_root: "/__w/"
+      build_cache_suffix: opaque_pointers
+      build_artifact_suffix: opaque_pointers  # NOTE(review): same suffix as ubuntu2004_opaque_pointers_build_test; confirm the artifact names cannot collide.
+      build_configure_extra_args: "--hip --cuda --enable-esimd-emulator --cmake-opt=-DDPCPP_ENABLE_OPAQUE_POINTERS=TRUE"  # Same arguments as the ubuntu2004 opaque-pointers job.
+      build_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest"
+
   windows_default:
     name: Windows
     if: github.repository == 'intel/llvm'
@@ -78,3 +102,40 @@ jobs:
         tags: |
           ghcr.io/${{ github.repository }}/sycl_ubuntu2004_nightly:no-drivers-${{ github.sha }}
           ghcr.io/${{ github.repository }}/sycl_ubuntu2004_nightly:no-drivers
+
+  ubuntu2204_docker_build_push:  # Builds the sycl_ubuntu2204_nightly images, with and without drivers.
+    if: github.repository == 'intel/llvm'  # Runs only in the intel/llvm repository.
+    runs-on: ubuntu-latest
+    needs: ubuntu2204_build_test  # Starts after the Ubuntu 22.04 build-and-test job.
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions/download-artifact@v3
+      with:
+        name: sycl_linux_default  # NOTE(review): presumably the artifact produced by ubuntu2204_build_test; confirm.
+        path: devops/  # The artifact is downloaded into devops/.
+    - name: Build and Push Container (with drivers)  # Base image ubuntu2204_intel_drivers; tagged <sha> and latest.
+      uses: ./devops/actions/build_container  # Local action from this repository.
+      with:
+        push: ${{ github.event_name != 'pull_request' }}  # false for pull_request events, true otherwise.
+        file: ubuntu2204_preinstalled  # NOTE(review): presumably selects devops/containers/ubuntu2204_preinstalled.Dockerfile; confirm in the action.
+        username: ${{ github.repository_owner }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+        build-args: |
+          base_image=ghcr.io/intel/llvm/ubuntu2204_intel_drivers
+          base_tag=latest
+        tags: |
+          ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:${{ github.sha }}
+          ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:latest
+    - name: Build and Push Container (no drivers)  # Same build on the ubuntu2204_base image; tags carry a no-drivers marker.
+      uses: ./devops/actions/build_container  # Local action from this repository.
+      with:
+        push: ${{ github.event_name != 'pull_request' }}  # false for pull_request events, true otherwise.
+        file: ubuntu2204_preinstalled  # Same Dockerfile name as the step above.
+        username: ${{ github.repository_owner }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+        build-args: |
+          base_image=ghcr.io/intel/llvm/ubuntu2204_base
+          base_tag=latest
+        tags: |
+          ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:no-drivers-${{ github.sha }}
+          ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:no-drivers
diff --git a/.github/workflows/sycl_precommit.yml b/.github/workflows/sycl_precommit.yml
@@ -10,6 +10,8 @@ on:
     - '.github/CODEOWNERS'
     - '.github/workflows/sycl_update_gpu_driver.yml'
     - '.github/workflows/sycl_containers.yaml'
+    - '.github/workflows/sycl_nightly.yml'
+    - '.github/workflows/sycl_post_commit.yml'
     - '.github/workflows/sycl_windows_build_and_test.yml'
     - '.github/workflows/sycl_macos_build_and_test.yml'
     - 'devops/containers/**'
diff --git a/devops/containers/ubuntu2204_preinstalled.Dockerfile b/devops/containers/ubuntu2204_preinstalled.Dockerfile
@@ -0,0 +1,14 @@
+ARG base_tag=latest
+ARG base_image=ghcr.io/intel/llvm/ubuntu2204_intel_drivers
+# Base image and tag are build arguments; the values above are the defaults.
+FROM $base_image:$base_tag
+# Copy the entrypoint script; ADD auto-extracts the local tar.xz into /opt/sycl.
+COPY scripts/drivers_entrypoint.sh /drivers_entrypoint.sh
+RUN mkdir -p /opt/sycl
+ADD llvm_sycl.tar.xz /opt/sycl
+# Prepend the unpacked bin/ and lib/ directories (key=value form of ENV).
+ENV PATH=/opt/sycl/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/sycl/lib:$LD_LIBRARY_PATH
+# Containers start through the copied entrypoint script, run with bash.
+ENTRYPOINT ["/bin/bash", "/drivers_entrypoint.sh"]
+
diff --git a/sycl/include/sycl/info/info_desc.hpp b/sycl/include/sycl/info/info_desc.hpp
@@ -98,7 +98,7 @@ namespace device {
 #include <sycl/info/device_traits_deprecated.def>
 #undef __SYCL_PARAM_TRAITS_DEPRECATED
 
-template <int Dimensions> struct max_work_item_sizes;
+template <int Dimensions = 3> struct max_work_item_sizes;
 #define __SYCL_PARAM_TRAITS_TEMPLATE_SPEC(DescType, Desc, ReturnT, PiCode)     \
   template <> struct Desc {                                                    \
     using return_type = ReturnT;                                               \
diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp
@@ -1571,48 +1571,6 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
   }
 };
 
-// Auto-dispatch. Must be the last one.
-template <> struct NDRangeReduction<reduction::strategy::auto_select> {
-  // Some readability aliases, to increase signal/noise ratio below.
-  template <reduction::strategy Strategy>
-  using Impl = NDRangeReduction<Strategy>;
-  using S = reduction::strategy;
-
-  template <typename KernelName, int Dims, typename PropertiesT,
-            typename KernelType, typename Reduction>
-  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
-                  nd_range<Dims> NDRange, PropertiesT &Properties,
-                  Reduction &Redu, KernelType &KernelFunc) {
-    auto Delegate = [&](auto Impl) {
-      Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
-                                    KernelFunc);
-    };
-
-    if constexpr (Reduction::has_float64_atomics) {
-      if (getDeviceFromHandler(CGH).has(aspect::atomic64))
-        return Delegate(Impl<S::group_reduce_and_atomic_cross_wg>{});
-
-      if constexpr (Reduction::has_fast_reduce)
-        return Delegate(Impl<S::group_reduce_and_multiple_kernels>{});
-      else
-        return Delegate(Impl<S::basic>{});
-    } else if constexpr (Reduction::has_fast_atomics) {
-      if constexpr (Reduction::has_fast_reduce) {
-        return Delegate(Impl<S::group_reduce_and_atomic_cross_wg>{});
-      } else {
-        return Delegate(Impl<S::local_mem_tree_and_atomic_cross_wg>{});
-      }
-    } else {
-      if constexpr (Reduction::has_fast_reduce)
-        return Delegate(Impl<S::group_reduce_and_multiple_kernels>{});
-      else
-        return Delegate(Impl<S::basic>{});
-    }
-
-    assert(false && "Must be unreachable!");
-  }
-};
-
 /// For the given 'Reductions' types pack and indices enumerating them this
 /// function either creates new temporary accessors for partial sums (if IsOneWG
 /// is false) or returns user's accessor/USM-pointer if (IsOneWG is true).
@@ -2230,21 +2188,109 @@ tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
   return {std::get<Is>(std::move(Tuple))...};
 }
 
+template <> struct NDRangeReduction<reduction::strategy::multi> {
+  template <typename KernelName, int Dims, typename PropertiesT,
+            typename... RestT>
+  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
+                  nd_range<Dims> NDRange, PropertiesT &Properties,
+                  RestT... Rest) { // Rest: the reductions, then the kernel
+    std::tuple<RestT...> ArgsTuple(Rest...);
+    constexpr size_t NumArgs = sizeof...(RestT);
+    auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple); // last argument
+    auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
+    auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
+    // Throws sycl::runtime_error when the local range exceeds MaxWGSize.
+    size_t LocalMemPerWorkItem = reduGetMemPerWorkItem(ReduTuple, ReduIndices);
+    // TODO: the maximal work-group size is currently determined for the given
+    // queue/device; it would be safer to query the kernel compiled for that
+    // device instead.
+    size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
+    if (NDRange.get_local_range().size() > MaxWGSize)
+      throw sycl::runtime_error("The implementation handling parallel_for with"
+                                " reduction requires work group size not bigger"
+                                " than " +
+                                    std::to_string(MaxWGSize),
+                                PI_ERROR_INVALID_WORK_GROUP_SIZE);
+    // Call reduCGFuncMulti on CGH and finalize it before the auxiliary passes.
+    reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
+                                ReduIndices);
+    reduction::finalizeHandler(CGH);
+    // Start from the work-group count; loop until at most one item remains.
+    size_t NWorkItems = NDRange.get_group_range().size();
+    while (NWorkItems > 1) {
+      reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
+        NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
+            AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
+      });
+    } // end while (NWorkItems > 1)
+  }
+};
+
+// Auto-dispatch: forwards to another strategy's run(). Must be the last one.
+template <> struct NDRangeReduction<reduction::strategy::auto_select> {
+  // Readability aliases that keep the dispatch code below short.
+  template <reduction::strategy Strategy>
+  using Impl = NDRangeReduction<Strategy>;
+  using Strat = reduction::strategy;
+  // Single-reduction overload: chooses a strategy from Reduction's traits.
+  template <typename KernelName, int Dims, typename PropertiesT,
+            typename KernelType, typename Reduction>
+  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
+                  nd_range<Dims> NDRange, PropertiesT &Properties,
+                  Reduction &Redu, KernelType &KernelFunc) {
+    auto Delegate = [&](auto Impl) {
+      Impl.template run<KernelName>(CGH, Queue, NDRange, Properties, Redu,
+                                    KernelFunc);
+    };
+    // The atomic64 aspect is a run-time device query, hence the plain 'if'.
+    if constexpr (Reduction::has_float64_atomics) {
+      if (getDeviceFromHandler(CGH).has(aspect::atomic64))
+        return Delegate(Impl<Strat::group_reduce_and_atomic_cross_wg>{});
+      // No atomic64 on the device: same choice as the final 'else' branch.
+      if constexpr (Reduction::has_fast_reduce)
+        return Delegate(Impl<Strat::group_reduce_and_multiple_kernels>{});
+      else
+        return Delegate(Impl<Strat::basic>{});
+    } else if constexpr (Reduction::has_fast_atomics) {
+      if constexpr (Reduction::has_fast_reduce) {
+        return Delegate(Impl<Strat::group_reduce_and_atomic_cross_wg>{});
+      } else {
+        return Delegate(Impl<Strat::local_mem_tree_and_atomic_cross_wg>{});
+      }
+    } else {
+      if constexpr (Reduction::has_fast_reduce)
+        return Delegate(Impl<Strat::group_reduce_and_multiple_kernels>{});
+      else
+        return Delegate(Impl<Strat::basic>{});
+    }
+
+    assert(false && "Must be unreachable!");
+  }
+  template <typename KernelName, int Dims, typename PropertiesT,
+            typename... RestT>
+  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
+                  nd_range<Dims> NDRange, PropertiesT &Properties,
+                  RestT... Rest) { // variadic overload: always 'multi'
+    return Impl<Strat::multi>::run<KernelName>(CGH, Queue, NDRange, Properties,
+                                               Rest...);
+  }
+};
+
 template <typename KernelName, reduction::strategy Strategy, int Dims,
-          typename PropertiesT, typename KernelType, typename Reduction>
+          typename PropertiesT, typename... RestT>
 void reduction_parallel_for(handler &CGH,
                             std::shared_ptr<detail::queue_impl> Queue,
                             nd_range<Dims> NDRange, PropertiesT Properties,
-                            Reduction Redu, KernelType KernelFunc) {
-  NDRangeReduction<Strategy>::template run<KernelName>(
-      CGH, Queue, NDRange, Properties, Redu, KernelFunc);
+                            RestT... Rest) {
+  NDRangeReduction<Strategy>::template run<KernelName>(CGH, Queue, NDRange,
+                                                       Properties, Rest...);
 }
 
 __SYCL_EXPORT uint32_t
 reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
 
-template <typename KernelName, int Dims, typename PropertiesT,
-          typename KernelType, typename Reduction>
+template <typename KernelName, reduction::strategy Strategy, int Dims,
+          typename PropertiesT, typename KernelType, typename Reduction>
 void reduction_parallel_for(handler &CGH,
                             std::shared_ptr<detail::queue_impl> Queue,
                             range<Dims> Range, PropertiesT Properties,
@@ -2303,7 +2349,10 @@ void reduction_parallel_for(handler &CGH,
       KernelFunc(getDelinearizedId(Range, I), Reducer);
   };
 
-  constexpr auto Strategy = [&]() {
+  constexpr auto StrategyToUse = [&]() {
+    if constexpr (Strategy != reduction::strategy::auto_select)
+      return Strategy;
+
     if constexpr (Reduction::has_fast_reduce)
       return reduction::strategy::group_reduce_and_last_wg_detection;
     else if constexpr (Reduction::has_fast_atomics)
@@ -2312,57 +2361,8 @@ void reduction_parallel_for(handler &CGH,
       return reduction::strategy::range_basic;
   }();
 
-  reduction_parallel_for<KernelName, Strategy>(CGH, Queue, NDRange, Properties,
-                                               Redu, UpdatedKernelFunc);
-}
-
-template <> struct NDRangeReduction<reduction::strategy::multi> {
-  template <typename KernelName, int Dims, typename PropertiesT,
-            typename... RestT>
-  static void run(handler &CGH, std::shared_ptr<detail::queue_impl> &Queue,
-                  nd_range<Dims> NDRange, PropertiesT &Properties,
-                  RestT... Rest) {
-    std::tuple<RestT...> ArgsTuple(Rest...);
-    constexpr size_t NumArgs = sizeof...(RestT);
-    auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
-    auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
-    auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
-
-    size_t LocalMemPerWorkItem = reduGetMemPerWorkItem(ReduTuple, ReduIndices);
-    // TODO: currently the maximal work group size is determined for the given
-    // queue/device, while it is safer to use queries to the kernel compiled
-    // for the device.
-    size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
-    if (NDRange.get_local_range().size() > MaxWGSize)
-      throw sycl::runtime_error("The implementation handling parallel_for with"
-                                " reduction requires work group size not bigger"
-                                " than " +
-                                    std::to_string(MaxWGSize),
-                                PI_ERROR_INVALID_WORK_GROUP_SIZE);
-
-    reduCGFuncMulti<KernelName>(CGH, KernelFunc, NDRange, Properties, ReduTuple,
-                                ReduIndices);
-    reduction::finalizeHandler(CGH);
-
-    size_t NWorkItems = NDRange.get_group_range().size();
-    while (NWorkItems > 1) {
-      reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
-        NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
-            AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
-      });
-    } // end while (NWorkItems > 1)
-  }
-};
-
-template <typename KernelName, int Dims, typename PropertiesT,
-          typename... RestT>
-void reduction_parallel_for(handler &CGH,
-                            std::shared_ptr<detail::queue_impl> Queue,
-                            nd_range<Dims> NDRange, PropertiesT Properties,
-                            RestT... Rest) {
-  constexpr auto Strategy = reduction::strategy::multi;
-  NDRangeReduction<Strategy>::template run<KernelName>(CGH, Queue, NDRange,
-                                                       Properties, Rest...);
+  reduction_parallel_for<KernelName, StrategyToUse>(
+      CGH, Queue, NDRange, Properties, Redu, UpdatedKernelFunc);
 }
 } // namespace detail
 
diff --git a/sycl/include/sycl/reduction_forward.hpp b/sycl/include/sycl/reduction_forward.hpp