Skip to content

Commit 4cc64bd

Browse files
authored
[SYCL][COMPAT] Add max_active_work_groups_per_cu (#15802)
This new API returns the maximum work-groups which can occupy a single compute unit of the given device.
1 parent 242f16b commit 4cc64bd

File tree

3 files changed

+195
-0
lines changed

3 files changed

+195
-0
lines changed

sycl/doc/syclcompat/README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1584,6 +1584,27 @@ public:
15841584
} // namespace syclcompat
15851585
```
15861586
1587+
SYCLcompat provides a wrapper API `max_active_work_groups_per_cu` providing
1588+
'work-groups per compute unit' semantics. It is templated on the kernel
1589+
functor, and takes a work-group size represented by either `sycl::range<Dim>`
1590+
or `syclcompat::dim3`, the local memory size in bytes, and an optional queue.
1591+
The function returns the maximum number of work-groups which can be executed
1592+
per compute unit. It may return *zero* even when below resource limits (i.e.
1593+
returning `0` does not imply the kernel cannot execute).
1594+
```cpp
1595+
namespace syclcompat {
1596+
template <class KernelName>
1597+
size_t max_active_work_groups_per_cu(
1598+
syclcompat::dim3 wg_dim3, size_t local_mem_size,
1599+
sycl::queue queue = syclcompat::get_default_queue());
1600+
1601+
template <class KernelName, int RangeDim>
1602+
size_t max_active_work_groups_per_cu(
1603+
sycl::range<RangeDim> wg_range, size_t local_mem_size,
1604+
sycl::queue queue = syclcompat::get_default_queue());
1605+
}
1606+
```
1607+
15871608
To assist machine translation, helper aliases are provided for inlining and
15881609
alignment attributes. The class template declarations `sycl_compat_kernel_name`
15891610
and `sycl_compat_kernel_scalar` are used to assist automatic generation of

sycl/include/syclcompat/util.hpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939

4040
#include <syclcompat/math.hpp>
4141
#include <syclcompat/memory.hpp>
42+
#include <syclcompat/dims.hpp>
4243

4344
#if defined(__NVPTX__)
4445
#include <sycl/ext/oneapi/experimental/cuda/masked_shuffles.hpp>
@@ -919,6 +920,46 @@ class group : public group_base<dimensions> {
919920
};
920921
} // namespace experimental
921922

923+
// Calculate the maximum number of active work-groups per compute unit for
// the given kernel, work-group shape and local-memory footprint.
// \tparam [in] KernelName SYCL kernel name to calculate for
// \param [in] wg_dim3 dim3 representing work-group shape
// \param [in] local_mem_size Local memory usage per work-group in bytes
// \param [in] queue SYCL queue used to execute kernel
// \return size_t representing maximum work-groups per compute unit.
//         Note: integer division truncates, so this may be zero even when
//         the kernel can execute (i.e. 0 does not mean "cannot launch").
template <class KernelName>
size_t max_active_work_groups_per_cu(
    syclcompat::dim3 wg_dim3, size_t local_mem_size,
    sycl::queue queue = syclcompat::get_default_queue()) {
  namespace syclex = sycl::ext::oneapi::experimental;
  // max_num_work_groups only supports range<3>
  auto ctx = queue.get_context();
  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
  auto kernel = bundle.template get_kernel<KernelName>();
  sycl::range<3> wg_range_3d(wg_dim3);
  // Device-wide limit for this kernel/work-group/local-mem combination.
  size_t max_wgs = kernel.template ext_oneapi_get_info<
      syclex::info::kernel_queue_specific::max_num_work_groups>(queue, wg_range_3d,
                                                                local_mem_size);
  size_t max_compute_units =
      queue.get_device().get_info<sycl::info::device::max_compute_units>();
  // Spec dictates max_compute_units > 0, so no need to catch div 0
  return max_wgs / max_compute_units;
}
947+
948+
// Calculate the maximum number of active work-groups per compute unit for
// the given kernel, work-group range and local-memory footprint.
// \tparam [in] KernelName SYCL kernel name to calculate for
// \tparam [in] RangeDim the dimension of the sycl::range
// \param [in] wg_range SYCL work-group range
// \param [in] local_mem_size Local memory usage per work-group in bytes
// \param [in] queue SYCL queue used to execute kernel
// \return size_t representing maximum work-groups per compute unit
template <class KernelName, int RangeDim>
size_t max_active_work_groups_per_cu(
    sycl::range<RangeDim> wg_range, size_t local_mem_size,
    sycl::queue queue = syclcompat::get_default_queue()) {
  // Delegate to the dim3 overload; dim3 normalizes any RangeDim to 3-D.
  return max_active_work_groups_per_cu<KernelName>(syclcompat::dim3(wg_range),
                                                   local_mem_size, queue);
}
962+
922963
/// If x <= 2, then return a pointer to the default queue;
923964
/// otherwise, return x reinterpreted as a queue_ptr.
924965
inline queue_ptr int_as_queue_ptr(uintptr_t x) {
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
/***************************************************************************
2+
*
3+
* Copyright (C) Codeplay Software Ltd.
4+
*
5+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM
6+
* Exceptions. See https://llvm.org/LICENSE.txt for license information.
7+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*
15+
* SYCLcompat
16+
*
17+
* max_active_work_groups_per_cu.cpp
18+
*
19+
* Description:
20+
* Test the syclcompat::max_active_work_groups_per_cu API
21+
**************************************************************************/
22+
// RUN: %{build} -o %t.out
23+
// RUN: %{run} %t.out
24+
25+
#include "sycl/accessor.hpp"
26+
#include <sycl/detail/core.hpp>
27+
#include <syclcompat/util.hpp>
28+
29+
// Shorthand for a read-write accessor over a global-memory buffer.
template <class T, size_t Dim>
using sycl_global_accessor =
    sycl::accessor<T, Dim, sycl::access::mode::read_write,
                   sycl::access::target::global_buffer>;

// Element type used by all buffers and kernels in this test.
using value_type = int;
35+
36+
template <int RangeDim> struct MyKernel {
37+
MyKernel(sycl_global_accessor<value_type, RangeDim> acc) : acc_{acc} {}
38+
void operator()(sycl::nd_item<RangeDim> item) const {
39+
auto gid = item.get_global_id();
40+
acc_[gid] = item.get_global_linear_id();
41+
}
42+
sycl_global_accessor<value_type, RangeDim> acc_;
43+
static constexpr bool has_local_mem = false;
44+
};
45+
46+
template <int RangeDim> struct MyLocalMemKernel {
47+
MyLocalMemKernel(sycl_global_accessor<value_type, RangeDim> acc,
48+
sycl::local_accessor<value_type, RangeDim> lacc)
49+
: acc_{acc}, lacc_{lacc} {}
50+
void operator()(sycl::nd_item<RangeDim> item) const {
51+
auto gid = item.get_global_id();
52+
acc_[gid] = item.get_global_linear_id();
53+
auto lid = item.get_local_id();
54+
lacc_[lid] = item.get_global_linear_id();
55+
}
56+
sycl_global_accessor<value_type, RangeDim> acc_;
57+
sycl::local_accessor<value_type, RangeDim> lacc_;
58+
static constexpr bool has_local_mem = true;
59+
};
60+
61+
// Exercises syclcompat::max_active_work_groups_per_cu for one kernel
// functor template and work-group shape:
//   1. queries via the sycl::range overload,
//   2. checks the syclcompat::dim3 overload returns the same value,
//   3. checks both against a reference value computed directly as
//      max_num_work_groups / max_compute_units.
// \tparam KernelName kernel functor template, instantiated with RangeDim
// \tparam RangeDim dimensionality of the work-group range
// \param q queue the kernel would be submitted to
// \param wg_range work-group shape to query for
// \param local_mem_size local memory per work-group in bytes; must be 0
//        for kernels that declare has_local_mem == false
template <template <int> class KernelName, int RangeDim>
void test_max_active_work_groups_per_cu(sycl::queue q,
                                        sycl::range<RangeDim> wg_range,
                                        size_t local_mem_size = 0) {
  if constexpr (!KernelName<RangeDim>::has_local_mem)
    assert(local_mem_size == 0 && "Bad test setup");

  size_t max_per_cu = syclcompat::max_active_work_groups_per_cu<KernelName<RangeDim>>(
      wg_range, local_mem_size, q);

  // Check we get the same result passing equivalent dim3
  syclcompat::dim3 wg_dim3{wg_range};
  size_t max_per_cu_dim3 = syclcompat::max_active_work_groups_per_cu<KernelName<RangeDim>>(
      wg_dim3, local_mem_size, q);
  assert(max_per_cu == max_per_cu_dim3);

  // Compare w/ reference impl
  size_t max_compute_units =
      q.get_device().get_info<sycl::info::device::max_compute_units>();
  namespace syclex = sycl::ext::oneapi::experimental;
  auto ctx = q.get_context();
  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
  auto kernel = bundle.template get_kernel<KernelName<RangeDim>>();
  size_t max_wgs = kernel.template ext_oneapi_get_info<
      syclex::info::kernel_queue_specific::max_num_work_groups>(
      q, sycl::range<3>{syclcompat::dim3{wg_range}}, local_mem_size);
  assert(max_per_cu == max_wgs / max_compute_units);

  // We aren't interested in the launch, it's here to define the kernel:
  // the parallel_for must appear so KernelName<RangeDim> is instantiated
  // and present in the kernel bundle, even though it never executes.
  if (false) {
    sycl::range<RangeDim> global_range = wg_range;
    // Guard against max_per_cu == 0 (a valid return value of the API),
    // which would otherwise produce a zero-sized global range.
    if(max_per_cu > 0)
      global_range[0] = global_range[0] * max_per_cu * max_compute_units;
    sycl::nd_range<RangeDim> my_range{global_range, wg_range};
    sycl::buffer<value_type, RangeDim> buf{global_range};

    q.submit([&](sycl::handler &cgh) {
      auto acc = buf.template get_access<sycl::access::mode::read_write>(cgh);
      if constexpr (KernelName<RangeDim>::has_local_mem) {
        sycl::local_accessor<value_type, RangeDim> lacc(
            my_range.get_local_range(), cgh);
        cgh.parallel_for(my_range, KernelName<RangeDim>{acc, lacc});
      } else {
        cgh.parallel_for(my_range, KernelName<RangeDim>{acc});
      }
    });
  }
}
109+
110+
int main() {
111+
sycl::queue q{};
112+
sycl::range<1> range_1d{32};
113+
sycl::range<2> range_2d{1, 32};
114+
sycl::range<3> range_3d{1, 1, 32};
115+
syclcompat::dim3 wg_dim3{32, 1, 1};
116+
117+
size_t lmem_size_small = sizeof(value_type) * 32;
118+
size_t lmem_size_medium = lmem_size_small * 32;
119+
size_t lmem_size_large = lmem_size_medium * 32;
120+
121+
test_max_active_work_groups_per_cu<MyKernel, 3>(q, range_3d);
122+
test_max_active_work_groups_per_cu<MyKernel, 2>(q, range_2d);
123+
test_max_active_work_groups_per_cu<MyKernel, 1>(q, range_1d);
124+
test_max_active_work_groups_per_cu<MyLocalMemKernel, 3>(q, range_3d,
125+
lmem_size_small);
126+
test_max_active_work_groups_per_cu<MyLocalMemKernel, 3>(q, range_3d,
127+
lmem_size_medium);
128+
test_max_active_work_groups_per_cu<MyLocalMemKernel, 3>(q, range_3d,
129+
lmem_size_large);
130+
test_max_active_work_groups_per_cu<MyLocalMemKernel, 1>(q, range_1d,
131+
lmem_size_large);
132+
return 0;
133+
}

0 commit comments

Comments
 (0)