[SYCL] [NATIVECPU] Remove dependencies on sycl in UR adapter (#11685)

PietroGhg · martygrant · commit 0868c6ed141c · 2023-11-09T14:57:05.000Z
This PR removes dependencies on the SYCL headers/library from the Native
CPU UR adapter.
`sycl/include/sycl/detail/native_cpu.hpp` has been moved to
`sycl/plugins/unified_runtime/ur/adapters/native_cpu/nativecpu_state.hpp`,
and the definitions of the work-item builtins have been moved from that
header to the compiler, which now emits them in
`PrepareSYCLNativeCPUPass`.
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
@@ -6,40 +6,37 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <sycl/detail/cg_types.hpp>
+#include <array>
+#include <cstdint>
 
 #include "ur_api.h"
 
 #include "common.hpp"
 #include "kernel.hpp"
 #include "memory.hpp"
 
-sycl::detail::NDRDescT getNDRDesc(uint32_t WorkDim,
-                                  const size_t *GlobalWorkOffset,
-                                  const size_t *GlobalWorkSize,
-                                  const size_t *LocalWorkSize) {
-  // Todo: we flip indexes here, I'm not sure we should, if we don't we need to
-  // un-flip them in the spirv builtins definitions as well
-  sycl::detail::NDRDescT Res;
-  switch (WorkDim) {
-  case 1:
-    Res.set<1>(sycl::nd_range<1>({GlobalWorkSize[0]}, {LocalWorkSize[0]},
-                                 {GlobalWorkOffset[0]}));
-    break;
-  case 2:
-    Res.set<2>(sycl::nd_range<2>({GlobalWorkSize[0], GlobalWorkSize[1]},
-                                 {LocalWorkSize[0], LocalWorkSize[1]},
-                                 {GlobalWorkOffset[0], GlobalWorkOffset[1]}));
-    break;
-  case 3:
-    Res.set<3>(sycl::nd_range<3>(
-        {GlobalWorkSize[0], GlobalWorkSize[1], GlobalWorkSize[2]},
-        {LocalWorkSize[0], LocalWorkSize[1], LocalWorkSize[2]},
-        {GlobalWorkOffset[0], GlobalWorkOffset[1], GlobalWorkOffset[2]}));
-    break;
+namespace native_cpu {
+struct NDRDescT { // ND-range descriptor: offset, global size and local size, always stored as 3 components.
+  using RangeT = std::array<size_t, 3>;
+  uint32_t WorkDim; // NOTE(review): never assigned; the ctor parameter of the same name shadows this member.
+  RangeT GlobalOffset;
+  RangeT GlobalSize;
+  RangeT LocalSize;
+  NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
+           const size_t *GlobalWorkSize, const size_t *LocalWorkSize) { // all three pointers are dereferenced unconditionally below
+    for (uint32_t I = 0; I < WorkDim; I++) { // copy the first WorkDim entries; assumes WorkDim <= 3 (larger overruns the arrays) - TODO confirm
+      GlobalOffset[I] = GlobalWorkOffset[I];
+      GlobalSize[I] = GlobalWorkSize[I];
+      LocalSize[I] = LocalWorkSize[I]; // NOTE(review): no null check; confirm callers never pass a null local size
+    }
+    for (uint32_t I = WorkDim; I < 3; I++) { // fill the dimensions the caller did not supply
+      GlobalSize[I] = 1;
+      LocalSize[I] = LocalSize[0] ? 1 : 0; // 1 when dim 0 has a non-zero local size, else 0; NOTE(review): with WorkDim == 0 this reads LocalSize[0] before it is written
+      GlobalOffset[I] = 0;
+    }
   }
-  return Res;
-}
+};
+} // namespace native_cpu
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
@@ -62,11 +59,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
 
   // TODO: add proper error checking
   // TODO: add proper event dep management
-  sycl::detail::NDRDescT ndr =
-      getNDRDesc(workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize);
+  native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
+                           pLocalWorkSize);
   hKernel->handleLocalArgs();
 
-  __nativecpu_state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
+  native_cpu::state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
                           ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
                           ndr.LocalSize[2], ndr.GlobalOffset[0],
                           ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
@@ -124,7 +121,7 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl(
     ur_rect_region_t region, size_t BufferRowPitch, size_t BufferSlicePitch,
     size_t HostRowPitch, size_t HostSlicePitch,
     typename std::conditional<IsRead, void *, const void *>::type DstMem,
-    pi_uint32, const ur_event_handle_t *, ur_event_handle_t *) {
+    uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
   // TODO: events, blocking, check other constraints, performance optimizations
   //       More sharing with level_zero where possible
 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/kernel.hpp
@@ -9,11 +9,21 @@
 #pragma once
 
 #include "common.hpp"
-#include <sycl/detail/native_cpu.hpp>
+#include "nativecpu_state.hpp"
 #include <ur_api.h>
 
-using nativecpu_kernel_t = void(const sycl::detail::NativeCPUArgDesc *,
-                                __nativecpu_state *);
+namespace native_cpu {
+
+struct NativeCPUArgDesc { // Wraps one untyped pointer.
+  void *MPtr; // the stored pointer, exactly as passed to the constructor
+
+  NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){}; // NOTE(review): not explicit, so void* converts implicitly; the trailing ';' is redundant
+};
+
+} // namespace native_cpu
+
+using nativecpu_kernel_t = void(const native_cpu::NativeCPUArgDesc *,
+                                native_cpu::state *); // function type: (argument descriptors, state) -> void
 using nativecpu_ptr_t = nativecpu_kernel_t *;
 using nativecpu_task_t = std::function<nativecpu_kernel_t>;
 
@@ -31,7 +41,7 @@ struct ur_kernel_handle_t_ : RefCounted {
 
   const char *_name;
   nativecpu_task_t _subhandler;
-  std::vector<sycl::detail::NativeCPUArgDesc> _args;
+  std::vector<native_cpu::NativeCPUArgDesc> _args;
   std::vector<local_arg_info_t> _localArgInfo;
 
   // To be called before enqueing the kernel.
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/nativecpu_state.hpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/nativecpu_state.hpp
@@ -0,0 +1,55 @@
+//===-------------- nativecpu_state.hpp - SYCL Native CPU state -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------===//
+#pragma once
+#include <cstdlib>
+namespace native_cpu {
+
+struct state { // ND-range sizes and offsets fixed at construction, plus the ids set by update().
+  size_t MGlobal_id[3]; // zeroed in the ctor; recomputed by update()
+  size_t MGlobal_range[3]; // ctor args globalR0..globalR2
+  size_t MWorkGroup_size[3]; // ctor args localR0..localR2
+  size_t MWorkGroup_id[3]; // zeroed in the ctor; set by update()
+  size_t MLocal_id[3]; // zeroed in the ctor; set by update()
+  size_t MNumGroups[3]; // globalR / localR per dimension (integer division, remainder dropped)
+  size_t MGlobalOffset[3]; // ctor args globalO0..globalO2
+  state(size_t globalR0, size_t globalR1, size_t globalR2, size_t localR0,
+        size_t localR1, size_t localR2, size_t globalO0, size_t globalO1,
+        size_t globalO2) // NOTE(review): localR* are not checked for zero, yet MNumGroups below divides by them
+      : MGlobal_range{globalR0, globalR1, globalR2},
+        MWorkGroup_size{localR0, localR1, localR2},
+        MNumGroups{globalR0 / localR0, globalR1 / localR1, globalR2 / localR2},
+        MGlobalOffset{globalO0, globalO1, globalO2} {
+    MGlobal_id[0] = 0; // every id array starts at zero until update() is called
+    MGlobal_id[1] = 0;
+    MGlobal_id[2] = 0;
+    MWorkGroup_id[0] = 0;
+    MWorkGroup_id[1] = 0;
+    MWorkGroup_id[2] = 0;
+    MLocal_id[0] = 0;
+    MLocal_id[1] = 0;
+    MLocal_id[2] = 0;
+  }
+
+  void update(size_t group0, size_t group1, size_t group2, size_t local0, // stores the given group/local ids, then recomputes MGlobal_id
+              size_t local1, size_t local2) {
+    MWorkGroup_id[0] = group0;
+    MWorkGroup_id[1] = group1;
+    MWorkGroup_id[2] = group2;
+    MLocal_id[0] = local0;
+    MLocal_id[1] = local1;
+    MLocal_id[2] = local2;
+    MGlobal_id[0] = // global id = work-group size * group id + local id + global offset
+        MWorkGroup_size[0] * MWorkGroup_id[0] + MLocal_id[0] + MGlobalOffset[0];
+    MGlobal_id[1] =
+        MWorkGroup_size[1] * MWorkGroup_id[1] + MLocal_id[1] + MGlobalOffset[1];
+    MGlobal_id[2] =
+        MWorkGroup_size[2] * MWorkGroup_id[2] + MLocal_id[2] + MGlobalOffset[2];
+  }
+};
+
+} // namespace native_cpu