Add basic CUDA impl using ScopedStream

Hugh Delaney · Hugh Delaney · commit 2acb53628a5b · 2024-06-27T15:51:39.000+01:00
Use ScopedStream to return the same stream during the lifetime of the
RAII object. This allows us to create events outside a user submitted
func, and submit work within the user submitted func, since the stream
given to the user from urQueueGetNativeHandle is guaranteed to be the
same stream that we record events on.
diff --git a/source/adapters/cuda/enqueue_native.cpp b/source/adapters/cuda/enqueue_native.cpp
@@ -1,4 +1,4 @@
-//===--------- native_enqueue.cpp - CUDA Adapter --------------------------===//
+//===--------- enqueue_native.cpp - CUDA Adapter --------------------------===//
 //
 // Copyright (C) 2024 Intel Corporation
 //
@@ -10,9 +10,58 @@
 
 #include <ur_api.h>
 
-ur_result_t urNativeEnqueueExp(ur_queue_handle_t,
-                               ur_exp_enqueue_native_command_function_t, void *,
-                               uint32_t, const ur_event_handle_t *,
-                               ur_event_handle_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+#include "context.hpp"
+#include "event.hpp"
+#include "queue.hpp"
+
+// Runs a user supplied native enqueue callback on hQueue.
+//
+// The queue's stream getters are pinned to one stream (via ScopedStream)
+// while the callback runs, so the stream the callback obtains through
+// urQueueGetNativeHandle is the stream the returned event is started and
+// recorded on.
+//
+// If phEvent is non-null it receives an event that is started before and
+// recorded after the callback. Returns UR_RESULT_SUCCESS, or the thrown
+// ur_result_t / the mapped CUresult on failure.
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp(
+    ur_queue_handle_t hQueue,
+    ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data,
+    const ur_exp_enqueue_native_command_properties_t *,
+    uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  // TODO: how should mem migration work across a context here?
+  // Perhaps we will need to add a phMemObjArgs so that we are able to make
+  // sure memory migration happens across devices in the same context
+
+  const bool WantsEvent = phEvent != nullptr;
+
+  try {
+    ScopedContext ActiveContext(hQueue->getDevice());
+    // NOTE(review): the wait list is only handed to ScopedStream here; nothing
+    // in this function enqueues an explicit wait on it - confirm dependencies
+    // are honoured.
+    ScopedStream ActiveStream(hQueue, NumEventsInWaitList, phEventWaitList);
+    std::unique_ptr<ur_event_handle_t_> OutEvent;
+
+    if (WantsEvent) {
+      OutEvent.reset(ur_event_handle_t_::makeNative(
+          UR_COMMAND_ENQUEUE_NATIVE_EXP, hQueue, ActiveStream.getStream()));
+      UR_CHECK_ERROR(OutEvent->start());
+    }
+
+    // The callback uses urQueueGetNativeHandle to get the CUDA stream. It must
+    // be the same stream as is used before and after.
+    pfnNativeEnqueue(hQueue, data);
+
+    if (WantsEvent) {
+      UR_CHECK_ERROR(OutEvent->record());
+      *phEvent = OutEvent.release();
+    }
+  } catch (ur_result_t Err) {
+    return Err;
+  } catch (CUresult CuErr) {
+    return mapErrorUR(CuErr);
+  }
+  return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp
@@ -33,6 +33,8 @@ void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded(
 }
 
 CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) {
+  if (getThreadLocalStream() != CUstream{0})
+    return getThreadLocalStream();
   uint32_t StreamI;
   uint32_t Token;
   while (true) {
@@ -68,6 +70,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) {
 CUstream ur_queue_handle_t_::getNextComputeStream(
     uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
     ur_stream_guard_ &Guard, uint32_t *StreamToken) {
+  if (getThreadLocalStream() != CUstream{0})
+    return getThreadLocalStream();
   for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
     uint32_t Token = EventWaitList[i]->getComputeStreamToken();
     if (reinterpret_cast<ur_queue_handle_t>(EventWaitList[i]->getQueue()) ==
@@ -94,6 +98,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream(
 }
 
 CUstream ur_queue_handle_t_::getNextTransferStream() {
+  if (getThreadLocalStream() != CUstream{0})
+    return getThreadLocalStream();
   if (TransferStreams.empty()) { // for example in in-order queue
     return getNextComputeStream();
   }
diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp
@@ -101,6 +101,13 @@ struct ur_queue_handle_t_ {
                                    const ur_event_handle_t *EventWaitList,
                                    ur_stream_guard_ &Guard,
                                    uint32_t *StreamToken = nullptr);
+
+  // Per-thread override stream: set by ScopedStream, CUstream{0} otherwise
+  static CUstream &getThreadLocalStream() {
+    thread_local CUstream PinnedStream{0};
+    return PinnedStream;
+  }
+
   native_type getNextTransferStream();
   native_type get() { return getNextComputeStream(); };
   ur_device_handle_t getDevice() const noexcept { return Device; };
@@ -265,3 +272,37 @@ struct ur_queue_handle_t_ {
 
   bool backendHasOwnership() const noexcept { return HasOwnership; }
 };
+
+// RAII object that makes the queue stream getter methods all return the same
+// stream on the calling thread within the lifetime of this object.
+//
+// This is useful for urEnqueueNativeCommandExp where we want guarantees that
+// the user submitted native calls will be dispatched to a known stream, which
+// must be "got" within the user submitted function.
+//
+// The stream is kept in a thread_local slot, so the object is not copyable:
+// a copy's destructor would reset the slot while the original is still in
+// scope. The destructor restores the value the slot held at construction, so
+// an enclosing ScopedStream on the same thread stays in effect.
+class ScopedStream {
+  ur_queue_handle_t hQueue;
+  // Value of the thread local stream before this object overwrote it
+  CUstream PreviousStream;
+
+public:
+  // Picks a compute stream from hQueue (the wait list is forwarded to
+  // getNextComputeStream) and installs it as the calling thread's stream.
+  ScopedStream(ur_queue_handle_t hQueue, uint32_t NumEventsInWaitList,
+               const ur_event_handle_t *EventWaitList)
+      : hQueue{hQueue}, PreviousStream{hQueue->getThreadLocalStream()} {
+    ur_stream_guard_ Guard;
+    hQueue->getThreadLocalStream() =
+        hQueue->getNextComputeStream(NumEventsInWaitList, EventWaitList, Guard);
+  }
+  ScopedStream(const ScopedStream &) = delete;
+  ScopedStream &operator=(const ScopedStream &) = delete;
+
+  // Stream currently installed for the calling thread
+  CUstream getStream() const { return hQueue->getThreadLocalStream(); }
+  ~ScopedStream() { hQueue->getThreadLocalStream() = PreviousStream; }
+};