[SYCL][Graph] Remove explicit L0 wait from SYCL-RT (#18064)

keyRadical · web-flow · commit 877beeec6f34 · 2025-04-17T18:12:44.000Z
This PR removes the explicit Level Zero (L0) event wait from the SYCL runtime (SYCL-RT) and moves it into the command-buffer implementation at the UR level. It also removes dead code from `exec_graph_impl::enqueue()`. Fixes #17734
diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
@@ -1004,39 +1004,19 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
     auto CommandBuffer = CurrentPartition->MCommandBuffers[Queue->get_device()];
 
     if (CommandBuffer) {
-      // if previous submissions are incompleted, we automatically
-      // add completion events of previous submissions as dependencies.
-      // With Level-Zero backend we cannot resubmit a command-buffer until the
-      // previous one has already completed.
-      // Indeed, since a command-list does not accept a list a dependencies at
-      // submission, we circumvent this lack by adding a barrier that waits on a
-      // specific event and then define the conditions to signal this event in
-      // another command-list. Consequently, if a second submission is
-      // performed, the signal conditions of this single event are redefined by
-      // this second submission. Thus, this can lead to an undefined behaviour
-      // and potential hangs. We have therefore to expliclty wait in the host
-      // for previous submission to complete before resubmitting the
-      // command-buffer for level-zero backend.
-      // TODO https://github.com/intel/llvm/issues/17734
-      // Remove this backend specific behavior and allow multiple concurrent
-      // submissions of the UR command-buffer.
       for (std::vector<sycl::detail::EventImplPtr>::iterator It =
                MExecutionEvents.begin();
            It != MExecutionEvents.end();) {
         auto Event = *It;
         if (!Event->isCompleted()) {
-          if (Queue->get_device().get_backend() ==
-              sycl::backend::ext_oneapi_level_zero) {
-            Event->wait(Event);
-          } else {
-            auto &AttachedEventsList = Event->getPostCompleteEvents();
-            CGData.MEvents.reserve(AttachedEventsList.size() + 1);
-            CGData.MEvents.push_back(Event);
-            // Add events of the previous execution of all graph partitions.
-            for (auto &AttachedEvent : AttachedEventsList) {
-              CGData.MEvents.push_back(AttachedEvent);
-            }
-          }
+          auto &AttachedEventsList = Event->getPostCompleteEvents();
+          CGData.MEvents.reserve(CGData.MEvents.size() +
+                                 AttachedEventsList.size() + 1);
+          CGData.MEvents.push_back(Event);
+          // Add events of the previous execution of all graph partitions.
+          CGData.MEvents.insert(CGData.MEvents.end(),
+                                AttachedEventsList.begin(),
+                                AttachedEventsList.end());
           ++It;
         } else {
           // Remove completed events
@@ -1102,46 +1082,6 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
 
       NewEvent = sycl::detail::Scheduler::getInstance().addCG(
           NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true);
-    } else {
-      std::vector<std::shared_ptr<sycl::detail::event_impl>> ScheduledEvents;
-      for (auto &NodeImpl : CurrentPartition->MSchedule) {
-        std::vector<ur_event_handle_t> RawEvents;
-
-        // If the node has no requirements for accessors etc. then we skip the
-        // scheduler and enqueue directly.
-        if (NodeImpl->MCGType == sycl::detail::CGType::Kernel &&
-            NodeImpl->MCommandGroup->getRequirements().size() +
-                    static_cast<sycl::detail::CGExecKernel *>(
-                        NodeImpl->MCommandGroup.get())
-                        ->MStreams.size() ==
-                0) {
-          sycl::detail::CGExecKernel *CG =
-              static_cast<sycl::detail::CGExecKernel *>(
-                  NodeImpl->MCommandGroup.get());
-          auto OutEvent = CreateNewEvent();
-          sycl::detail::enqueueImpKernel(
-              Queue, CG->MNDRDesc, CG->MArgs, CG->MKernelBundle,
-              CG->MSyclKernel, CG->MKernelName, RawEvents, OutEvent,
-              // TODO: Pass accessor mem allocations
-              nullptr,
-              // TODO: Extract from handler
-              UR_KERNEL_CACHE_CONFIG_DEFAULT, CG->MKernelIsCooperative,
-              CG->MKernelUsesClusterLaunch, CG->MKernelWorkGroupMemorySize);
-          ScheduledEvents.push_back(NewEvent);
-        } else if (!NodeImpl->isEmpty()) {
-          // Empty nodes are node processed as other nodes, but only their
-          // dependencies are propagated in findRealDeps
-          sycl::detail::EventImplPtr EventImpl =
-              sycl::detail::Scheduler::getInstance().addCG(
-                  NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true);
-
-          ScheduledEvents.push_back(EventImpl);
-        }
-      }
-      // Create an event which has all kernel events as dependencies
-      NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
-      NewEvent->setStateIncomplete();
-      NewEvent->getPreparedDepsEvents() = ScheduledEvents;
     }
     PartitionsExecutionEvents[CurrentPartition] = NewEvent;
   }
diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp
@@ -1507,6 +1507,25 @@ ur_result_t getZeCommandQueue(ur_queue_handle_t Queue, bool UseCopyEngine,
   return UR_RESULT_SUCCESS;
 }
 
+/**
+ * Blocks the host until the command-buffer's recorded submission completes.
+ * @param CommandBuffer The command-buffer whose submission event is awaited.
+ * @return UR_RESULT_SUCCESS or an error code on failure
+ */
+ur_result_t
+waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) {
+  // Only synchronize when an event from an earlier submission is recorded.
+  if (ur_event_handle_t &CurrentSubmissionEvent =
+          CommandBuffer->CurrentSubmissionEvent) {
+    ZE2UR_CALL(zeEventHostSynchronize,
+               (CurrentSubmissionEvent->ZeEvent, UINT64_MAX));
+    UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent));
+    CurrentSubmissionEvent = nullptr;
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
 /**
  * Waits for the all the dependencies of the command-buffer
  * @param[in] CommandBuffer The command-buffer.
@@ -1754,6 +1773,16 @@ ur_result_t enqueueWaitEventPath(ur_exp_command_buffer_handle_t CommandBuffer,
   ZE2UR_CALL(zeCommandListAppendBarrier,
              (SignalCommandList->first, (*Event)->ZeEvent, 0, nullptr));
 
+  /* The event needs to be retained since it will be used later by the
+     command-buffer. If there is an existing event from a
+     previous submission of the command-buffer, release it since it is no longer
+     needed. */
+  if (CommandBuffer->CurrentSubmissionEvent) {
+    UR_CALL(urEventReleaseInternal(CommandBuffer->CurrentSubmissionEvent));
+  }
+  (*Event)->RefCount.increment();
+  CommandBuffer->CurrentSubmissionEvent = *Event;
+
   UR_CALL(Queue->executeCommandList(SignalCommandList, false /*IsBlocking*/,
                                     false /*OKToBatchCommand*/));
 
@@ -1767,6 +1796,8 @@ ur_result_t urEnqueueCommandBufferExp(
 
   std::scoped_lock<ur_shared_mutex> Lock(UrQueue->Mutex);
 
+  UR_CALL(waitForOngoingExecution(CommandBuffer));
+
   const bool IsInternal = (Event == nullptr);
   const bool DoProfiling =
       (UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) &&
@@ -1794,8 +1825,6 @@ ur_result_t urEnqueueCommandBufferExp(
                                  EventWaitList, OutEvent, ZeCommandListHelper,
                                  DoProfiling));
   }
-  // Mark that synchronization will be required for later updates
-  CommandBuffer->NeedsUpdateSynchronization = true;
 
   return UR_RESULT_SUCCESS;
 }
@@ -2230,37 +2259,6 @@ ur_result_t updateCommandBuffer(
   return UR_RESULT_SUCCESS;
 }
 
-/**
- * Waits for any ongoing executions of the command-buffer to finish before
- * updating.
- * @param CommandBuffer The command-buffer to wait for.
- * @return UR_RESULT_SUCCESS or an error code on failure
- */
-ur_result_t
-waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) {
-  // Calling function has taken a lock for the command-buffer so we can safely
-  // check and modify this value here.
-  // If command-buffer was recently synchronized we can return early.
-  if (!CommandBuffer->NeedsUpdateSynchronization) {
-    return UR_RESULT_SUCCESS;
-  }
-
-  if (CommandBuffer->UseImmediateAppendPath) {
-    if (ur_event_handle_t &CurrentSubmissionEvent =
-            CommandBuffer->CurrentSubmissionEvent) {
-      ZE2UR_CALL(zeEventHostSynchronize,
-                 (CurrentSubmissionEvent->ZeEvent, UINT64_MAX));
-      UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent));
-      CurrentSubmissionEvent = nullptr;
-    }
-  } else if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) {
-    ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX));
-  }
-  // Mark that command-buffer was recently synchronized
-  CommandBuffer->NeedsUpdateSynchronization = false;
-  return UR_RESULT_SUCCESS;
-}
-
 } // namespace
 
 ur_result_t urCommandBufferUpdateKernelLaunchExp(
diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.hpp b/unified-runtime/source/adapters/level_zero/command_buffer.hpp
@@ -144,10 +144,6 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
   // This list is needed to release all kernels retained by the
   // command_buffer.
   std::vector<ur_kernel_handle_t> KernelsList;
-  // Track whether synchronization is required when updating the command-buffer
-  // Set this value to true when a command-buffer is enqueued, and false after
-  // any fence or event synchronization to avoid repeated calls to synchronize.
-  bool NeedsUpdateSynchronization = false;
   // Track handle objects to free when command-buffer is destroyed.
   std::vector<std::unique_ptr<ur_exp_command_buffer_command_handle_t_>>
       CommandHandles;
diff --git a/unified-runtime/test/conformance/exp_command_buffer/fill.cpp b/unified-runtime/test/conformance/exp_command_buffer/fill.cpp
@@ -128,10 +128,6 @@ TEST_P(urCommandBufferFillCommandsTest, Buffer) {
 }
 
 TEST_P(urCommandBufferFillCommandsTest, ExecuteTwice) {
-  // TODO https://github.com/intel/llvm/issues/17734
-  // Fail on Level-Zero due to blocking wait code in graph_impl.cpp specific
-  // to the level-zero backend that needs moved into the Level-Zero v1 adapter.
-  UUR_KNOWN_FAILURE_ON(uur::LevelZero{});
   ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp(
       cmd_buf_handle, buffer, pattern.data(), pattern_size, 0, size, 0, nullptr,
       0, nullptr, &sync_point, nullptr, nullptr));