From 6d287603312e1b855bb6da93159d3ab5770e8846 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Wed, 21 May 2025 11:50:56 +0100 Subject: [PATCH 01/11] Modified the adapters such that it is valid to call release on CB while it is executing --- sycl/source/detail/graph_impl.cpp | 5 --- .../scripts/core/EXP-COMMAND-BUFFER.rst | 1 + .../scripts/core/exp-command-buffer.yml | 2 +- .../source/adapters/cuda/command_buffer.cpp | 4 ++ .../source/adapters/cuda/command_buffer.hpp | 2 + .../source/adapters/hip/command_buffer.cpp | 4 ++ .../source/adapters/hip/command_buffer.hpp | 2 + .../adapters/level_zero/command_buffer.cpp | 39 ++++++++++--------- .../source/adapters/opencl/command_buffer.cpp | 4 ++ .../exp_command_buffer/enqueue.cpp | 10 +++++ 10 files changed, 48 insertions(+), 25 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index caf50642c3a2c..1f90250f2c8ac 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1002,11 +1002,6 @@ exec_graph_impl::~exec_graph_impl() { const sycl::detail::AdapterPtr &Adapter = sycl::detail::getSyclObjImpl(MContext)->getAdapter(); MSchedule.clear(); - // We need to wait on all command buffer executions before we can release - // them. - for (auto &Event : MExecutionEvents) { - Event->wait(Event); - } // Clean up any graph-owned allocations that were allocated MGraphImpl->getMemPool().deallocateAndUnmapAll(); diff --git a/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst b/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst index 136c7024632ec..2b443c13f6a3d 100644 --- a/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst @@ -563,3 +563,4 @@ Contributors * Maxime France-Pillois `maxime.francepillois@codeplay.com `_ * Aaron Greig `aaron.greig@codeplay.com `_ * Fábio Mestre `fabio.mestre@codeplay.com `_ +* Konrad Kusiak `konrad.kusiak@codeplay.com `_ diff --git a/unified-runtime/scripts/core/exp-command-buffer.yml b/unified-runtime/scripts/core/exp-command-buffer.yml index 05e61985cd4b8..d38c05894d13f 100644 --- a/unified-runtime/scripts/core/exp-command-buffer.yml +++ b/unified-runtime/scripts/core/exp-command-buffer.yml @@ -310,7 +310,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY --- #-------------------------------------------------------------------------- type: function -desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero." +desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It will try synchronizing the command-buffer, hence it is legal to call it while command-buffer is still executing." class: $xCommandBuffer name: ReleaseExp params: diff --git a/unified-runtime/source/adapters/cuda/command_buffer.cpp b/unified-runtime/source/adapters/cuda/command_buffer.cpp index 2399b8f81857c..912f863f780ab 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.cpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.cpp @@ -387,6 +387,9 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->CurrentExecution) { + UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); + } // Ref count has reached zero, release of created commands for (auto &Command : hCommandBuffer->CommandHandles) { commandHandleDestroy(Command); @@ -1172,6 +1175,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); + hCommandBuffer->CurrentExecution = *phEvent; } return UR_RESULT_SUCCESS; } catch (ur_result_t Err) { diff --git a/unified-runtime/source/adapters/cuda/command_buffer.hpp b/unified-runtime/source/adapters/cuda/command_buffer.hpp index e11b9ab74969a..b7a3509ea72da 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.hpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.hpp @@ -192,6 +192,8 @@ struct ur_exp_command_buffer_handle_t_ : ur::cuda::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; + // The event of current graph execution. + ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events, so that we can find the last // node added to an in-order command-buffer. diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp index af058cdde4cf0..5b62a0d4252f2 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.cpp +++ b/unified-runtime/source/adapters/hip/command_buffer.cpp @@ -273,6 +273,9 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->CurrentExecution) { + UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); + } delete hCommandBuffer; } return UR_RESULT_SUCCESS; @@ -811,6 +814,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); + hCommandBuffer->CurrentExecution = *phEvent; } } catch (ur_result_t Err) { return Err; diff --git a/unified-runtime/source/adapters/hip/command_buffer.hpp b/unified-runtime/source/adapters/hip/command_buffer.hpp index 3d0047adee013..7b6da91feaafe 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.hpp +++ b/unified-runtime/source/adapters/hip/command_buffer.hpp @@ -128,6 +128,8 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; + // The event of current graph execution. + ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events std::map SyncPoints; diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 020afb90564ff..7622731fc33c2 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -671,6 +671,25 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, return UR_RESULT_SUCCESS; } +/** + * Waits for any ongoing executions of the command-buffer to finish. + * @param CommandBuffer The command-buffer to wait for. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { + + if (ur_event_handle_t &CurrentSubmissionEvent = + CommandBuffer->CurrentSubmissionEvent) { + ZE2UR_CALL(zeEventHostSynchronize, + (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); + UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); + CurrentSubmissionEvent = nullptr; + } + + return UR_RESULT_SUCCESS; +} + /** * Checks whether the command-buffer can be constructed using in order * command-lists. @@ -832,6 +851,7 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { if (!CommandBuffer->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; + UR_CALL(waitForOngoingExecution(CommandBuffer)); CommandBuffer->cleanupCommandBufferResources(); delete CommandBuffer; return UR_RESULT_SUCCESS; @@ -1453,25 +1473,6 @@ ur_result_t getZeCommandQueue(ur_queue_handle_t Queue, bool UseCopyEngine, return UR_RESULT_SUCCESS; } -/** - * Waits for any ongoing executions of the command-buffer to finish. - * @param CommandBuffer The command-buffer to wait for. - * @return UR_RESULT_SUCCESS or an error code on failure - */ -ur_result_t -waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { - - if (ur_event_handle_t &CurrentSubmissionEvent = - CommandBuffer->CurrentSubmissionEvent) { - ZE2UR_CALL(zeEventHostSynchronize, - (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); - UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); - CurrentSubmissionEvent = nullptr; - } - - return UR_RESULT_SUCCESS; -} - /** * Waits for the all the dependencies of the command-buffer * @param[in] CommandBuffer The command-buffer. diff --git a/unified-runtime/source/adapters/opencl/command_buffer.cpp b/unified-runtime/source/adapters/opencl/command_buffer.cpp index e048b2d22175c..affc5b5882870 100644 --- a/unified-runtime/source/adapters/opencl/command_buffer.cpp +++ b/unified-runtime/source/adapters/opencl/command_buffer.cpp @@ -115,6 +115,10 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->LastSubmission) { + cl_int RetErr = clWaitForEvents(1, &(hCommandBuffer->LastSubmission)); + CL_RETURN_ON_FAILURE(RetErr); + } delete hCommandBuffer; } diff --git a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp index 4f2a10593c19d..cbe7bcba51416 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp @@ -190,3 +190,13 @@ TEST_P(urEnqueueCommandBufferExpTest, SerializeInOrOutOfOrderQueue) { ASSERT_EQ(reference, Output[i]); } } + +// Tests releasing command-buffer while it is still executing relying +// on synchronization during urCommandBufferReleaseExp call. +TEST_P(urEnqueueCommandBufferExpTest, EnqueueAndRelease) { + ASSERT_SUCCESS(urEnqueueCommandBufferExp(out_of_order_queue, cmd_buf_handle, + 0, nullptr, nullptr)); + + // Release the command buffer + ASSERT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); +} From 8ec66e8d9b9b975ab011140b52bccb9889db5ee8 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Mon, 26 May 2025 13:48:03 +0100 Subject: [PATCH 02/11] Modified spec wording and made creating an event branchless on CUDA and HIP --- unified-runtime/include/ur_api.h | 5 ++++- unified-runtime/scripts/core/exp-command-buffer.yml | 2 +- unified-runtime/source/adapters/cuda/command_buffer.cpp | 6 +++--- unified-runtime/source/adapters/cuda/command_buffer.hpp | 2 +- unified-runtime/source/adapters/hip/command_buffer.cpp | 6 +++--- unified-runtime/source/adapters/hip/command_buffer.hpp | 2 +- unified-runtime/source/loader/ur_libapi.cpp | 5 ++++- unified-runtime/source/ur_api.cpp | 5 ++++- 8 files changed, 21 insertions(+), 12 deletions(-) diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 9c967b411131d..f6c3bc32a5b1f 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -11078,7 +11078,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while hCommandBuffer is still +/// executing, which will block on completion if the reference count of +/// hCommandBuffer becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/scripts/core/exp-command-buffer.yml b/unified-runtime/scripts/core/exp-command-buffer.yml index d38c05894d13f..85e584dc4839f 100644 --- a/unified-runtime/scripts/core/exp-command-buffer.yml +++ b/unified-runtime/scripts/core/exp-command-buffer.yml @@ -310,7 +310,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY --- #-------------------------------------------------------------------------- type: function -desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It will try synchronizing the command-buffer, hence it is legal to call it while command-buffer is still executing." +desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It is legal to call the entry-point while hCommandBuffer is still executing, which will block on completion if the reference count of hCommandBuffer becomes zero." class: $xCommandBuffer name: ReleaseExp params: diff --git a/unified-runtime/source/adapters/cuda/command_buffer.cpp b/unified-runtime/source/adapters/cuda/command_buffer.cpp index 912f863f780ab..1e40a69e787df 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.cpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.cpp @@ -1172,10 +1172,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( // Launch graph UR_CHECK_ERROR(cuGraphLaunch(hCommandBuffer->CudaGraphExec, CuStream)); + UR_CHECK_ERROR(RetImplEvent->record()); + hCommandBuffer->CurrentExecution = RetImplEvent.release(); if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - hCommandBuffer->CurrentExecution = *phEvent; + *phEvent = hCommandBuffer->CurrentExecution; } return UR_RESULT_SUCCESS; } catch (ur_result_t Err) { diff --git a/unified-runtime/source/adapters/cuda/command_buffer.hpp b/unified-runtime/source/adapters/cuda/command_buffer.hpp index b7a3509ea72da..d58cd4f87cda8 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.hpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.hpp @@ -192,7 +192,7 @@ struct ur_exp_command_buffer_handle_t_ : ur::cuda::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; - // The event of current graph execution. + // Track the event of the current graph execution. ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events, so that we can find the last diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp index 5b62a0d4252f2..a8ce8d024440b 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.cpp +++ b/unified-runtime/source/adapters/hip/command_buffer.cpp @@ -811,10 +811,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( // Launch graph UR_CHECK_ERROR(hipGraphLaunch(hCommandBuffer->HIPGraphExec, HIPStream)); + UR_CHECK_ERROR(RetImplEvent->record()); + hCommandBuffer->CurrentExecution = RetImplEvent.release(); if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - hCommandBuffer->CurrentExecution = *phEvent; + *phEvent = hCommandBuffer->CurrentExecution; } } catch (ur_result_t Err) { return Err; diff --git a/unified-runtime/source/adapters/hip/command_buffer.hpp b/unified-runtime/source/adapters/hip/command_buffer.hpp index 7b6da91feaafe..3dd1417fef427 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.hpp +++ b/unified-runtime/source/adapters/hip/command_buffer.hpp @@ -128,7 +128,7 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; - // The event of current graph execution. + // Track the event of the current graph execution. ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index ed14b29cd8393..c13a5260ea860 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -8503,7 +8503,10 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while hCommandBuffer is still +/// executing, which will block on completion if the reference count of +/// hCommandBuffer becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index 549bd97e87f6b..e2c03a20eb853 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -7406,7 +7406,10 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while hCommandBuffer is still +/// executing, which will block on completion if the reference count of +/// hCommandBuffer becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS From e3a6a53316527f759d0737d43b6f171dc3e1189a Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Mon, 26 May 2025 15:34:20 +0100 Subject: [PATCH 03/11] Added backticks --- unified-runtime/include/ur_api.h | 4 ++-- unified-runtime/scripts/core/exp-command-buffer.yml | 2 +- unified-runtime/source/loader/ur_libapi.cpp | 4 ++-- unified-runtime/source/ur_api.cpp | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index f6c3bc32a5b1f..484ca4ef94768 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -11079,9 +11079,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the /// command-buffer object if the reference count becomes zero. It is -/// legal to call the entry-point while hCommandBuffer is still +/// legal to call the entry-point while `hCommandBuffer` is still /// executing, which will block on completion if the reference count of -/// hCommandBuffer becomes zero. +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/scripts/core/exp-command-buffer.yml b/unified-runtime/scripts/core/exp-command-buffer.yml index 85e584dc4839f..e8f2caa15d59d 100644 --- a/unified-runtime/scripts/core/exp-command-buffer.yml +++ b/unified-runtime/scripts/core/exp-command-buffer.yml @@ -310,7 +310,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY --- #-------------------------------------------------------------------------- type: function -desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It is legal to call the entry-point while hCommandBuffer is still executing, which will block on completion if the reference count of hCommandBuffer becomes zero." +desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It is legal to call the entry-point while `hCommandBuffer` is still executing, which will block on completion if the reference count of `hCommandBuffer` becomes zero." class: $xCommandBuffer name: ReleaseExp params: diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index c13a5260ea860..f202f98eaaa21 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -8504,9 +8504,9 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the /// command-buffer object if the reference count becomes zero. It is -/// legal to call the entry-point while hCommandBuffer is still +/// legal to call the entry-point while `hCommandBuffer` is still /// executing, which will block on completion if the reference count of -/// hCommandBuffer becomes zero. +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index e2c03a20eb853..fc2889cee9a94 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -7407,9 +7407,9 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the /// command-buffer object if the reference count becomes zero. It is -/// legal to call the entry-point while hCommandBuffer is still +/// legal to call the entry-point while `hCommandBuffer` is still /// executing, which will block on completion if the reference count of -/// hCommandBuffer becomes zero. +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS From a51c27921bde90e7eff24465ee400ccb8740d33b Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Tue, 27 May 2025 16:31:54 +0100 Subject: [PATCH 04/11] Changed to EXPECT_SUCCESS and added queue wait before exiting --- .../test/conformance/exp_command_buffer/enqueue.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp index cbe7bcba51416..51a59ddcc50b0 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp @@ -197,6 +197,9 @@ TEST_P(urEnqueueCommandBufferExpTest, EnqueueAndRelease) { ASSERT_SUCCESS(urEnqueueCommandBufferExp(out_of_order_queue, cmd_buf_handle, 0, nullptr, nullptr)); - // Release the command buffer - ASSERT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); + // Release the command buffer without explicitly waiting beforehand + EXPECT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); + + // Wait before exiting + ASSERT_SUCCESS(urQueueFinish(out_of_order_queue)); } From 3953a9ad67624ad442b0c50595e44299e9d00876 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Wed, 28 May 2025 23:42:57 +0100 Subject: [PATCH 05/11] Brought back additional fence for wait event path --- .../adapters/level_zero/command_buffer.cpp | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 7622731fc33c2..6b3f46b1fd851 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -677,7 +677,7 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, * @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t -waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { +waitForLastSubmission(ur_exp_command_buffer_handle_t CommandBuffer) { if (ur_event_handle_t &CurrentSubmissionEvent = CommandBuffer->CurrentSubmissionEvent) { @@ -690,6 +690,30 @@ waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { return UR_RESULT_SUCCESS; } +/** + * Waits for any ongoing executions of the command-buffer to finish + * but put fence in case of wait event path. + * @param CommandBuffer The command-buffer to wait for. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { + + if (CommandBuffer->UseImmediateAppendPath) { + if (ur_event_handle_t &CurrentSubmissionEvent = + CommandBuffer->CurrentSubmissionEvent) { + ZE2UR_CALL(zeEventHostSynchronize, + (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); + UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); + CurrentSubmissionEvent = nullptr; + } + } else if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) { + ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + /** * Checks whether the command-buffer can be constructed using in order * command-lists. @@ -1743,7 +1767,7 @@ ur_result_t urEnqueueCommandBufferExp( std::scoped_lock Lock(UrQueue->Mutex); - UR_CALL(waitForOngoingExecution(CommandBuffer)); + UR_CALL(waitForLastSubmission(CommandBuffer)); const bool IsInternal = (Event == nullptr); const bool DoProfiling = From 2a70f96c4994300f986e5aee83e6a02b3f962e1f Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Mon, 9 Jun 2025 14:38:14 +0100 Subject: [PATCH 06/11] Removed explicit sync where necessary --- .../source/adapters/cuda/command_buffer.cpp | 8 ++--- .../source/adapters/cuda/command_buffer.hpp | 2 -- .../source/adapters/hip/command_buffer.cpp | 8 ++--- .../source/adapters/hip/command_buffer.hpp | 2 -- .../adapters/level_zero/command_buffer.cpp | 30 ++----------------- .../source/adapters/opencl/command_buffer.cpp | 4 --- .../exp_command_buffer/enqueue.cpp | 7 ++--- 7 files changed, 9 insertions(+), 52 deletions(-) diff --git a/unified-runtime/source/adapters/cuda/command_buffer.cpp b/unified-runtime/source/adapters/cuda/command_buffer.cpp index 1e40a69e787df..2399b8f81857c 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.cpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.cpp @@ -387,9 +387,6 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { - if (hCommandBuffer->CurrentExecution) { - UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); - } // Ref count has reached zero, release of created commands for (auto &Command : hCommandBuffer->CommandHandles) { commandHandleDestroy(Command); @@ -1172,10 +1169,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( // Launch graph UR_CHECK_ERROR(cuGraphLaunch(hCommandBuffer->CudaGraphExec, CuStream)); - UR_CHECK_ERROR(RetImplEvent->record()); - hCommandBuffer->CurrentExecution = RetImplEvent.release(); if (phEvent) { - *phEvent = hCommandBuffer->CurrentExecution; + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); } return UR_RESULT_SUCCESS; } catch (ur_result_t Err) { diff --git a/unified-runtime/source/adapters/cuda/command_buffer.hpp b/unified-runtime/source/adapters/cuda/command_buffer.hpp index d58cd4f87cda8..e11b9ab74969a 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.hpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.hpp @@ -192,8 +192,6 @@ struct ur_exp_command_buffer_handle_t_ : ur::cuda::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; - // Track the event of the current graph execution. - ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events, so that we can find the last // node added to an in-order command-buffer. diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp index a8ce8d024440b..af058cdde4cf0 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.cpp +++ b/unified-runtime/source/adapters/hip/command_buffer.cpp @@ -273,9 +273,6 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { - if (hCommandBuffer->CurrentExecution) { - UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); - } delete hCommandBuffer; } return UR_RESULT_SUCCESS; @@ -811,10 +808,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( // Launch graph UR_CHECK_ERROR(hipGraphLaunch(hCommandBuffer->HIPGraphExec, HIPStream)); - UR_CHECK_ERROR(RetImplEvent->record()); - hCommandBuffer->CurrentExecution = RetImplEvent.release(); if (phEvent) { - *phEvent = hCommandBuffer->CurrentExecution; + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { return Err; diff --git a/unified-runtime/source/adapters/hip/command_buffer.hpp b/unified-runtime/source/adapters/hip/command_buffer.hpp index 3dd1417fef427..3d0047adee013 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.hpp +++ b/unified-runtime/source/adapters/hip/command_buffer.hpp @@ -128,8 +128,6 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; - // Track the event of the current graph execution. - ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events std::map SyncPoints; diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 6b3f46b1fd851..6101e3b6d4ea0 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -672,12 +672,12 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, } /** - * Waits for any ongoing executions of the command-buffer to finish. + * Waits for any ongoing executions of the command-buffer to finish * @param CommandBuffer The command-buffer to wait for. * @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t -waitForLastSubmission(ur_exp_command_buffer_handle_t CommandBuffer) { +waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { if (ur_event_handle_t &CurrentSubmissionEvent = CommandBuffer->CurrentSubmissionEvent) { @@ -690,30 +690,6 @@ waitForLastSubmission(ur_exp_command_buffer_handle_t CommandBuffer) { return UR_RESULT_SUCCESS; } -/** - * Waits for any ongoing executions of the command-buffer to finish - * but put fence in case of wait event path. - * @param CommandBuffer The command-buffer to wait for. - * @return UR_RESULT_SUCCESS or an error code on failure - */ -ur_result_t -waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { - - if (CommandBuffer->UseImmediateAppendPath) { - if (ur_event_handle_t &CurrentSubmissionEvent = - CommandBuffer->CurrentSubmissionEvent) { - ZE2UR_CALL(zeEventHostSynchronize, - (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); - UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); - CurrentSubmissionEvent = nullptr; - } - } else if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) { - ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX)); - } - - return UR_RESULT_SUCCESS; -} - /** * Checks whether the command-buffer can be constructed using in order * command-lists. @@ -1767,7 +1743,7 @@ ur_result_t urEnqueueCommandBufferExp( std::scoped_lock Lock(UrQueue->Mutex); - UR_CALL(waitForLastSubmission(CommandBuffer)); + UR_CALL(waitForOngoingExecution(CommandBuffer)); const bool IsInternal = (Event == nullptr); const bool DoProfiling = diff --git a/unified-runtime/source/adapters/opencl/command_buffer.cpp b/unified-runtime/source/adapters/opencl/command_buffer.cpp index affc5b5882870..e048b2d22175c 100644 --- a/unified-runtime/source/adapters/opencl/command_buffer.cpp +++ b/unified-runtime/source/adapters/opencl/command_buffer.cpp @@ -115,10 +115,6 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { - if (hCommandBuffer->LastSubmission) { - cl_int RetErr = clWaitForEvents(1, &(hCommandBuffer->LastSubmission)); - CL_RETURN_ON_FAILURE(RetErr); - } delete hCommandBuffer; } diff --git a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp index 51a59ddcc50b0..7b4eabdb268f1 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp @@ -194,12 +194,9 @@ TEST_P(urEnqueueCommandBufferExpTest, SerializeInOrOutOfOrderQueue) { // Tests releasing command-buffer while it is still executing relying // on synchronization during urCommandBufferReleaseExp call. TEST_P(urEnqueueCommandBufferExpTest, EnqueueAndRelease) { - ASSERT_SUCCESS(urEnqueueCommandBufferExp(out_of_order_queue, cmd_buf_handle, - 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueCommandBufferExp( + in_or_out_of_order_queue, cmd_buf_handle, 0, nullptr, nullptr)); // Release the command buffer without explicitly waiting beforehand EXPECT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); - - // Wait before exiting - ASSERT_SUCCESS(urQueueFinish(out_of_order_queue)); } From 20e750c45b587181c4af25a72a55198dd36edd26 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Tue, 10 Jun 2025 14:30:50 +0100 Subject: [PATCH 07/11] Retain CB before releasing --- unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp index 7b4eabdb268f1..2e407a4a2326f 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp @@ -197,6 +197,8 @@ TEST_P(urEnqueueCommandBufferExpTest, EnqueueAndRelease) { ASSERT_SUCCESS(urEnqueueCommandBufferExp( in_or_out_of_order_queue, cmd_buf_handle, 0, nullptr, nullptr)); + EXPECT_SUCCESS(urCommandBufferRetainExp(cmd_buf_handle)); + // Release the command buffer without explicitly waiting beforehand EXPECT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); } From 1efb6d27db8faa4ce5ba437bc02d8187855cd277 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Tue, 10 Jun 2025 14:48:02 +0100 Subject: [PATCH 08/11] Set cmd_buf_handle to nullptr after releasing --- .../test/conformance/exp_command_buffer/enqueue.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp index 2e407a4a2326f..9fab249bf831c 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp @@ -197,8 +197,7 @@ TEST_P(urEnqueueCommandBufferExpTest, EnqueueAndRelease) { ASSERT_SUCCESS(urEnqueueCommandBufferExp( in_or_out_of_order_queue, cmd_buf_handle, 0, nullptr, nullptr)); - EXPECT_SUCCESS(urCommandBufferRetainExp(cmd_buf_handle)); - // Release the command buffer without explicitly waiting beforehand EXPECT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); + cmd_buf_handle = nullptr; } From 62fda4b04d50b4e257e8b900b8f62558b6febc7d Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Tue, 10 Jun 2025 17:29:25 +0100 Subject: [PATCH 09/11] Added new e2e test and modified the spec --- .../sycl_ext_oneapi_graph.asciidoc | 9 ++-- .../Explicit/release_while_executing.cpp | 10 ++++ .../Graph/Inputs/release_while_executing.cpp | 47 +++++++++++++++++++ .../RecordReplay/release_while_executing.cpp | 10 ++++ 4 files changed, 72 insertions(+), 4 deletions(-) create mode 100644 sycl/test-e2e/Graph/Explicit/release_while_executing.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/release_while_executing.cpp create mode 100644 sycl/test-e2e/Graph/RecordReplay/release_while_executing.cpp diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index f0a1e36e0ef34..effe37e7b9265 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -765,10 +765,11 @@ _executable_ by the user invoking `command_graph::finalize()` to create a new executable instance of the graph. An executable graph cannot be converted to a modifiable graph. After finalizing a graph in the modifiable state, it is valid for a user to add additional nodes and finalize again to create subsequent -executable graphs. The state of a `command_graph` object is made explicit by -templating on state to make the class strongly typed, with the default template -argument being `graph_state::modifiable` to reduce code verbosity on -construction. +executable graphs. When an executable graph is destroyed, it will be freed +as soon as the enqueued work is completed. The state of a `command_graph` object +is made explicit by templating on state to make the class strongly typed, with +the default template argument being `graph_state::modifiable` to reduce code +verbosity on construction. .Graph State Diagram [source, mermaid] diff --git a/sycl/test-e2e/Graph/Explicit/release_while_executing.cpp b/sycl/test-e2e/Graph/Explicit/release_while_executing.cpp new file mode 100644 index 0000000000000..5751486d76dea --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/release_while_executing.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/release_while_executing.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/release_while_executing.cpp b/sycl/test-e2e/Graph/Inputs/release_while_executing.cpp new file mode 100644 index 0000000000000..ac2fdcd28b95c --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/release_while_executing.cpp @@ -0,0 +1,47 @@ +// Tests destroying finalized command_graph before it is finished executing, +// relying on the backends to properly synchronize and wait for the submitted +// work to finish. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), ReferenceA(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(ReferenceA.begin(), ReferenceA.end(), 2); + + T *PtrA = malloc_device(Size, Queue); + + // Create the command_graph in a seperate scope so that it's destroyed before + // Queue.wait() + { + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + Queue.copy(DataA.data(), PtrA, Size); + Queue.wait_and_throw(); + + auto Node = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(Size, [=](item<1> Item) { PtrA[Item.get_id()] += 1; }); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceA[i], DataA[i], "DataA")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/release_while_executing.cpp b/sycl/test-e2e/Graph/RecordReplay/release_while_executing.cpp new file mode 100644 index 0000000000000..cca6b7085a697 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/release_while_executing.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/release_while_executing.cpp" From a61980ecdfde740f59c7eda26e1d32d8cc22474a Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Tue, 10 Jun 2025 17:44:11 +0100 Subject: [PATCH 10/11] rephrased --- .../experimental/sycl_ext_oneapi_graph.asciidoc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index effe37e7b9265..441bc6a6eb383 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -765,11 +765,11 @@ _executable_ by the user invoking `command_graph::finalize()` to create a new executable instance of the graph. An executable graph cannot be converted to a modifiable graph. After finalizing a graph in the modifiable state, it is valid for a user to add additional nodes and finalize again to create subsequent -executable graphs. When an executable graph is destroyed, it will be freed -as soon as the enqueued work is completed. The state of a `command_graph` object -is made explicit by templating on state to make the class strongly typed, with -the default template argument being `graph_state::modifiable` to reduce code -verbosity on construction. +executable graphs. When an executable graph is destroyed, the underlying +resources will be freed only once any enqueued submissions of the graph have +completed. The state of a `command_graph` object is made explicit by templating +on state to make the class strongly typed, with the default template argument +being `graph_state::modifiable` to reduce code verbosity on construction. .Graph State Diagram [source, mermaid] From f441469c908a4041f0f0604bcca7e1fba0e3f0b8 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Wed, 11 Jun 2025 12:20:34 +0100 Subject: [PATCH 11/11] Added execution event sync on CB release in V2 adapter --- .../source/adapters/level_zero/v2/command_buffer.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index 4d0b1ef5dc752..e5713074a096a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -263,6 +263,10 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { if (!hCommandBuffer->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; + if (auto executionEvent = hCommandBuffer->getExecutionEventUnlocked()) { + ZE2UR_CALL(zeEventHostSynchronize, + (executionEvent->getZeEvent(), UINT64_MAX)); + } delete hCommandBuffer; return UR_RESULT_SUCCESS; } catch (...) {