diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index f0a1e36e0ef34..441bc6a6eb383 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -765,10 +765,11 @@ _executable_ by the user invoking `command_graph::finalize()` to create a new executable instance of the graph. An executable graph cannot be converted to a modifiable graph. After finalizing a graph in the modifiable state, it is valid for a user to add additional nodes and finalize again to create subsequent -executable graphs. The state of a `command_graph` object is made explicit by -templating on state to make the class strongly typed, with the default template -argument being `graph_state::modifiable` to reduce code verbosity on -construction. +executable graphs. When an executable graph is destroyed, the underlying +resources will be freed only once any enqueued submissions of the graph have +completed. The state of a `command_graph` object is made explicit by templating +on state to make the class strongly typed, with the default template argument +being `graph_state::modifiable` to reduce code verbosity on construction. .Graph State Diagram [source, mermaid] diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index caf50642c3a2c..1f90250f2c8ac 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1002,11 +1002,6 @@ exec_graph_impl::~exec_graph_impl() { const sycl::detail::AdapterPtr &Adapter = sycl::detail::getSyclObjImpl(MContext)->getAdapter(); MSchedule.clear(); - // We need to wait on all command buffer executions before we can release - // them. - for (auto &Event : MExecutionEvents) { - Event->wait(Event); - } // Clean up any graph-owned allocations that were allocated MGraphImpl->getMemPool().deallocateAndUnmapAll(); diff --git a/sycl/test-e2e/Graph/Explicit/release_while_executing.cpp b/sycl/test-e2e/Graph/Explicit/release_while_executing.cpp new file mode 100644 index 0000000000000..5751486d76dea --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/release_while_executing.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/release_while_executing.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/release_while_executing.cpp b/sycl/test-e2e/Graph/Inputs/release_while_executing.cpp new file mode 100644 index 0000000000000..ac2fdcd28b95c --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/release_while_executing.cpp @@ -0,0 +1,47 @@ +// Tests destroying finalized command_graph before it is finished executing, +// relying on the backends to properly synchronize and wait for the submitted +// work to finish. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), ReferenceA(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(ReferenceA.begin(), ReferenceA.end(), 2); + + T *PtrA = malloc_device(Size, Queue); + + // Create the command_graph in a seperate scope so that it's destroyed before + // Queue.wait() + { + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + Queue.copy(DataA.data(), PtrA, Size); + Queue.wait_and_throw(); + + auto Node = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(Size, [=](item<1> Item) { PtrA[Item.get_id()] += 1; }); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceA[i], DataA[i], "DataA")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/release_while_executing.cpp b/sycl/test-e2e/Graph/RecordReplay/release_while_executing.cpp new file mode 100644 index 0000000000000..cca6b7085a697 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/release_while_executing.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/release_while_executing.cpp" diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 9c967b411131d..484ca4ef94768 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -11078,7 +11078,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while `hCommandBuffer` is still +/// executing, which will block on completion if the reference count of +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst b/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst index 136c7024632ec..2b443c13f6a3d 100644 --- a/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst @@ -563,3 +563,4 @@ Contributors * Maxime France-Pillois `maxime.francepillois@codeplay.com `_ * Aaron Greig `aaron.greig@codeplay.com `_ * Fábio Mestre `fabio.mestre@codeplay.com `_ +* Konrad Kusiak `konrad.kusiak@codeplay.com `_ diff --git a/unified-runtime/scripts/core/exp-command-buffer.yml b/unified-runtime/scripts/core/exp-command-buffer.yml index 05e61985cd4b8..e8f2caa15d59d 100644 --- a/unified-runtime/scripts/core/exp-command-buffer.yml +++ b/unified-runtime/scripts/core/exp-command-buffer.yml @@ -310,7 +310,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY --- #-------------------------------------------------------------------------- type: function -desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero." +desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It is legal to call the entry-point while `hCommandBuffer` is still executing, which will block on completion if the reference count of `hCommandBuffer` becomes zero." class: $xCommandBuffer name: ReleaseExp params: diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 020afb90564ff..6101e3b6d4ea0 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -671,6 +671,25 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, return UR_RESULT_SUCCESS; } +/** + * Waits for any ongoing executions of the command-buffer to finish + * @param CommandBuffer The command-buffer to wait for. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { + + if (ur_event_handle_t &CurrentSubmissionEvent = + CommandBuffer->CurrentSubmissionEvent) { + ZE2UR_CALL(zeEventHostSynchronize, + (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); + UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); + CurrentSubmissionEvent = nullptr; + } + + return UR_RESULT_SUCCESS; +} + /** * Checks whether the command-buffer can be constructed using in order * command-lists. @@ -832,6 +851,7 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { if (!CommandBuffer->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; + UR_CALL(waitForOngoingExecution(CommandBuffer)); CommandBuffer->cleanupCommandBufferResources(); delete CommandBuffer; return UR_RESULT_SUCCESS; @@ -1453,25 +1473,6 @@ ur_result_t getZeCommandQueue(ur_queue_handle_t Queue, bool UseCopyEngine, return UR_RESULT_SUCCESS; } -/** - * Waits for any ongoing executions of the command-buffer to finish. - * @param CommandBuffer The command-buffer to wait for. - * @return UR_RESULT_SUCCESS or an error code on failure - */ -ur_result_t -waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { - - if (ur_event_handle_t &CurrentSubmissionEvent = - CommandBuffer->CurrentSubmissionEvent) { - ZE2UR_CALL(zeEventHostSynchronize, - (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); - UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); - CurrentSubmissionEvent = nullptr; - } - - return UR_RESULT_SUCCESS; -} - /** * Waits for the all the dependencies of the command-buffer * @param[in] CommandBuffer The command-buffer. diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index 4d0b1ef5dc752..e5713074a096a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -263,6 +263,10 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) try { if (!hCommandBuffer->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; + if (auto executionEvent = hCommandBuffer->getExecutionEventUnlocked()) { + ZE2UR_CALL(zeEventHostSynchronize, + (executionEvent->getZeEvent(), UINT64_MAX)); + } delete hCommandBuffer; return UR_RESULT_SUCCESS; } catch (...) { diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index ed14b29cd8393..f202f98eaaa21 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -8503,7 +8503,10 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while `hCommandBuffer` is still +/// executing, which will block on completion if the reference count of +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index 549bd97e87f6b..fc2889cee9a94 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -7406,7 +7406,10 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while `hCommandBuffer` is still +/// executing, which will block on completion if the reference count of +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp index 4f2a10593c19d..9fab249bf831c 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp @@ -190,3 +190,14 @@ TEST_P(urEnqueueCommandBufferExpTest, SerializeInOrOutOfOrderQueue) { ASSERT_EQ(reference, Output[i]); } } + +// Tests releasing command-buffer while it is still executing relying +// on synchronization during urCommandBufferReleaseExp call. +TEST_P(urEnqueueCommandBufferExpTest, EnqueueAndRelease) { + ASSERT_SUCCESS(urEnqueueCommandBufferExp( + in_or_out_of_order_queue, cmd_buf_handle, 0, nullptr, nullptr)); + + // Release the command buffer without explicitly waiting beforehand + EXPECT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); + cmd_buf_handle = nullptr; +}