Skip to content

Commit f2075fd

Browse files
EwanCkbenzie
authored andcommitted
Serialize submissions of the same command-buffer (#18295)
Update the UR command-buffer enqueue semantics such that command-buffer submissions are ordered with a dependency on previous submissions of the same command-buffer. That is the same semantic as we currently specify for executable graph objects in the SYCL-Graph extension. Allowing for a more efficient SYCL-RT implementation on top of UR where less UR events need to be created (future work). OpenCL is the only adapter that needs updated to provide these semantics, where we now track the `cl_event` returned from the most recent submission of a command-buffer, so that it can be used as a dependency of any future command-buffer submissions.
1 parent 2cf4e7f commit f2075fd

File tree

6 files changed

+379
-15
lines changed

6 files changed

+379
-15
lines changed

scripts/core/EXP-COMMAND-BUFFER.rst

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ Enqueueing Command-Buffers
224224
--------------------------------------------------------------------------------
225225

226226
Command-buffers are submitted for execution on a ${x}_queue_handle_t with an
227-
optional list of dependent events. An event is returned which tracks the
227+
optional list of dependent events. An event can be returned which tracks the
228228
execution of the command-buffer, and will be complete when all appended commands
229229
have finished executing.
230230

@@ -238,16 +238,17 @@ of the same command-buffer is still awaiting completion. That is, the user is no
238238
required to do a blocking wait on the completion of the first command-buffer
239239
submission before making a second submission of the command-buffer.
240240

241-
Submissions of the same command-buffer should be synchronized to prevent
242-
concurrent execution. For example, by using events, barriers, or in-order queue
243-
dependencies. The behavior of multiple submissions of the same command-buffer
244-
that can execute concurrently is undefined.
241+
Each submissions of a command-buffer is ordered behind previous submissions of
242+
the same command-buffer. As well as respecting the other synchronization
243+
dependencies set by the user, such as events, barriers, or in-order queue
244+
dependencies.
245245

246246
.. parsed-literal::
247-
// Valid usage if hQueue is in-order but undefined behavior is out-of-order
248-
${x}EnqueueCommandBufferExp(hQueue, hCommandBuffer, 0, nullptr,
247+
// Submission of hCommandBuffer to hQueueB as an implicit dependency on
248+
// prior submission to hQueueA.
249+
${x}EnqueueCommandBufferExp(hQueueA, hCommandBuffer, 0, nullptr,
249250
nullptr);
250-
${x}EnqueueCommandBufferExp(hQueue, hCommandBuffer, 0, nullptr,
251+
${x}EnqueueCommandBufferExp(hQueueB, hCommandBuffer, 0, nullptr,
251252
nullptr);
252253
253254

source/adapters/opencl/command_buffer.cpp

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
/// command-buffer to free the underlying object.
2222
ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
2323
urQueueRelease(hInternalQueue);
24+
if (LastSubmission) {
25+
clReleaseEvent(LastSubmission);
26+
}
2427

2528
cl_context CLContext = hContext->CLContext;
2629
cl_ext::clReleaseCommandBufferKHR_fn clReleaseCommandBufferKHR = nullptr;
@@ -501,16 +504,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp(
501504
ur::cl::getAdapter()->fnCache.clEnqueueCommandBufferKHRCache,
502505
cl_ext::EnqueueCommandBufferName, &clEnqueueCommandBufferKHR));
503506

504-
const uint32_t NumberOfQueues = 1;
505-
cl_event Event;
506-
std::vector<cl_event> CLWaitEvents(numEventsInWaitList);
507+
// If we've submitted the command-buffer before, then add a dependency on the
508+
// last submission.
509+
bool AddExtraDep = hCommandBuffer->LastSubmission != nullptr;
510+
uint32_t CLWaitListLength =
511+
AddExtraDep ? numEventsInWaitList + 1 : numEventsInWaitList;
512+
std::vector<cl_event> CLWaitEvents(CLWaitListLength);
507513
for (uint32_t i = 0; i < numEventsInWaitList; i++) {
508514
CLWaitEvents[i] = phEventWaitList[i]->CLEvent;
509515
}
516+
if (AddExtraDep) {
517+
CLWaitEvents[numEventsInWaitList] = hCommandBuffer->LastSubmission;
518+
}
519+
520+
// Always get an event as we need it to serialize any future submissions
521+
// of the command-buffer with.
522+
cl_event Event;
510523
cl_command_queue CLQueue = hQueue->CLQueue;
511-
CL_RETURN_ON_FAILURE(clEnqueueCommandBufferKHR(
512-
NumberOfQueues, &CLQueue, hCommandBuffer->CLCommandBuffer,
513-
numEventsInWaitList, CLWaitEvents.data(), ifUrEvent(phEvent, Event)));
524+
CL_RETURN_ON_FAILURE(
525+
clEnqueueCommandBufferKHR(1, &CLQueue, hCommandBuffer->CLCommandBuffer,
526+
CLWaitListLength, CLWaitEvents.data(), &Event));
527+
528+
// Retain event so that if a user manually destroys the returned UR
529+
// event the adapter still has a valid handle.
530+
clRetainEvent(Event);
531+
if (hCommandBuffer->LastSubmission) {
532+
clReleaseEvent(hCommandBuffer->LastSubmission);
533+
}
534+
hCommandBuffer->LastSubmission = Event;
514535

515536
UR_RETURN_ON_FAILURE(createUREvent(Event, hQueue->Context, hQueue, phEvent));
516537
return UR_RESULT_SUCCESS;

source/adapters/opencl/command_buffer.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ struct ur_exp_command_buffer_handle_t_ {
5555
CommandHandles;
5656
/// Object reference count
5757
std::atomic_uint32_t RefCount;
58+
/// Track last submission of the command-buffer
59+
cl_event LastSubmission;
5860

5961
ur_exp_command_buffer_handle_t_(ur_queue_handle_t hQueue,
6062
ur_context_handle_t hContext,
@@ -63,7 +65,8 @@ struct ur_exp_command_buffer_handle_t_ {
6365
bool IsUpdatable, bool IsInOrder)
6466
: hInternalQueue(hQueue), hContext(hContext), hDevice(hDevice),
6567
CLCommandBuffer(CLCommandBuffer), IsUpdatable(IsUpdatable),
66-
IsInOrder(IsInOrder), IsFinalized(false), RefCount(0) {}
68+
IsInOrder(IsInOrder), IsFinalized(false), RefCount(0),
69+
LastSubmission(nullptr) {}
6770

6871
~ur_exp_command_buffer_handle_t_();
6972

test/conformance/exp_command_buffer/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ add_conformance_test_with_kernels_environment(exp_command_buffer
1616
write.cpp
1717
rect_read.cpp
1818
rect_write.cpp
19+
enqueue.cpp
1920
update/buffer_fill_kernel_update.cpp
2021
update/invalid_update.cpp
2122
update/kernel_handle_update.cpp
@@ -26,6 +27,7 @@ add_conformance_test_with_kernels_environment(exp_command_buffer
2627
update/event_sync.cpp
2728
update/kernel_event_sync.cpp
2829
update/local_memory_update.cpp
30+
update/enqueue_update.cpp
2931
regression/usm_copy.cpp
3032
)
3133

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
// Copyright (C) 2025 Intel Corporation
2+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
3+
// Exceptions. See LICENSE.TXT
4+
//
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
7+
#include "fixtures.h"
8+
#include <array>
9+
#include <cstring>
10+
11+
// Tests that adapter implementation of urEnqueueCommandBufferExp serializes
12+
// submissions of the same UR command-buffer object with respect to previous
13+
// submissions.
14+
//
15+
// Done using a kernel that increments a value, so the ordering of the two
16+
// submission isn't tested, only that they didn't run concurrently. See the
17+
// enqueue_update.cpp test for a test verifying the order of submissions, as
18+
// the input/output to the kernels can be modified between the submissions.
19+
struct urEnqueueCommandBufferExpTest
20+
: uur::command_buffer::urCommandBufferExpExecutionTest {
21+
virtual void SetUp() override {
22+
program_name = "increment";
23+
UUR_RETURN_ON_FATAL_FAILURE(urCommandBufferExpExecutionTest::SetUp());
24+
25+
// Create an in-order queue
26+
ur_queue_properties_t queue_properties = {
27+
UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0};
28+
ASSERT_SUCCESS(
29+
urQueueCreate(context, device, &queue_properties, &in_order_queue));
30+
31+
// Create an out-of-order queue
32+
queue_properties.flags = UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
33+
ASSERT_SUCCESS(
34+
urQueueCreate(context, device, &queue_properties, &out_of_order_queue));
35+
ASSERT_NE(out_of_order_queue, nullptr);
36+
37+
ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr,
38+
allocation_size, &device_ptr));
39+
ASSERT_NE(device_ptr, nullptr);
40+
41+
uint32_t zero_pattern = 0;
42+
ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptr, sizeof(zero_pattern),
43+
&zero_pattern, allocation_size, 0, nullptr,
44+
nullptr));
45+
ASSERT_SUCCESS(urQueueFinish(queue));
46+
47+
// Create command-buffer with a single kernel that does "Ptr[i] += 1;"
48+
ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, device_ptr));
49+
ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(
50+
cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size,
51+
nullptr, 0, nullptr, 0, nullptr, 0, nullptr, nullptr, nullptr,
52+
nullptr));
53+
ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle));
54+
}
55+
56+
virtual void TearDown() override {
57+
if (in_order_queue) {
58+
EXPECT_SUCCESS(urQueueRelease(in_order_queue));
59+
}
60+
61+
if (out_of_order_queue) {
62+
EXPECT_SUCCESS(urQueueRelease(out_of_order_queue));
63+
}
64+
65+
if (device_ptr) {
66+
EXPECT_SUCCESS(urUSMFree(context, device_ptr));
67+
}
68+
69+
UUR_RETURN_ON_FATAL_FAILURE(urCommandBufferExpExecutionTest::TearDown());
70+
}
71+
72+
ur_queue_handle_t in_order_queue = nullptr;
73+
ur_queue_handle_t out_of_order_queue = nullptr;
74+
75+
static constexpr size_t global_size = 16;
76+
static constexpr size_t global_offset = 0;
77+
static constexpr size_t n_dimensions = 1;
78+
static constexpr size_t allocation_size = sizeof(uint32_t) * global_size;
79+
void *device_ptr = nullptr;
80+
};
81+
82+
UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueCommandBufferExpTest);
83+
84+
// Tests that the same command-buffer submitted across different in-order
85+
// queues has an implicit dependency on first submission
86+
TEST_P(urEnqueueCommandBufferExpTest, SerializeAcrossQueues) {
87+
// Execute command-buffer to first in-order queue (created by parent
88+
// urQueueTest fixture)
89+
ASSERT_SUCCESS(
90+
urEnqueueCommandBufferExp(queue, cmd_buf_handle, 0, nullptr, nullptr));
91+
92+
// Execute command-buffer to second in-order queue, should have implicit
93+
// dependency on first submission.
94+
ASSERT_SUCCESS(urEnqueueCommandBufferExp(in_order_queue, cmd_buf_handle, 0,
95+
nullptr, nullptr));
96+
97+
// Wait for both submissions to complete
98+
ASSERT_SUCCESS(urQueueFlush(queue));
99+
ASSERT_SUCCESS(urQueueFinish(in_order_queue));
100+
101+
std::vector<uint32_t> Output(global_size);
102+
ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, Output.data(), device_ptr,
103+
allocation_size, 0, nullptr, nullptr));
104+
ASSERT_SUCCESS(urQueueFinish(queue));
105+
106+
// Verify
107+
const uint32_t reference = 2;
108+
for (size_t i = 0; i < global_size; i++) {
109+
ASSERT_EQ(reference, Output[i]);
110+
}
111+
}
112+
113+
// Tests that submitting a command-buffer twice to an out-of-order queue
114+
// relying on implicit serialization semantics for dependencies.
115+
TEST_P(urEnqueueCommandBufferExpTest, SerializeOutofOrderQueue) {
116+
ASSERT_SUCCESS(urEnqueueCommandBufferExp(out_of_order_queue, cmd_buf_handle,
117+
0, nullptr, nullptr));
118+
ASSERT_SUCCESS(urEnqueueCommandBufferExp(out_of_order_queue, cmd_buf_handle,
119+
0, nullptr, nullptr));
120+
121+
// Wait for both submissions to complete
122+
ASSERT_SUCCESS(urQueueFinish(out_of_order_queue));
123+
124+
std::vector<uint32_t> Output(global_size);
125+
ASSERT_SUCCESS(urEnqueueUSMMemcpy(out_of_order_queue, true, Output.data(),
126+
device_ptr, allocation_size, 0, nullptr,
127+
nullptr));
128+
129+
// Verify
130+
const uint32_t reference = 2;
131+
for (size_t i = 0; i < global_size; i++) {
132+
ASSERT_EQ(reference, Output[i]);
133+
}
134+
}

0 commit comments

Comments
 (0)