Skip to content

Commit 721d63c

Browse files
committed
Use fence rather than event for sync in L0 command-buffer update
The Level Zero adapter implementing `urCommandBufferUpdateKernelLaunchExp` is doing a blocking host wait with `zeEventHostSynchronize` on the executing of the command-buffer. However, there is no guarantee that the command-buffer has been enqueued before update has been called. Resulting in a hang if this is not the case. Even if a command-buffer has been enqueued before the event can't guarantee that the command-list has finished executing. Instead, replace the blocking wait on a L0 event with a blocking wait on a L0 fence from the last submission of the command-buffer. This is because fences are the only safe way to track when a command-buffer execution has completed.
1 parent 39cb69a commit 721d63c

File tree

9 files changed

+298
-35
lines changed

9 files changed

+298
-35
lines changed

include/ur_api.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8742,7 +8742,9 @@ urCommandBufferReleaseCommandExp(
87428742
);
87438743

87448744
///////////////////////////////////////////////////////////////////////////////
8745-
/// @brief Update a kernel launch command in a finalized command-buffer.
8745+
/// @brief Update a kernel launch command in a finalized command-buffer. This
8746+
/// entry-point is synchronous and may block if the command-buffer is
8747+
/// executing when the entry-point is called.
87468748
///
87478749
/// @returns
87488750
/// - ::UR_RESULT_SUCCESS

scripts/core/exp-command-buffer.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -900,7 +900,7 @@ returns:
900900
- $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
901901
--- #--------------------------------------------------------------------------
902902
type: function
903-
desc: "Update a kernel launch command in a finalized command-buffer."
903+
desc: "Update a kernel launch command in a finalized command-buffer. This entry-point is synchronous and may block if the command-buffer is executing when the entry-point is called."
904904
class: $xCommandBuffer
905905
name: UpdateKernelLaunchExp
906906
params:

source/adapters/level_zero/command_buffer.cpp

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
5151
const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList)
5252
: Context(Context), Device(Device), ZeCommandList(CommandList),
5353
ZeCommandListResetEvents(CommandListResetEvents),
54-
ZeCommandListDesc(ZeDesc), ZeFencesList(), QueueProperties(),
55-
SyncPoints(), NextSyncPoint(0),
54+
ZeCommandListDesc(ZeDesc), ZeFencesMap(), ZeActiveFence(nullptr),
55+
QueueProperties(), SyncPoints(), NextSyncPoint(0),
5656
IsUpdatable(Desc ? Desc->isUpdatable : false),
5757
IsProfilingEnabled(Desc ? Desc->enableProfiling : false),
5858
IsInOrderCmdList(IsInOrderCmdList) {
@@ -102,8 +102,9 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
102102
urEventReleaseInternal(Event);
103103
}
104104

105-
// Release Fences allocated to command_buffer
106-
for (auto &ZeFence : ZeFencesList) {
105+
// Release fences allocated to command-buffer
106+
for (auto &ZeFencePair : ZeFencesMap) {
107+
auto &ZeFence = ZeFencePair.second;
107108
ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence));
108109
}
109110

@@ -1031,11 +1032,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
10311032
uint32_t QueueGroupOrdinal;
10321033
auto &ZeCommandQueue = QGroup.getZeQueue(&QueueGroupOrdinal);
10331034

1034-
ze_fence_handle_t ZeFence;
1035-
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
1036-
1037-
ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
1038-
CommandBuffer->ZeFencesList.push_back(ZeFence);
1035+
// If we already have created a fence for this queue, first reset then reuse
1036+
// it, otherwise create a new fence.
1037+
ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence;
1038+
auto ZeWorkloadFenceForQueue =
1039+
CommandBuffer->ZeFencesMap.find(ZeCommandQueue);
1040+
if (ZeWorkloadFenceForQueue == CommandBuffer->ZeFencesMap.end()) {
1041+
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
1042+
ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
1043+
CommandBuffer->ZeFencesMap.insert({{ZeCommandQueue, ZeFence}});
1044+
} else {
1045+
ZeFence = ZeWorkloadFenceForQueue->second;
1046+
ZE2UR_CALL(zeFenceReset, (ZeFence));
1047+
}
10391048

10401049
bool MustSignalWaitEvent = true;
10411050
if (NumEventsInWaitList) {
@@ -1433,10 +1442,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
14331442
MutableCommandDesc.flags = 0;
14341443

14351444
// We must synchronize mutable command list execution before mutating.
1436-
ZE2UR_CALL(zeEventHostSynchronize,
1437-
(CommandBuffer->SignalEvent->ZeEvent, UINT64_MAX));
1445+
if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) {
1446+
ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX));
1447+
}
14381448

1439-
auto Plt = Command->CommandBuffer->Context->getPlatform();
1449+
auto Plt = CommandBuffer->Context->getPlatform();
14401450
UR_ASSERT(Plt->ZeMutableCmdListExt.Supported,
14411451
UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
14421452
ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp,

source/adapters/level_zero/command_buffer.hpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,13 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
5454
ze_command_list_handle_t ZeCommandListResetEvents;
5555
// Level Zero command list descriptor
5656
ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
57-
// List of Level Zero fences created when submitting a graph.
58-
// This list is needed to release all fences retained by the
59-
// command_buffer.
60-
std::vector<ze_fence_handle_t> ZeFencesList;
57+
// Level Zero fences for each queue the command-buffer has been enqueued to.
58+
// These should be destroyed when the command-buffer is released.
59+
std::unordered_map<ze_command_queue_handle_t, ze_fence_handle_t> ZeFencesMap;
60+
// The Level Zero fence from the most recent enqueue of the command-buffer.
61+
// Must be an element in ZeFencesMap, so is not required to be destroyed
62+
// itself.
63+
ze_fence_handle_t ZeActiveFence;
6164
// Queue properties from command-buffer descriptor
6265
// TODO: Do we need these?
6366
ur_queue_properties_t QueueProperties;

source/loader/ur_libapi.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8117,7 +8117,9 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
81178117
}
81188118

81198119
///////////////////////////////////////////////////////////////////////////////
8120-
/// @brief Update a kernel launch command in a finalized command-buffer.
8120+
/// @brief Update a kernel launch command in a finalized command-buffer. This
8121+
/// entry-point is synchronous and may block if the command-buffer is
8122+
/// executing when the entry-point is called.
81218123
///
81228124
/// @returns
81238125
/// - ::UR_RESULT_SUCCESS

source/ur_api.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6868,7 +6868,9 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
68686868
}
68696869

68706870
///////////////////////////////////////////////////////////////////////////////
6871-
/// @brief Update a kernel launch command in a finalized command-buffer.
6871+
/// @brief Update a kernel launch command in a finalized command-buffer. This
6872+
/// entry-point is synchronous and may block if the command-buffer is
6873+
/// executing when the entry-point is called.
68726874
///
68736875
/// @returns
68746876
/// - ::UR_RESULT_SUCCESS

test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
{{OPT}}BufferFillCommandTest.OverrideUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU_
55
{{OPT}}BufferFillCommandTest.OverrideArgList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
66
{{OPT}}USMFillCommandTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
7+
{{OPT}}USMFillCommandTest.UpdateBeforeEnqueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
78
{{OPT}}USMMultipleFillCommandTest.UpdateAllKernels/SYCL_NATIVE_CPU___SYCL_Native_CPU_
89
{{OPT}}BufferSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
910
{{OPT}}USMSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
11+
{{OPT}}USMMultiSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
12+
{{OPT}}USMMultiSaxpyKernelTest.UpdateWithoutBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU_
1013
{{OPT}}NDRangeUpdateTest.Update3D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
1114
{{OPT}}NDRangeUpdateTest.Update2D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
1215
{{OPT}}NDRangeUpdateTest.Update1D/SYCL_NATIVE_CPU___SYCL_Native_CPU_

test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,59 @@ TEST_P(USMFillCommandTest, UpdateParameters) {
143143
Validate((uint32_t *)new_shared_ptr, new_global_size, new_val);
144144
}
145145

146+
// Test updating a command-buffer which hasn't been enqueued yet
147+
TEST_P(USMFillCommandTest, UpdateBeforeEnqueue) {
148+
ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr,
149+
allocation_size, &new_shared_ptr));
150+
ASSERT_NE(new_shared_ptr, nullptr);
151+
std::memset(new_shared_ptr, 0, allocation_size);
152+
153+
// Set new USM pointer as kernel output at index 0
154+
ur_exp_command_buffer_update_pointer_arg_desc_t new_output_desc = {
155+
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
156+
nullptr, // pNext
157+
0, // argIndex
158+
nullptr, // pProperties
159+
&new_shared_ptr, // pArgValue
160+
};
161+
162+
// Set new value to use for fill at kernel index 1
163+
uint32_t new_val = 33;
164+
ur_exp_command_buffer_update_value_arg_desc_t new_input_desc = {
165+
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
166+
nullptr, // pNext
167+
1, // argIndex
168+
sizeof(new_val), // argSize
169+
nullptr, // pProperties
170+
&new_val, // hArgValue
171+
};
172+
173+
ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
174+
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
175+
nullptr, // pNext
176+
0, // numNewMemObjArgs
177+
1, // numNewPointerArgs
178+
1, // numNewValueArgs
179+
0, // newWorkDim
180+
nullptr, // pNewMemObjArgList
181+
&new_output_desc, // pNewPointerArgList
182+
&new_input_desc, // pNewValueArgList
183+
nullptr, // pNewGlobalWorkOffset
184+
nullptr, // pNewGlobalWorkSize
185+
nullptr, // pNewLocalWorkSize
186+
};
187+
188+
// Update kernel and enqueue command-buffer
189+
ASSERT_SUCCESS(
190+
urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
191+
ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
192+
nullptr, nullptr));
193+
ASSERT_SUCCESS(urQueueFinish(queue));
194+
195+
// Verify that update occurred correctly
196+
Validate((uint32_t *)new_shared_ptr, global_size, new_val);
197+
}
198+
146199
// Test updating a command-buffer with multiple USM fill kernel commands
147200
struct USMMultipleFillCommandTest
148201
: uur::command_buffer::urUpdatableCommandBufferExpExecutionTest {

0 commit comments

Comments
 (0)