Skip to content

Commit 0f118d7

Browse files
authored
Merge pull request #1629 from Bensuo/ewan/L0_update_host_wait
Use fence rather than event for sync in L0 command-buffer update
2 parents ed4211c + 721d63c commit 0f118d7

File tree

9 files changed

+298
-35
lines changed

9 files changed

+298
-35
lines changed

include/ur_api.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8742,7 +8742,9 @@ urCommandBufferReleaseCommandExp(
87428742
);
87438743

87448744
///////////////////////////////////////////////////////////////////////////////
8745-
/// @brief Update a kernel launch command in a finalized command-buffer.
8745+
/// @brief Update a kernel launch command in a finalized command-buffer. This
8746+
/// entry-point is synchronous and may block if the command-buffer is
8747+
/// executing when the entry-point is called.
87468748
///
87478749
/// @returns
87488750
/// - ::UR_RESULT_SUCCESS

scripts/core/exp-command-buffer.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -900,7 +900,7 @@ returns:
900900
- $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
901901
--- #--------------------------------------------------------------------------
902902
type: function
903-
desc: "Update a kernel launch command in a finalized command-buffer."
903+
desc: "Update a kernel launch command in a finalized command-buffer. This entry-point is synchronous and may block if the command-buffer is executing when the entry-point is called."
904904
class: $xCommandBuffer
905905
name: UpdateKernelLaunchExp
906906
params:

source/adapters/level_zero/command_buffer.cpp

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
5151
const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList)
5252
: Context(Context), Device(Device), ZeCommandList(CommandList),
5353
ZeCommandListResetEvents(CommandListResetEvents),
54-
ZeCommandListDesc(ZeDesc), ZeFencesList(), QueueProperties(),
55-
SyncPoints(), NextSyncPoint(0),
54+
ZeCommandListDesc(ZeDesc), ZeFencesMap(), ZeActiveFence(nullptr),
55+
QueueProperties(), SyncPoints(), NextSyncPoint(0),
5656
IsUpdatable(Desc ? Desc->isUpdatable : false),
5757
IsProfilingEnabled(Desc ? Desc->enableProfiling : false),
5858
IsInOrderCmdList(IsInOrderCmdList) {
@@ -102,8 +102,9 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
102102
urEventReleaseInternal(Event);
103103
}
104104

105-
// Release Fences allocated to command_buffer
106-
for (auto &ZeFence : ZeFencesList) {
105+
// Release fences allocated to command-buffer
106+
for (auto &ZeFencePair : ZeFencesMap) {
107+
auto &ZeFence = ZeFencePair.second;
107108
ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence));
108109
}
109110

@@ -1053,11 +1054,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
10531054
uint32_t QueueGroupOrdinal;
10541055
auto &ZeCommandQueue = QGroup.getZeQueue(&QueueGroupOrdinal);
10551056

1056-
ze_fence_handle_t ZeFence;
1057-
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
1058-
1059-
ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
1060-
CommandBuffer->ZeFencesList.push_back(ZeFence);
1057+
// If we already have created a fence for this queue, first reset then reuse
1058+
// it, otherwise create a new fence.
1059+
ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence;
1060+
auto ZeWorkloadFenceForQueue =
1061+
CommandBuffer->ZeFencesMap.find(ZeCommandQueue);
1062+
if (ZeWorkloadFenceForQueue == CommandBuffer->ZeFencesMap.end()) {
1063+
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
1064+
ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
1065+
CommandBuffer->ZeFencesMap.insert({{ZeCommandQueue, ZeFence}});
1066+
} else {
1067+
ZeFence = ZeWorkloadFenceForQueue->second;
1068+
ZE2UR_CALL(zeFenceReset, (ZeFence));
1069+
}
10611070

10621071
bool MustSignalWaitEvent = true;
10631072
if (NumEventsInWaitList) {
@@ -1458,10 +1467,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
14581467
MutableCommandDesc.flags = 0;
14591468

14601469
// We must synchronize mutable command list execution before mutating.
1461-
ZE2UR_CALL(zeEventHostSynchronize,
1462-
(CommandBuffer->SignalEvent->ZeEvent, UINT64_MAX));
1470+
if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) {
1471+
ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX));
1472+
}
14631473

1464-
auto Plt = Command->CommandBuffer->Context->getPlatform();
1474+
auto Plt = CommandBuffer->Context->getPlatform();
14651475
UR_ASSERT(Plt->ZeMutableCmdListExt.Supported,
14661476
UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
14671477
ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp,

source/adapters/level_zero/command_buffer.hpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,13 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
5454
ze_command_list_handle_t ZeCommandListResetEvents;
5555
// Level Zero command list descriptor
5656
ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
57-
// List of Level Zero fences created when submitting a graph.
58-
// This list is needed to release all fences retained by the
59-
// command_buffer.
60-
std::vector<ze_fence_handle_t> ZeFencesList;
57+
// Level Zero fences for each queue the command-buffer has been enqueued to.
58+
// These should be destroyed when the command-buffer is released.
59+
std::unordered_map<ze_command_queue_handle_t, ze_fence_handle_t> ZeFencesMap;
60+
// The Level Zero fence from the most recent enqueue of the command-buffer.
61+
// Must be an element in ZeFencesMap, so is not required to be destroyed
62+
// itself.
63+
ze_fence_handle_t ZeActiveFence;
6164
// Queue properties from command-buffer descriptor
6265
// TODO: Do we need these?
6366
ur_queue_properties_t QueueProperties;

source/loader/ur_libapi.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8117,7 +8117,9 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
81178117
}
81188118

81198119
///////////////////////////////////////////////////////////////////////////////
8120-
/// @brief Update a kernel launch command in a finalized command-buffer.
8120+
/// @brief Update a kernel launch command in a finalized command-buffer. This
8121+
/// entry-point is synchronous and may block if the command-buffer is
8122+
/// executing when the entry-point is called.
81218123
///
81228124
/// @returns
81238125
/// - ::UR_RESULT_SUCCESS

source/ur_api.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6868,7 +6868,9 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
68686868
}
68696869

68706870
///////////////////////////////////////////////////////////////////////////////
6871-
/// @brief Update a kernel launch command in a finalized command-buffer.
6871+
/// @brief Update a kernel launch command in a finalized command-buffer. This
6872+
/// entry-point is synchronous and may block if the command-buffer is
6873+
/// executing when the entry-point is called.
68726874
///
68736875
/// @returns
68746876
/// - ::UR_RESULT_SUCCESS

test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
{{OPT}}BufferFillCommandTest.OverrideUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU_
55
{{OPT}}BufferFillCommandTest.OverrideArgList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
66
{{OPT}}USMFillCommandTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
7+
{{OPT}}USMFillCommandTest.UpdateBeforeEnqueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
78
{{OPT}}USMMultipleFillCommandTest.UpdateAllKernels/SYCL_NATIVE_CPU___SYCL_Native_CPU_
89
{{OPT}}BufferSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
910
{{OPT}}USMSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
11+
{{OPT}}USMMultiSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
12+
{{OPT}}USMMultiSaxpyKernelTest.UpdateWithoutBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU_
1013
{{OPT}}NDRangeUpdateTest.Update3D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
1114
{{OPT}}NDRangeUpdateTest.Update2D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
1215
{{OPT}}NDRangeUpdateTest.Update1D/SYCL_NATIVE_CPU___SYCL_Native_CPU_

test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,59 @@ TEST_P(USMFillCommandTest, UpdateParameters) {
142142
Validate((uint32_t *)new_shared_ptr, new_global_size, new_val);
143143
}
144144

145+
// Test updating a command-buffer which hasn't been enqueued yet
146+
TEST_P(USMFillCommandTest, UpdateBeforeEnqueue) {
147+
ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr,
148+
allocation_size, &new_shared_ptr));
149+
ASSERT_NE(new_shared_ptr, nullptr);
150+
std::memset(new_shared_ptr, 0, allocation_size);
151+
152+
// Set new USM pointer as kernel output at index 0
153+
ur_exp_command_buffer_update_pointer_arg_desc_t new_output_desc = {
154+
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
155+
nullptr, // pNext
156+
0, // argIndex
157+
nullptr, // pProperties
158+
&new_shared_ptr, // pArgValue
159+
};
160+
161+
// Set new value to use for fill at kernel index 1
162+
uint32_t new_val = 33;
163+
ur_exp_command_buffer_update_value_arg_desc_t new_input_desc = {
164+
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
165+
nullptr, // pNext
166+
1, // argIndex
167+
sizeof(new_val), // argSize
168+
nullptr, // pProperties
169+
&new_val, // hArgValue
170+
};
171+
172+
ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
173+
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
174+
nullptr, // pNext
175+
0, // numNewMemObjArgs
176+
1, // numNewPointerArgs
177+
1, // numNewValueArgs
178+
0, // newWorkDim
179+
nullptr, // pNewMemObjArgList
180+
&new_output_desc, // pNewPointerArgList
181+
&new_input_desc, // pNewValueArgList
182+
nullptr, // pNewGlobalWorkOffset
183+
nullptr, // pNewGlobalWorkSize
184+
nullptr, // pNewLocalWorkSize
185+
};
186+
187+
// Update kernel and enqueue command-buffer
188+
ASSERT_SUCCESS(
189+
urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
190+
ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
191+
nullptr, nullptr));
192+
ASSERT_SUCCESS(urQueueFinish(queue));
193+
194+
// Verify that update occurred correctly
195+
Validate((uint32_t *)new_shared_ptr, global_size, new_val);
196+
}
197+
145198
// Test updating a command-buffer with multiple USM fill kernel commands
146199
struct USMMultipleFillCommandTest
147200
: uur::command_buffer::urUpdatableCommandBufferExpExecutionTest {

0 commit comments

Comments
 (0)