Skip to content

Commit 877beee

Browse files
authored
[SYCL][Graph] Remove explicit L0 wait from SYCL-RT (#18064)
This PR: - Removes explicit L0 event wait from SYCL-RT and moves it to the Command buffer at UR level, - Removes dead code from `exec_graph_impl::enqueue()`. Fixes #17734
1 parent 2b9291c commit 877beee

File tree

4 files changed

+39
-109
lines changed

4 files changed

+39
-109
lines changed

sycl/source/detail/graph_impl.cpp

Lines changed: 8 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,39 +1004,19 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
10041004
auto CommandBuffer = CurrentPartition->MCommandBuffers[Queue->get_device()];
10051005

10061006
if (CommandBuffer) {
1007-
// if previous submissions are incompleted, we automatically
1008-
// add completion events of previous submissions as dependencies.
1009-
// With Level-Zero backend we cannot resubmit a command-buffer until the
1010-
// previous one has already completed.
1011-
// Indeed, since a command-list does not accept a list a dependencies at
1012-
// submission, we circumvent this lack by adding a barrier that waits on a
1013-
// specific event and then define the conditions to signal this event in
1014-
// another command-list. Consequently, if a second submission is
1015-
// performed, the signal conditions of this single event are redefined by
1016-
// this second submission. Thus, this can lead to an undefined behaviour
1017-
// and potential hangs. We have therefore to expliclty wait in the host
1018-
// for previous submission to complete before resubmitting the
1019-
// command-buffer for level-zero backend.
1020-
// TODO https://github.com/intel/llvm/issues/17734
1021-
// Remove this backend specific behavior and allow multiple concurrent
1022-
// submissions of the UR command-buffer.
10231007
for (std::vector<sycl::detail::EventImplPtr>::iterator It =
10241008
MExecutionEvents.begin();
10251009
It != MExecutionEvents.end();) {
10261010
auto Event = *It;
10271011
if (!Event->isCompleted()) {
1028-
if (Queue->get_device().get_backend() ==
1029-
sycl::backend::ext_oneapi_level_zero) {
1030-
Event->wait(Event);
1031-
} else {
1032-
auto &AttachedEventsList = Event->getPostCompleteEvents();
1033-
CGData.MEvents.reserve(AttachedEventsList.size() + 1);
1034-
CGData.MEvents.push_back(Event);
1035-
// Add events of the previous execution of all graph partitions.
1036-
for (auto &AttachedEvent : AttachedEventsList) {
1037-
CGData.MEvents.push_back(AttachedEvent);
1038-
}
1039-
}
1012+
auto &AttachedEventsList = Event->getPostCompleteEvents();
1013+
CGData.MEvents.reserve(CGData.MEvents.size() +
1014+
AttachedEventsList.size() + 1);
1015+
CGData.MEvents.push_back(Event);
1016+
// Add events of the previous execution of all graph partitions.
1017+
CGData.MEvents.insert(CGData.MEvents.end(),
1018+
AttachedEventsList.begin(),
1019+
AttachedEventsList.end());
10401020
++It;
10411021
} else {
10421022
// Remove completed events
@@ -1102,46 +1082,6 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
11021082

11031083
NewEvent = sycl::detail::Scheduler::getInstance().addCG(
11041084
NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true);
1105-
} else {
1106-
std::vector<std::shared_ptr<sycl::detail::event_impl>> ScheduledEvents;
1107-
for (auto &NodeImpl : CurrentPartition->MSchedule) {
1108-
std::vector<ur_event_handle_t> RawEvents;
1109-
1110-
// If the node has no requirements for accessors etc. then we skip the
1111-
// scheduler and enqueue directly.
1112-
if (NodeImpl->MCGType == sycl::detail::CGType::Kernel &&
1113-
NodeImpl->MCommandGroup->getRequirements().size() +
1114-
static_cast<sycl::detail::CGExecKernel *>(
1115-
NodeImpl->MCommandGroup.get())
1116-
->MStreams.size() ==
1117-
0) {
1118-
sycl::detail::CGExecKernel *CG =
1119-
static_cast<sycl::detail::CGExecKernel *>(
1120-
NodeImpl->MCommandGroup.get());
1121-
auto OutEvent = CreateNewEvent();
1122-
sycl::detail::enqueueImpKernel(
1123-
Queue, CG->MNDRDesc, CG->MArgs, CG->MKernelBundle,
1124-
CG->MSyclKernel, CG->MKernelName, RawEvents, OutEvent,
1125-
// TODO: Pass accessor mem allocations
1126-
nullptr,
1127-
// TODO: Extract from handler
1128-
UR_KERNEL_CACHE_CONFIG_DEFAULT, CG->MKernelIsCooperative,
1129-
CG->MKernelUsesClusterLaunch, CG->MKernelWorkGroupMemorySize);
1130-
ScheduledEvents.push_back(NewEvent);
1131-
} else if (!NodeImpl->isEmpty()) {
1132-
// Empty nodes are node processed as other nodes, but only their
1133-
// dependencies are propagated in findRealDeps
1134-
sycl::detail::EventImplPtr EventImpl =
1135-
sycl::detail::Scheduler::getInstance().addCG(
1136-
NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true);
1137-
1138-
ScheduledEvents.push_back(EventImpl);
1139-
}
1140-
}
1141-
// Create an event which has all kernel events as dependencies
1142-
NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
1143-
NewEvent->setStateIncomplete();
1144-
NewEvent->getPreparedDepsEvents() = ScheduledEvents;
11451085
}
11461086
PartitionsExecutionEvents[CurrentPartition] = NewEvent;
11471087
}

unified-runtime/source/adapters/level_zero/command_buffer.cpp

Lines changed: 31 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1507,6 +1507,25 @@ ur_result_t getZeCommandQueue(ur_queue_handle_t Queue, bool UseCopyEngine,
15071507
return UR_RESULT_SUCCESS;
15081508
}
15091509

1510+
/**
1511+
* Waits for any ongoing executions of the command-buffer to finish.
1512+
* @param CommandBuffer The command-buffer to wait for.
1513+
* @return UR_RESULT_SUCCESS or an error code on failure
1514+
*/
1515+
ur_result_t
1516+
waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) {
1517+
1518+
if (ur_event_handle_t &CurrentSubmissionEvent =
1519+
CommandBuffer->CurrentSubmissionEvent) {
1520+
ZE2UR_CALL(zeEventHostSynchronize,
1521+
(CurrentSubmissionEvent->ZeEvent, UINT64_MAX));
1522+
UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent));
1523+
CurrentSubmissionEvent = nullptr;
1524+
}
1525+
1526+
return UR_RESULT_SUCCESS;
1527+
}
1528+
15101529
/**
15111530
* Waits for the all the dependencies of the command-buffer
15121531
* @param[in] CommandBuffer The command-buffer.
@@ -1754,6 +1773,16 @@ ur_result_t enqueueWaitEventPath(ur_exp_command_buffer_handle_t CommandBuffer,
17541773
ZE2UR_CALL(zeCommandListAppendBarrier,
17551774
(SignalCommandList->first, (*Event)->ZeEvent, 0, nullptr));
17561775

1776+
/* The event needs to be retained since it will be used later by the
1777+
command-buffer. If there is an existing event from a
1778+
previous submission of the command-buffer, release it since it is no longer
1779+
needed. */
1780+
if (CommandBuffer->CurrentSubmissionEvent) {
1781+
UR_CALL(urEventReleaseInternal(CommandBuffer->CurrentSubmissionEvent));
1782+
}
1783+
(*Event)->RefCount.increment();
1784+
CommandBuffer->CurrentSubmissionEvent = *Event;
1785+
17571786
UR_CALL(Queue->executeCommandList(SignalCommandList, false /*IsBlocking*/,
17581787
false /*OKToBatchCommand*/));
17591788

@@ -1767,6 +1796,8 @@ ur_result_t urEnqueueCommandBufferExp(
17671796

17681797
std::scoped_lock<ur_shared_mutex> Lock(UrQueue->Mutex);
17691798

1799+
UR_CALL(waitForOngoingExecution(CommandBuffer));
1800+
17701801
const bool IsInternal = (Event == nullptr);
17711802
const bool DoProfiling =
17721803
(UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) &&
@@ -1794,8 +1825,6 @@ ur_result_t urEnqueueCommandBufferExp(
17941825
EventWaitList, OutEvent, ZeCommandListHelper,
17951826
DoProfiling));
17961827
}
1797-
// Mark that synchronization will be required for later updates
1798-
CommandBuffer->NeedsUpdateSynchronization = true;
17991828

18001829
return UR_RESULT_SUCCESS;
18011830
}
@@ -2230,37 +2259,6 @@ ur_result_t updateCommandBuffer(
22302259
return UR_RESULT_SUCCESS;
22312260
}
22322261

2233-
/**
2234-
* Waits for any ongoing executions of the command-buffer to finish before
2235-
* updating.
2236-
* @param CommandBuffer The command-buffer to wait for.
2237-
* @return UR_RESULT_SUCCESS or an error code on failure
2238-
*/
2239-
ur_result_t
2240-
waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) {
2241-
// Calling function has taken a lock for the command-buffer so we can safely
2242-
// check and modify this value here.
2243-
// If command-buffer was recently synchronized we can return early.
2244-
if (!CommandBuffer->NeedsUpdateSynchronization) {
2245-
return UR_RESULT_SUCCESS;
2246-
}
2247-
2248-
if (CommandBuffer->UseImmediateAppendPath) {
2249-
if (ur_event_handle_t &CurrentSubmissionEvent =
2250-
CommandBuffer->CurrentSubmissionEvent) {
2251-
ZE2UR_CALL(zeEventHostSynchronize,
2252-
(CurrentSubmissionEvent->ZeEvent, UINT64_MAX));
2253-
UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent));
2254-
CurrentSubmissionEvent = nullptr;
2255-
}
2256-
} else if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) {
2257-
ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX));
2258-
}
2259-
// Mark that command-buffer was recently synchronized
2260-
CommandBuffer->NeedsUpdateSynchronization = false;
2261-
return UR_RESULT_SUCCESS;
2262-
}
2263-
22642262
} // namespace
22652263

22662264
ur_result_t urCommandBufferUpdateKernelLaunchExp(

unified-runtime/source/adapters/level_zero/command_buffer.hpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,6 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
144144
// This list is needed to release all kernels retained by the
145145
// command_buffer.
146146
std::vector<ur_kernel_handle_t> KernelsList;
147-
// Track whether synchronization is required when updating the command-buffer
148-
// Set this value to true when a command-buffer is enqueued, and false after
149-
// any fence or event synchronization to avoid repeated calls to synchronize.
150-
bool NeedsUpdateSynchronization = false;
151147
// Track handle objects to free when command-buffer is destroyed.
152148
std::vector<std::unique_ptr<ur_exp_command_buffer_command_handle_t_>>
153149
CommandHandles;

unified-runtime/test/conformance/exp_command_buffer/fill.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,6 @@ TEST_P(urCommandBufferFillCommandsTest, Buffer) {
128128
}
129129

130130
TEST_P(urCommandBufferFillCommandsTest, ExecuteTwice) {
131-
// TODO https://github.com/intel/llvm/issues/17734
132-
// Fail on Level-Zero due to blocking wait code in graph_impl.cpp specific
133-
// to the level-zero backend that needs moved into the Level-Zero v1 adapter.
134-
UUR_KNOWN_FAILURE_ON(uur::LevelZero{});
135131
ASSERT_SUCCESS(urCommandBufferAppendMemBufferFillExp(
136132
cmd_buf_handle, buffer, pattern.data(), pattern_size, 0, size, 0, nullptr,
137133
0, nullptr, &sync_point, nullptr, nullptr));

0 commit comments

Comments
 (0)