Skip to content

Commit f3c5519

Browse files
authored
Merge pull request #1526 from nrspruit/event_signal_optional
[L0] Enable Batching out of order commands without signal events
2 parents ebf873f + 987c422 commit f3c5519

File tree

12 files changed

+226
-49
lines changed

12 files changed

+226
-49
lines changed

source/adapters/level_zero/command_buffer.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,8 +1048,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
10481048
// Create a command-list that executes before `CommandListPtr` and signals
10491049
// when the `EventWaitList` dependencies are complete.
10501050
ur_command_list_ptr_t WaitCommandList{};
1051-
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
1052-
false, false));
1051+
UR_CALL(Queue->Context->getAvailableCommandList(
1052+
Queue, WaitCommandList, false, NumEventsInWaitList, EventWaitList,
1053+
false));
10531054

10541055
ZE2UR_CALL(zeCommandListAppendBarrier,
10551056
(WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
@@ -1086,7 +1087,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
10861087
// Create a command-list to signal RetEvent on completion
10871088
ur_command_list_ptr_t SignalCommandList{};
10881089
UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList,
1089-
false, false));
1090+
false, NumEventsInWaitList,
1091+
EventWaitList, false));
10901092
// Reset the wait-event for the UR command-buffer that is signaled when its
10911093
// submission dependencies have been satisfied.
10921094
ZE2UR_CALL(zeCommandListAppendEventReset,

source/adapters/level_zero/common.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,19 @@ static const uint32_t UrL0QueueSyncNonBlocking = [] {
241241
return L0QueueSyncLockingModeValue;
242242
}();
243243

244+
// Controls whether the L0 Adapter creates signal events for commands on
245+
// integrated GPU devices. Read once from UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT.
246+
static const uint32_t UrL0OutOfOrderIntegratedSignalEvent = [] {
247+
const char *UrL0OutOfOrderIntegratedSignalEventEnv =
248+
std::getenv("UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT");
249+
uint32_t UrL0OutOfOrderIntegratedSignalEventValue = 1; // default when unset
250+
if (UrL0OutOfOrderIntegratedSignalEventEnv) {
251+
UrL0OutOfOrderIntegratedSignalEventValue =
252+
std::atoi(UrL0OutOfOrderIntegratedSignalEventEnv); // int result stored as uint32_t
253+
}
254+
return UrL0OutOfOrderIntegratedSignalEventValue;
255+
}();
256+
244257
// This class encapsulates actions taken along with a call to Level Zero API.
245258
class ZeCall {
246259
private:

source/adapters/level_zero/context.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -645,7 +645,8 @@ static const size_t CmdListsCleanupThreshold = [] {
645645
// Retrieve an available command list to be used in a PI call.
646646
ur_result_t ur_context_handle_t_::getAvailableCommandList(
647647
ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList,
648-
bool UseCopyEngine, bool AllowBatching,
648+
bool UseCopyEngine, uint32_t NumEventsInWaitList,
649+
const ur_event_handle_t *EventWaitList, bool AllowBatching,
649650
ze_command_queue_handle_t *ForcedCmdQueue) {
650651
// Immediate commandlists have been pre-allocated and are always available.
651652
if (Queue->UsingImmCmdLists) {
@@ -677,9 +678,17 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
677678
// for this queue.
678679
if (Queue->hasOpenCommandList(UseCopyEngine)) {
679680
if (AllowBatching) {
680-
CommandList = CommandBatch.OpenCommandList;
681-
UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
682-
return UR_RESULT_SUCCESS;
681+
bool batchingAllowed = true;
682+
if (!UrL0OutOfOrderIntegratedSignalEvent &&
683+
Queue->Device->isIntegrated()) {
684+
batchingAllowed = eventCanBeBatched(Queue, UseCopyEngine,
685+
NumEventsInWaitList, EventWaitList);
686+
}
687+
if (batchingAllowed) {
688+
CommandList = CommandBatch.OpenCommandList;
689+
UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
690+
return UR_RESULT_SUCCESS;
691+
}
683692
}
684693
// If this command isn't allowed to be batched or doesn't match the forced
685694
// command queue, then we need to go ahead and execute what is already in

source/adapters/level_zero/context.hpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -292,11 +292,11 @@ struct ur_context_handle_t_ : _ur_object {
292292
// When using immediate commandlists, retrieves an immediate command list
293293
// for executing on this device. Immediate commandlists are created only
294294
// once for each SYCL Queue and after that they are reused.
295-
ur_result_t
296-
getAvailableCommandList(ur_queue_handle_t Queue,
297-
ur_command_list_ptr_t &CommandList,
298-
bool UseCopyEngine, bool AllowBatching = false,
299-
ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
295+
ur_result_t getAvailableCommandList(
296+
ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList,
297+
bool UseCopyEngine, uint32_t NumEventsInWaitList,
298+
const ur_event_handle_t *EventWaitList, bool AllowBatching = false,
299+
ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
300300

301301
// Checks if Device is covered by this context.
302302
// For that the Device or its root devices need to be in the context.

source/adapters/level_zero/device.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,10 @@ struct ur_device_handle_t_ : _ur_object {
176176
(ZeDeviceProperties->deviceId & 0xff0) == 0xb60;
177177
}
178178

179+
// Does this device have ZE_DEVICE_PROPERTY_FLAG_INTEGRATED set in its flags?
bool isIntegrated() {
180+
return (ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
181+
}
182+
179183
// Does this device represent a single compute slice?
180184
bool isCCS() const {
181185
return QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute]

source/adapters/level_zero/event.cpp

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
8484

8585
// Get a new command list to be used on this call
8686
ur_command_list_ptr_t CommandList{};
87-
UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
88-
UseCopyEngine));
87+
UR_CALL(Queue->Context->getAvailableCommandList(
88+
Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList));
8989

9090
ze_event_handle_t ZeEvent = nullptr;
9191
ur_event_handle_t InternalEvent;
@@ -256,7 +256,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
256256
// Get an arbitrary command-list in the queue.
257257
ur_command_list_ptr_t CmdList;
258258
UR_CALL(Queue->Context->getAvailableCommandList(
259-
Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch));
259+
Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList,
260+
EventWaitList, OkToBatch));
260261

261262
// Insert the barrier into the command-list and execute.
262263
UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal));
@@ -311,7 +312,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
311312
if (ZeQueue) {
312313
ur_command_list_ptr_t CmdList;
313314
UR_CALL(Queue->Context->getAvailableCommandList(
314-
Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue));
315+
Queue, CmdList, UseCopyEngine, NumEventsInWaitList,
316+
EventWaitList, OkToBatch, &ZeQueue));
315317
CmdLists.push_back(CmdList);
316318
}
317319
}
@@ -324,7 +326,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
324326
// Get any available command list.
325327
ur_command_list_ptr_t CmdList;
326328
UR_CALL(Queue->Context->getAvailableCommandList(
327-
Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch));
329+
Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList,
330+
EventWaitList, OkToBatch));
328331
CmdLists.push_back(CmdList);
329332
}
330333

@@ -598,6 +601,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
598601
this->Mutex);
599602

600603
if (!HostVisibleEvent) {
604+
this->IsCreatingHostProxyEvent = true;
601605
if (UrQueue->ZeEventsScope != OnDemandHostVisibleProxy)
602606
die("getOrCreateHostVisibleEvent: missing host-visible event");
603607

@@ -612,20 +616,26 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
612616

613617
ur_command_list_ptr_t CommandList{};
614618
UR_CALL(UrQueue->Context->getAvailableCommandList(
615-
UrQueue, CommandList, false /* UseCopyEngine */, OkToBatch))
619+
UrQueue, CommandList, false /* UseCopyEngine */, 0, nullptr, OkToBatch))
616620

617621
// Create a "proxy" host-visible event.
618622
UR_CALL(createEventAndAssociateQueue(
619623
UrQueue, &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList,
620624
/* IsInternal */ false, /* IsMultiDevice */ false,
621625
/* HostVisible */ true));
622626

623-
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
624-
(CommandList->first, 1, &ZeEvent));
627+
if (this->IsInnerBatchedEvent) {
628+
ZE2UR_CALL(zeCommandListAppendBarrier,
629+
(CommandList->first, ZeEvent, 0, nullptr));
630+
} else {
631+
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
632+
(CommandList->first, 1, &ZeEvent));
633+
}
625634
ZE2UR_CALL(zeCommandListAppendSignalEvent,
626635
(CommandList->first, HostVisibleEvent->ZeEvent));
627636

628637
UR_CALL(UrQueue->executeCommandList(CommandList, false, OkToBatch))
638+
this->IsCreatingHostProxyEvent = false;
629639
}
630640

631641
ZeHostVisibleEvent = HostVisibleEvent->ZeEvent;
@@ -682,7 +692,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait(
682692

683693
ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent;
684694
logger::debug("ZeEvent = {}", ur_cast<std::uintptr_t>(ZeEvent));
685-
ZE2UR_CALL(zeHostSynchronize, (ZeEvent));
695+
// If this event was an inner batched event, then sync with
696+
// the Queue instead of waiting on the event.
697+
if (HostVisibleEvent->IsInnerBatchedEvent && Event->ZeBatchedQueue) {
698+
ZE2UR_CALL(zeHostSynchronize, (Event->ZeBatchedQueue));
699+
} else {
700+
ZE2UR_CALL(zeHostSynchronize, (ZeEvent));
701+
}
686702
Event->Completed = true;
687703
}
688704
}
@@ -938,7 +954,12 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked,
938954
std::list<ur_event_handle_t> EventsToBeReleased;
939955
ur_queue_handle_t AssociatedQueue = nullptr;
940956
{
941-
std::scoped_lock<ur_shared_mutex> EventLock(Event->Mutex);
957+
// Try to lock the Event. If the lock is unavailable, continue without it only
958+
// while a host proxy event is being created; otherwise block on the lock.
959+
std::unique_lock<ur_shared_mutex> EventLock(Event->Mutex, std::try_to_lock);
960+
if (!EventLock.owns_lock() && !Event->IsCreatingHostProxyEvent) {
961+
EventLock.lock();
962+
}
942963
if (SetEventCompleted)
943964
Event->Completed = true;
944965
// Exit early of event was already cleanedup.
@@ -1324,8 +1345,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
13241345
// Get a command list prior to acquiring an event lock.
13251346
// This prevents a potential deadlock with recursive
13261347
// event locks.
1327-
UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
1328-
false, true));
1348+
UR_CALL(Queue->Context->getAvailableCommandList(
1349+
Queue, CommandList, false, 0, nullptr, true));
13291350
}
13301351

13311352
std::shared_lock<ur_shared_mutex> Lock(EventList[I]->Mutex);

source/adapters/level_zero/event.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,15 @@ struct ur_event_handle_t_ : _ur_object {
198198
// performance
199199
bool IsMultiDevice = {false};
200200

201+
// Indicates inner batched event which was not used as a signal event.
202+
bool IsInnerBatchedEvent = {false};
203+
204+
// Queue where the batched command was executed.
205+
ze_command_queue_handle_t ZeBatchedQueue = {nullptr};
206+
207+
// Indicates that a host-visible proxy event is currently being created.
208+
bool IsCreatingHostProxyEvent = {false};
209+
201210
// Besides each PI object keeping a total reference count in
202211
// _ur_object::RefCount we keep special track of the event *external*
203212
// references. This way we are able to tell when the event is not referenced

source/adapters/level_zero/image.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -790,8 +790,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
790790

791791
// Get a new command list to be used on this call
792792
ur_command_list_ptr_t CommandList{};
793-
UR_CALL(hQueue->Context->getAvailableCommandList(hQueue, CommandList,
794-
UseCopyEngine, OkToBatch));
793+
UR_CALL(hQueue->Context->getAvailableCommandList(
794+
hQueue, CommandList, UseCopyEngine, numEventsInWaitList, phEventWaitList,
795+
OkToBatch));
795796

796797
ze_event_handle_t ZeEvent = nullptr;
797798
ur_event_handle_t InternalEvent;
@@ -800,7 +801,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
800801
UR_CALL(createEventAndAssociateQueue(hQueue, Event, UR_COMMAND_MEM_IMAGE_COPY,
801802
CommandList, IsInternal,
802803
/*IsMultiDevice*/ false));
803-
ZeEvent = (*Event)->ZeEvent;
804+
UR_CALL(setSignalEvent(hQueue, UseCopyEngine, &ZeEvent, Event,
805+
numEventsInWaitList, phEventWaitList,
806+
CommandList->second.ZeQueue));
804807
(*Event)->WaitList = TmpWaitList;
805808

806809
const auto &ZeCommandList = CommandList->first;

source/adapters/level_zero/kernel.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
201201
// Get a new command list to be used on this call
202202
ur_command_list_ptr_t CommandList{};
203203
UR_CALL(Queue->Context->getAvailableCommandList(
204-
Queue, CommandList, UseCopyEngine, true /* AllowBatching */));
204+
Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
205+
true /* AllowBatching */));
205206

206207
ze_event_handle_t ZeEvent = nullptr;
207208
ur_event_handle_t InternalEvent{};
@@ -210,7 +211,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
210211

211212
UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH,
212213
CommandList, IsInternal, false));
213-
ZeEvent = (*Event)->ZeEvent;
214+
UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
215+
NumEventsInWaitList, EventWaitList,
216+
CommandList->second.ZeQueue));
214217
(*Event)->WaitList = TmpWaitList;
215218

216219
// Save the kernel in the event, so that when the event is signalled

0 commit comments

Comments
 (0)