Skip to content

Commit d0f6694

Browse files
[L0] Fix urEnqueueEventsWaitWithBarrier option1
Signed-off-by: Zhang, Winston <winston.zhang@intel.com>
1 parent f08d42c commit d0f6694

File tree

2 files changed

+296
-3
lines changed

2 files changed

+296
-3
lines changed

source/adapters/level_zero/event.cpp

Lines changed: 293 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -435,9 +435,300 @@ ur_result_t urEnqueueEventsWaitWithBarrierExt(
435435
*OutEvent ///< [in,out][optional] return an event object that identifies
436436
///< this particular command instance.
437437
) {
438-
return ur::level_zero::urEnqueueEventsWaitWithBarrier(
439-
Queue, NumEventsInWaitList, EventWaitList, OutEvent);
438+
bool InterruptBased =
439+
EnqueueExtProp &&
440+
(EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS);
441+
if (!InterruptBased) {
442+
return ur::level_zero::urEnqueueEventsWaitWithBarrier(
443+
Queue, NumEventsInWaitList, EventWaitList, OutEvent);
444+
}
445+
// Lock automatically releases when this goes out of scope.
446+
std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);
447+
448+
// Helper function for appending a barrier to a command list.
449+
auto insertBarrierIntoCmdList = [&Queue](ur_command_list_ptr_t CmdList,
450+
_ur_ze_event_list_t &EventWaitList,
451+
ur_event_handle_t &Event,
452+
bool IsInternal) {
453+
UR_CALL(createEventAndAssociateQueue(
454+
Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, IsInternal,
455+
false, std::nullopt, true));
456+
Event->WaitList = EventWaitList;
457+
458+
// For in-order queue we don't need a real barrier, just wait for
459+
// requested events in potentially different queues and add a "barrier"
460+
// event signal because it is already guaranteed that previous commands
461+
// in this queue are completed when the signal is started.
462+
//
463+
// The only consideration here is that when profiling is used, signalEvent
464+
// cannot be used if EventWaitList.Length == 0. In those cases, we need
465+
// to fall back directly to a barrier to have correct timestamps. See here:
466+
// https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t
467+
//
468+
// TODO: this and other special handling of in-order queues to be
469+
// updated when/if Level Zero adds native support for in-order queues.
470+
//
471+
if (Queue->isInOrderQueue() && InOrderBarrierBySignal &&
472+
!Queue->isProfilingEnabled()) {
473+
if (EventWaitList.Length) {
474+
if (CmdList->second.IsInOrderList) {
475+
for (unsigned i = EventWaitList.Length; i-- > 0;) {
476+
// If the event is a multidevice event, then given driver in order
477+
// lists, we cannot include this into the wait event list due to
478+
// driver limitations.
479+
if (EventWaitList.UrEventList[i]->IsMultiDevice) {
480+
EventWaitList.Length--;
481+
if (EventWaitList.Length != i) {
482+
std::swap(EventWaitList.UrEventList[i],
483+
EventWaitList.UrEventList[EventWaitList.Length]);
484+
std::swap(EventWaitList.ZeEventList[i],
485+
EventWaitList.ZeEventList[EventWaitList.Length]);
486+
}
487+
}
488+
}
489+
}
490+
ZE2UR_CALL(
491+
zeCommandListAppendWaitOnEvents,
492+
(CmdList->first, EventWaitList.Length, EventWaitList.ZeEventList));
493+
}
494+
ZE2UR_CALL(zeCommandListAppendSignalEvent,
495+
(CmdList->first, Event->ZeEvent));
496+
} else {
497+
ZE2UR_CALL(zeCommandListAppendBarrier,
498+
(CmdList->first, Event->ZeEvent, EventWaitList.Length,
499+
EventWaitList.ZeEventList));
500+
}
501+
502+
return UR_RESULT_SUCCESS;
503+
};
504+
505+
// If the queue is in-order then each command in it effectively acts as a
505+
// barrier, so we don't need to do anything unless a "barrier" event was
506+
// requested to be created, or we need to wait for events in potentially
507+
// different queues.
509+
//
510+
if (Queue->isInOrderQueue() && NumEventsInWaitList == 0 && !OutEvent) {
511+
return UR_RESULT_SUCCESS;
512+
}
513+
514+
ur_event_handle_t ResultEvent = nullptr;
515+
bool IsInternal = OutEvent == nullptr;
516+
// For in-order queue and wait-list which is empty or has events from
517+
// the same queue just use the last command event as the barrier event.
518+
// This optimization is disabled when profiling is enabled to ensure
519+
// accurate profiling values & the overhead that profiling incurs.
520+
if (Queue->isInOrderQueue() && !Queue->isProfilingEnabled() &&
521+
WaitListEmptyOrAllEventsFromSameQueue(Queue, NumEventsInWaitList,
522+
EventWaitList) &&
523+
Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) {
524+
UR_CALL(ur::level_zero::urEventRetain(Queue->LastCommandEvent));
525+
ResultEvent = Queue->LastCommandEvent;
526+
if (OutEvent) {
527+
*OutEvent = ResultEvent;
528+
}
529+
return UR_RESULT_SUCCESS;
530+
}
531+
532+
// Indicator for whether batching is allowed. This may be changed later in
533+
// this function, but allow it by default.
534+
bool OkToBatch = true;
535+
536+
// If we have a list of events to make the barrier from, then we can create a
537+
// barrier on these and use the resulting event as our future barrier.
538+
// We use the same approach if
539+
// UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a
540+
// positive value.
541+
// We use the same approach if we have in-order queue because every command
542+
// depends on previous one, so we don't need to insert barrier to multiple
543+
// command lists.
544+
if (NumEventsInWaitList || !UseMultipleCmdlistBarriers ||
545+
Queue->isInOrderQueue()) {
546+
// Retain the events as they will be owned by the result event.
547+
_ur_ze_event_list_t TmpWaitList;
548+
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
549+
NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/));
550+
551+
// Get an arbitrary command-list in the queue.
552+
ur_command_list_ptr_t CmdList;
553+
UR_CALL(Queue->Context->getAvailableCommandList(
554+
Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList,
555+
EventWaitList, OkToBatch, nullptr /*ForcedCmdQueue*/));
556+
557+
// Insert the barrier into the command-list and execute.
558+
UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, ResultEvent,
559+
IsInternal));
560+
561+
UR_CALL(
562+
Queue->executeCommandList(CmdList, false /*IsBlocking*/, OkToBatch));
563+
564+
// Because of the dependency between commands in the in-order queue we don't
565+
// need to keep track of any active barriers if we have in-order queue.
566+
if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) {
567+
auto UREvent = reinterpret_cast<ur_event_handle_t>(ResultEvent);
568+
Queue->ActiveBarriers.add(UREvent);
569+
}
570+
571+
if (OutEvent) {
572+
*OutEvent = ResultEvent;
573+
}
574+
return UR_RESULT_SUCCESS;
575+
}
576+
577+
// Since there are no events to explicitly create a barrier for, we are
578+
// inserting a queue-wide barrier.
579+
580+
// Command list(s) for putting barriers.
581+
std::vector<ur_command_list_ptr_t> CmdLists;
582+
583+
// There must be at least one L0 queue.
584+
auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get();
585+
auto &CopyGroup = Queue->CopyQueueGroupsByTID.get();
586+
UR_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(),
587+
UR_RESULT_ERROR_INVALID_QUEUE);
588+
589+
size_t NumQueues = 0;
590+
for (auto &QueueMap :
591+
{Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID})
592+
for (auto &QueueGroup : QueueMap)
593+
NumQueues += QueueGroup.second.ZeQueues.size();
594+
595+
OkToBatch = true;
596+
// Get an available command list tied to each command queue. We need
597+
// these so a queue-wide barrier can be inserted into each command
598+
// queue.
599+
CmdLists.reserve(NumQueues);
600+
for (auto &QueueMap :
601+
{Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID})
602+
for (auto &QueueGroup : QueueMap) {
603+
bool UseCopyEngine =
604+
QueueGroup.second.Type != ur_queue_handle_t_::queue_type::Compute;
605+
if (Queue->UsingImmCmdLists) {
606+
// If immediate command lists are being used, each will act as their own
607+
// queue, so we must insert a barrier into each.
608+
for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists)
609+
if (ImmCmdList != Queue->CommandListMap.end())
610+
CmdLists.push_back(ImmCmdList);
611+
} else {
612+
for (auto ZeQueue : QueueGroup.second.ZeQueues) {
613+
if (ZeQueue) {
614+
ur_command_list_ptr_t CmdList;
615+
UR_CALL(Queue->Context->getAvailableCommandList(
616+
Queue, CmdList, UseCopyEngine, NumEventsInWaitList,
617+
EventWaitList, OkToBatch, &ZeQueue));
618+
CmdLists.push_back(CmdList);
619+
}
620+
}
621+
}
622+
}
623+
624+
// If no activity has occurred on the queue then there will be no cmdlists.
625+
// We need one for generating an Event, so create one.
626+
if (CmdLists.size() == 0) {
627+
// Get any available command list.
628+
ur_command_list_ptr_t CmdList;
629+
UR_CALL(Queue->Context->getAvailableCommandList(
630+
Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList,
631+
EventWaitList, OkToBatch, nullptr /*ForcedCmdQueue*/));
632+
CmdLists.push_back(CmdList);
633+
}
634+
635+
if (CmdLists.size() > 1) {
636+
// Insert a barrier into each unique command queue using the available
637+
// command-lists.
638+
std::vector<ur_event_handle_t> EventWaitVector(CmdLists.size());
639+
for (size_t I = 0; I < CmdLists.size(); ++I) {
640+
_ur_ze_event_list_t waitlist;
641+
UR_CALL(insertBarrierIntoCmdList(
642+
CmdLists[I], waitlist, EventWaitVector[I], true /*IsInternal*/));
643+
}
644+
// If there were multiple queues we need to create a "convergence" event to
645+
// be our active barrier. This convergence event is signalled by a barrier
646+
// on all the events from the barriers we have inserted into each queue.
647+
// Use the first command list as our convergence command list.
648+
ur_command_list_ptr_t &ConvergenceCmdList = CmdLists[0];
649+
650+
// Create an event list. It will take ownership over all relevant events so
651+
// we relinquish ownership and let it keep all events it needs.
652+
_ur_ze_event_list_t BaseWaitList;
653+
UR_CALL(BaseWaitList.createAndRetainUrZeEventList(
654+
EventWaitVector.size(),
655+
reinterpret_cast<const ur_event_handle_t *>(EventWaitVector.data()),
656+
Queue, ConvergenceCmdList->second.isCopy(Queue)));
657+
658+
// Insert a barrier with the events from each command-queue into the
659+
// convergence command list. The resulting event signals the convergence of
660+
// all barriers.
661+
UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList,
662+
ResultEvent, IsInternal));
663+
} else {
664+
// If there is only a single queue then insert a barrier and the single
665+
// result event can be used as our active barrier and used as the return
666+
// event. Take into account whether output event is discarded or not.
667+
_ur_ze_event_list_t waitlist;
668+
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], waitlist, ResultEvent,
669+
IsInternal));
670+
}
671+
672+
// Execute each command list so the barriers can be encountered.
673+
for (ur_command_list_ptr_t &CmdList : CmdLists) {
674+
bool IsCopy =
675+
CmdList->second.isCopy(reinterpret_cast<ur_queue_handle_t>(Queue));
676+
const auto &CommandBatch =
677+
(IsCopy) ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch;
678+
// Only batch if the matching CmdList is already open.
679+
OkToBatch = CommandBatch.OpenCommandList == CmdList;
680+
681+
UR_CALL(
682+
Queue->executeCommandList(CmdList, false /*IsBlocking*/, OkToBatch));
683+
}
684+
685+
UR_CALL(Queue->ActiveBarriers.clear());
686+
Queue->ActiveBarriers.add(ResultEvent);
687+
if (OutEvent) {
688+
*OutEvent = ResultEvent;
689+
}
690+
return UR_RESULT_SUCCESS;
440691
}
692+
/*
693+
ur_result_t urEnqueueEventsWaitWithBarrierExt(
694+
ur_queue_handle_t Queue, ///< [in] handle of the queue object
695+
const ur_exp_enqueue_ext_properties_t
696+
*EnqueueExtProp, ///< [in][optional] pointer to the extended enqueue
697+
properties uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
698+
const ur_event_handle_t
699+
*EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
700+
///< pointer to a list of events that must be complete
701+
///< before this command can be executed. If nullptr,
702+
///< the numEventsInWaitList must be 0, indicating that
703+
///< all previously enqueued commands must be complete.
704+
ur_event_handle_t
705+
*OutEvent ///< [in,out][optional] return an event object that identifies
706+
///< this particular command instance.
707+
) {
708+
bool InterruptBased = EnqueueExtProp && (EnqueueExtProp->flags &
709+
UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS); ur_event_handle_t ResultEvent =
710+
nullptr;
711+
712+
if (InterruptBased) {
713+
// Create the event with interrupt-based properties
714+
ur_command_list_ptr_t CmdList;
715+
UR_CALL(Queue->Context->getAvailableCommandList(Queue, CmdList, false,
716+
NumEventsInWaitList, EventWaitList, true, nullptr));
717+
UR_CALL(createEventAndAssociateQueue(Queue, &ResultEvent,
718+
UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, true, false, std::nullopt,
719+
InterruptBased));
720+
}
721+
722+
ur_result_t result = ur::level_zero::urEnqueueEventsWaitWithBarrier(
723+
Queue, NumEventsInWaitList, EventWaitList, OutEvent);
724+
725+
if (InterruptBased && OutEvent) {
726+
*OutEvent = ResultEvent;
727+
}
728+
return result;
729+
}
730+
731+
*/
441732

442733
ur_result_t urEventGetInfo(
443734
ur_event_handle_t Event, ///< [in] handle of the event object

source/adapters/level_zero/queue.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1896,7 +1896,9 @@ ur_result_t createEventAndAssociateQueue(
18961896
UR_CALL(EventCreate(
18971897
Queue->Context, Queue, IsMultiDevice, HostVisible.value(), Event,
18981898
Queue->CounterBasedEventsEnabled, false /*ForceDisableProfiling*/,
1899-
HostVisible.has_value() ? true : Queue->interruptBasedEventsEnabled()));
1899+
InterruptBasedEvents.has_value()
1900+
? InterruptBasedEvents.value()
1901+
: Queue->interruptBasedEventsEnabled()));
19001902

19011903
(*Event)->UrQueue = Queue;
19021904
(*Event)->CommandType = CommandType;

0 commit comments

Comments
 (0)