diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
index acb6bc5540af1..bf4beb5ae83d8 100644
--- a/sycl/source/detail/graph_impl.cpp
+++ b/sycl/source/detail/graph_impl.cpp
@@ -281,6 +281,9 @@ void exec_graph_impl::makePartitions() {
         MPartitionNodes[Node] = PartitionFinalNum;
         if (isPartitionRoot(Node)) {
           Partition->MRoots.insert(Node);
+          if (Node->MCGType == CGType::CodeplayHostTask) {
+            Partition->MIsHostTask = true;
+          }
         }
       }
     }
@@ -288,13 +291,15 @@ void exec_graph_impl::makePartitions() {
       Partition->schedule();
       Partition->MIsInOrderGraph = Partition->checkIfGraphIsSinglePath();
       MPartitions.push_back(Partition);
+      MRootPartitions.push_back(Partition);
       PartitionFinalNum++;
     }
   }
 
   // Add an empty partition if there is no partition, i.e. empty graph
-  if (MPartitions.size() == 0) {
+  if (MPartitions.empty()) {
     MPartitions.push_back(std::make_shared<partition>());
+    MRootPartitions.push_back(MPartitions[0]);
   }
 
   // Make global schedule list
@@ -309,8 +314,9 @@ void exec_graph_impl::makePartitions() {
       auto RootNode = Root.lock();
       for (const auto &Dep : RootNode->MPredecessors) {
         auto NodeDep = Dep.lock();
-        Partition->MPredecessors.push_back(
-            MPartitions[MPartitionNodes[NodeDep]]);
+        auto &Predecessor = MPartitions[MPartitionNodes[NodeDep]];
+        Partition->MPredecessors.push_back(Predecessor.get());
+        Predecessor->MSuccessors.push_back(Partition.get());
       }
     }
   }
@@ -910,6 +916,30 @@ exec_graph_impl::enqueueNode(ur_exp_command_buffer_handle_t CommandBuffer,
   return Event->getSyncPoint();
 }
 
+
+void exec_graph_impl::buildRequirements() {
+
+  for (auto &Node : MNodeStorage) {
+    if (!Node->MCommandGroup)
+      continue;
+
+    MRequirements.insert(MRequirements.end(),
+                         Node->MCommandGroup->getRequirements().begin(),
+                         Node->MCommandGroup->getRequirements().end());
+
+    std::shared_ptr<partition> &Partition = MPartitions[MPartitionNodes[Node]];
+
+    Partition->MRequirements.insert(
+        Partition->MRequirements.end(),
+        Node->MCommandGroup->getRequirements().begin(),
+        Node->MCommandGroup->getRequirements().end());
+
+    Partition->MAccessors.insert(Partition->MAccessors.end(),
+                                 Node->MCommandGroup->getAccStorage().begin(),
+                                 Node->MCommandGroup->getAccStorage().end());
+  }
+}
+
 void exec_graph_impl::createCommandBuffers(
     sycl::device Device, std::shared_ptr<partition> &Partition) {
   ur_exp_command_buffer_handle_t OutCommandBuffer;
@@ -949,16 +979,6 @@ void exec_graph_impl::createCommandBuffers(
     } else {
       MSyncPoints[Node] = enqueueNode(OutCommandBuffer, Node);
     }
-
-    // Append Node requirements to overall graph requirements
-    MRequirements.insert(MRequirements.end(),
-                         Node->MCommandGroup->getRequirements().begin(),
-                         Node->MCommandGroup->getRequirements().end());
-    // Also store the actual accessor to make sure they are kept alive when
-    // commands are submitted
-    MAccessors.insert(MAccessors.end(),
-                      Node->MCommandGroup->getAccStorage().begin(),
-                      Node->MCommandGroup->getAccStorage().end());
   }
 
   Res = Adapter
@@ -979,7 +999,7 @@ exec_graph_impl::exec_graph_impl(sycl::context Context,
           sycl::detail::getSyclObjImpl(Context), sycl::async_handler{},
           sycl::property_list{})),
       MDevice(GraphImpl->getDevice()), MContext(Context), MRequirements(),
-      MExecutionEvents(),
+      MSchedulerDependencies(),
       MIsUpdatable(PropList.has_property<property::graph::updatable>()),
       MEnableProfiling(
          PropList.has_property<property::graph::enable_profiling>()),
@@ -1026,140 +1046,298 @@ exec_graph_impl::~exec_graph_impl() {
   }
 }
 
-sycl::event
-exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
-                         sycl::detail::CG::StorageInitHelper CGData) {
-  WriteLock Lock(MMutex);
+// Clean up any execution events which have finished so we don't pass them
+// to the scheduler.
+static void cleanupExecutionEvents(std::vector<EventImplPtr> &ExecutionEvents) {
+
+  auto Predicate = [](EventImplPtr &EventPtr) {
+    return EventPtr->isCompleted();
+  };
+
+  ExecutionEvents.erase(
+      std::remove_if(ExecutionEvents.begin(), ExecutionEvents.end(), Predicate),
+      ExecutionEvents.end());
+}
+
+EventImplPtr exec_graph_impl::enqueueHostTaskPartition(
+    std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+    sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) {
+
+  auto NodeImpl = Partition->MSchedule.front();
+  auto NodeCommandGroup =
+      static_cast<sycl::detail::CGHostTask *>(NodeImpl->MCommandGroup.get());
+
+  CGData.MRequirements.insert(CGData.MRequirements.end(),
+                              NodeCommandGroup->getRequirements().begin(),
+                              NodeCommandGroup->getRequirements().end());
+  CGData.MAccStorage.insert(CGData.MAccStorage.end(),
+                            NodeCommandGroup->getAccStorage().begin(),
+                            NodeCommandGroup->getAccStorage().end());
+
+  assert(std::all_of(
+      NodeCommandGroup->MArgs.begin(), NodeCommandGroup->MArgs.end(),
+      [](ArgDesc Arg) {
+        return Arg.MType != sycl::detail::kernel_param_kind_t::kind_std_layout;
+      }));
+
+  // Create a copy of this node command-group which contains the right
+  // dependencies for the current execution.
+  std::unique_ptr<sycl::detail::CG> CommandGroup =
+      std::make_unique<sycl::detail::CGHostTask>(sycl::detail::CGHostTask(
+          NodeCommandGroup->MHostTask, &Queue, NodeCommandGroup->MContext,
+          NodeCommandGroup->MArgs, std::move(CGData),
+          NodeCommandGroup->getType()));
+
+  EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG(
+      std::move(CommandGroup), Queue.shared_from_this(), EventNeeded);
+
+  if (EventNeeded) {
+    return SchedulerEvent;
+  }
+  return nullptr;
+}
+
+EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler(
+    std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+    sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) {
+
+  if (!Partition->MRequirements.empty()) {
+    CGData.MRequirements.insert(CGData.MRequirements.end(),
+                                Partition->MRequirements.begin(),
+                                Partition->MRequirements.end());
+    CGData.MAccStorage.insert(CGData.MAccStorage.end(),
+                              Partition->MAccessors.begin(),
+                              Partition->MAccessors.end());
+  }
+
+  auto CommandBuffer = Partition->MCommandBuffers[Queue.get_device()];
+
+  std::unique_ptr<sycl::detail::CG> CommandGroup =
+      std::make_unique<sycl::detail::CGExecCommandBuffer>(
+          CommandBuffer, nullptr, std::move(CGData));
+
+  EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG(
+      std::move(CommandGroup), Queue.shared_from_this(), EventNeeded);
+
+  if (EventNeeded) {
+    SchedulerEvent->setEventFromSubmittedExecCommandBuffer(true);
+    return SchedulerEvent;
+  }
+
+  return nullptr;
+}
+
+EventImplPtr exec_graph_impl::enqueuePartitionDirectly(
+    std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+    std::vector<sycl::detail::EventImplPtr> &WaitEvents, bool EventNeeded) {
+
+  // Create a list containing all the UR event handles in WaitEvents.
+  // WaitEvents is assumed to be safe for scheduler bypass and any host-task
+  // events that it contains can be ignored.
+  std::vector<ur_event_handle_t> UrEventHandles{};
+  UrEventHandles.reserve(WaitEvents.size());
+  for (auto &SyclWaitEvent : WaitEvents) {
+    if (auto URHandle = SyclWaitEvent->getHandle()) {
+      UrEventHandles.push_back(URHandle);
+    }
+  }
 
-  // Map of the partitions to their execution events
-  std::unordered_map<std::shared_ptr<partition>, sycl::detail::EventImplPtr>
-      PartitionsExecutionEvents;
+  auto CommandBuffer = Partition->MCommandBuffers[Queue.get_device()];
+  const size_t UrEnqueueWaitListSize = UrEventHandles.size();
+  const ur_event_handle_t *UrEnqueueWaitList =
+      UrEnqueueWaitListSize == 0 ? nullptr : UrEventHandles.data();
 
-  auto CreateNewEvent([&]() {
+  if (!EventNeeded) {
+    Queue.getAdapter()
+        ->call<sycl::detail::UrApiKind::urEnqueueCommandBufferExp>(
+            Queue.getHandleRef(), CommandBuffer, UrEnqueueWaitListSize,
+            UrEnqueueWaitList, nullptr);
+    return nullptr;
+  } else {
     auto NewEvent = sycl::detail::event_impl::create_device_event(Queue);
     NewEvent->setContextImpl(Queue.getContextImpl());
     NewEvent->setStateIncomplete();
+    NewEvent->setSubmissionTime();
+    ur_event_handle_t UrEvent = nullptr;
+    Queue.getAdapter()
+        ->call<sycl::detail::UrApiKind::urEnqueueCommandBufferExp>(
+            Queue.getHandleRef(), CommandBuffer, UrEventHandles.size(),
+            UrEnqueueWaitList, &UrEvent);
+    NewEvent->setHandle(UrEvent);
+    NewEvent->setEventFromSubmittedExecCommandBuffer(true);
     return NewEvent;
-  });
-
-  sycl::detail::EventImplPtr NewEvent;
-  std::vector<sycl::detail::EventImplPtr> BackupCGDataMEvents;
-  if (MPartitions.size() > 1) {
-    BackupCGDataMEvents = CGData.MEvents;
-  }
-  for (uint32_t currentPartitionsNum = 0;
-       currentPartitionsNum < MPartitions.size(); currentPartitionsNum++) {
-    auto CurrentPartition = MPartitions[currentPartitionsNum];
-    // restore initial MEvents to add only needed additional depenencies
-    if (currentPartitionsNum > 0) {
-      CGData.MEvents = BackupCGDataMEvents;
-    }
-
-    for (auto const &DepPartition : CurrentPartition->MPredecessors) {
-      CGData.MEvents.push_back(PartitionsExecutionEvents[DepPartition]);
-    }
-
-    auto CommandBuffer = CurrentPartition->MCommandBuffers[Queue.get_device()];
-
-    if (CommandBuffer) {
-      for (std::vector<sycl::detail::EventImplPtr>::iterator It =
-               MExecutionEvents.begin();
-           It != MExecutionEvents.end();) {
-        EventImplPtr &Event = *It;
-        if (!Event->isCompleted()) {
-          const std::vector<EventImplPtr> &AttachedEventsList =
-              Event->getPostCompleteEvents();
-          CGData.MEvents.reserve(CGData.MEvents.size() +
-                                 AttachedEventsList.size() + 1);
-          CGData.MEvents.push_back(Event);
-          // Add events of the previous execution of all graph partitions.
-          CGData.MEvents.insert(CGData.MEvents.end(),
-                                AttachedEventsList.begin(),
-                                AttachedEventsList.end());
-          ++It;
-        } else {
-          // Remove completed events
-          It = MExecutionEvents.erase(It);
-        }
+  }
+}
+
+EventImplPtr
+exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
+                                   sycl::detail::CG::StorageInitHelper &CGData,
+                                   bool IsCGDataSafeForSchedulerBypass,
+                                   bool EventNeeded) {
+
+  // If EventNeeded is true, this vector is used to keep track of dependencies
+  // for the returned event. This is used when the graph has multiple end nodes
+  // which cannot be tracked with a single scheduler event.
+  std::vector<EventImplPtr> PostCompleteDependencies;
+  // TODO After refactoring the event class to use enable_shared_from_this, the
+  // events used in PostCompleteDependencies can become raw pointers as long as
+  // Event->attachEventToComplete() extends the lifetime of the pointer with
+  // shared_from_this.
+
+  // This variable represents the returned event. It will always be nullptr if
+  // EventNeeded is false.
+  EventImplPtr SignalEvent;
+
+  // CGData.MEvents gets cleared after every partition enqueue. If we need the
+  // original events, a backup needs to be created now. This is only needed
+  // when the graph contains more than one root partition.
+  std::vector<EventImplPtr> BackupCGDataEvents;
+  if (MRootPartitions.size() > 1) {
+    BackupCGDataEvents = CGData.MEvents;
+  }
+
+  for (auto &Partition : MPartitions) {
+
+    if (Partition->MPredecessors.empty() && CGData.MEvents.empty()) {
+      // If this is a root partition and CGData has been cleared already, we
+      // need to restore it so that the partition execution waits for the
+      // dependencies of this graph execution.
+      CGData.MEvents = BackupCGDataEvents;
+    } else {
+      // Partitions can have multiple dependencies from previously executed
+      // partitions. To enforce this ordering, we need to add these
+      // dependencies to CGData.
+      for (auto &Predecessor : Partition->MPredecessors) {
+        CGData.MEvents.push_back(Predecessor->MEvent);
       }
+    }
 
-      NewEvent = CreateNewEvent();
-      ur_event_handle_t UREvent = nullptr;
-      // Merge requirements from the nodes into requirements (if any) from the
-      // handler.
-      CGData.MRequirements.insert(CGData.MRequirements.end(),
-                                  MRequirements.begin(), MRequirements.end());
-      CGData.MAccStorage.insert(CGData.MAccStorage.end(), MAccessors.begin(),
-                                MAccessors.end());
-
-      // If we have no requirements or dependent events for the command buffer,
-      // enqueue it directly
-      if (CGData.MRequirements.empty() && CGData.MEvents.empty()) {
-        NewEvent->setSubmissionTime();
-        ur_result_t Res =
-            Queue.getAdapter()
-                ->call_nocheck<
                    sycl::detail::UrApiKind::urEnqueueCommandBufferExp>(
-                    Queue.getHandleRef(), CommandBuffer, 0, nullptr, &UREvent);
-        NewEvent->setHandle(UREvent);
-        if (Res == UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES) {
-          throw sycl::exception(
-              make_error_code(errc::invalid),
-              "Graphs cannot be submitted to a queue which uses "
-              "immediate command lists. Use "
-              "sycl::ext::intel::property::queue::no_immediate_"
-              "command_list to disable them.");
-        } else if (Res != UR_RESULT_SUCCESS) {
-          throw sycl::exception(
-              errc::event,
-              "Failed to enqueue event for command buffer submission");
-        }
+    // We always need to request an event to use as dependency between
+    // partition executions and between graph executions because the
+    // scheduler doesn't seem to guarantee the execution order of host-tasks
+    // without adding explicit event dependencies even when the queue is
+    // in-order.
+    constexpr bool RequestEvent = true;
+
+    EventImplPtr EnqueueEvent;
+    if (Partition->MIsHostTask) {
+      // The event returned by a host-task is always needed to synchronize with
+      // other partitions or to be used by the sycl queue as a dependency for
+      // further commands.
+      EnqueueEvent =
+          enqueueHostTaskPartition(Partition, Queue, CGData, RequestEvent);
+    } else {
+      // The scheduler can only be skipped if the partition is a root and is
+      // not a host-task. This is because all host-tasks need to go through the
+      // scheduler and, since only the scheduler can wait on host-task events,
+      // any subsequent partitions that depend on a host-task partition also
+      // need to use the scheduler.
+      bool SkipScheduler = Partition->MPredecessors.empty() &&
+                           IsCGDataSafeForSchedulerBypass &&
+                           Partition->MRequirements.empty();
+      if (SkipScheduler) {
+        EnqueueEvent = enqueuePartitionDirectly(Partition, Queue,
+                                                CGData.MEvents, RequestEvent);
       } else {
-        std::unique_ptr<sycl::detail::CG> CommandGroup =
-            std::make_unique<sycl::detail::CGExecCommandBuffer>(
-                CommandBuffer, nullptr, std::move(CGData));
+        EnqueueEvent = enqueuePartitionWithScheduler(Partition, Queue, CGData,
+                                                     RequestEvent);
+      }
+    }
 
-        NewEvent = sycl::detail::Scheduler::getInstance().addCG(
-            std::move(CommandGroup), Queue.shared_from_this(),
-            /*EventNeeded=*/true);
+    if (!Partition->MSuccessors.empty()) {
+      // Need to keep track of the EnqueueEvent for this partition so that
+      // it can be added as a dependency to CGData when successors are
+      // executed.
+      Partition->MEvent = std::move(EnqueueEvent);
+    } else {
+      // Unified runtime guarantees the execution order of command-buffers.
+      // However, since host-tasks have been scheduled, we always need to add
+      // a dependency for the next graph execution. If we don't, the next
+      // graph execution could end up with the same host-task node executing
+      // in parallel.
+      MSchedulerDependencies.push_back(EnqueueEvent);
+      if (EventNeeded) {
+        const bool IsLastPartition = (Partition == MPartitions.back());
+        if (IsLastPartition) {
+          // If we are in the last partition, move the event to SignalEvent,
+          // so that it can be returned to the user.
+          SignalEvent = std::move(EnqueueEvent);
+        } else {
+          // If it's not the last partition, keep track of the event as a post
+          // complete dependency.
+          PostCompleteDependencies.push_back(std::move(EnqueueEvent));
+        }
       }
-      NewEvent->setEventFromSubmittedExecCommandBuffer(true);
-    } else if ((CurrentPartition->MSchedule.size() > 0) &&
-               (CurrentPartition->MSchedule.front()->MCGType ==
-                sycl::detail::CGType::CodeplayHostTask)) {
+    }
 
-      auto NodeImpl = CurrentPartition->MSchedule.front();
-      // Schedule host task
-      NodeImpl->MCommandGroup->getEvents().insert(
-          NodeImpl->MCommandGroup->getEvents().end(), CGData.MEvents.begin(),
-          CGData.MEvents.end());
-      // HostTask CG stores the Queue on which the task was submitted.
-      // In case of graph, this queue may differ from the actual execution
-      // queue. We therefore overload this Queue before submitting the task.
-      static_cast<sycl::detail::CGHostTask &>(*NodeImpl->MCommandGroup.get())
-          .MQueue = Queue.shared_from_this();
+    // Clear the event list so that unnecessary dependencies are not added on
+    // future partition executions.
+    CGData.MEvents.clear();
+  }
 
-      NewEvent = sycl::detail::Scheduler::getInstance().addCG(
-          NodeImpl->getCGCopy(), Queue.shared_from_this(),
-          /*EventNeeded=*/true);
+  if (EventNeeded) {
+    for (auto &EventFromOtherPartitions : PostCompleteDependencies) {
+      SignalEvent->attachEventToComplete(EventFromOtherPartitions);
     }
-    PartitionsExecutionEvents[CurrentPartition] = NewEvent;
   }
 
-  // Keep track of this execution event so we can make sure it's completed in
-  // the destructor.
-  MExecutionEvents.push_back(NewEvent);
-  // Attach events of previous partitions to ensure that when the returned
-  // event is complete all execution associated with the graph have been
-  // completed.
-  for (auto const &Elem : PartitionsExecutionEvents) {
-    if (Elem.second != NewEvent) {
-      NewEvent->attachEventToComplete(Elem.second);
+  return SignalEvent;
+}
+
+EventImplPtr
+exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
+                         sycl::detail::CG::StorageInitHelper CGData,
+                         bool EventNeeded) {
+  WriteLock Lock(MMutex);
+
+  cleanupExecutionEvents(MSchedulerDependencies);
+  CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(),
+                        MSchedulerDependencies.end());
+
+  bool IsCGDataSafeForSchedulerBypass =
+      detail::Scheduler::areEventsSafeForSchedulerBypass(
+          CGData.MEvents, Queue.getContextImpl()) &&
+      CGData.MRequirements.empty();
+
+  // This variable represents the returned event. It will always be nullptr if
+  // EventNeeded is false.
+  EventImplPtr SignalEvent;
+
+  if (!MContainsHostTask) {
+    bool SkipScheduler =
+        IsCGDataSafeForSchedulerBypass && MPartitions[0]->MRequirements.empty();
+    if (SkipScheduler) {
+      SignalEvent = enqueuePartitionDirectly(MPartitions[0], Queue,
+                                             CGData.MEvents, EventNeeded);
+    } else {
+      bool RequestSchedulerEvent = EventNeeded || MIsUpdatable;
+      auto SchedulerEvent = enqueuePartitionWithScheduler(
+          MPartitions[0], Queue, std::move(CGData), RequestSchedulerEvent);
+
+      // If the graph is updatable, and we are going through the scheduler, we
+      // need to track the execution event to make sure that any future updates
+      // happen after the graph execution.
+      // There is no need to track the execution event when updates are not
+      // allowed because Unified Runtime already guarantees the execution order
+      // of command-buffers.
+      if (MIsUpdatable) {
+        MSchedulerDependencies.push_back(
+            EventNeeded ? SchedulerEvent : std::move(SchedulerEvent));
+      }
+
+      if (EventNeeded) {
+        SignalEvent = std::move(SchedulerEvent);
+      }
     }
+  } else {
+    SignalEvent = enqueuePartitions(
+        Queue, CGData, IsCGDataSafeForSchedulerBypass, EventNeeded);
   }
-  NewEvent->setProfilingEnabled(MEnableProfiling);
-  sycl::event QueueEvent =
-      sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
-  return QueueEvent;
+
+  if (EventNeeded) {
+    SignalEvent->setProfilingEnabled(MEnableProfiling);
+  }
+
+  return SignalEvent;
 }
 
 void exec_graph_impl::duplicateNodes() {
@@ -1408,24 +1586,16 @@ void exec_graph_impl::update(
   std::vector<sycl::detail::AccessorImplHost *> UpdateRequirements;
   bool NeedScheduledUpdate = needsScheduledUpdate(Nodes, UpdateRequirements);
   if (NeedScheduledUpdate) {
-    // Clean up any execution events which have finished so we don't pass them
-    // to the scheduler.
-    for (auto It = MExecutionEvents.begin(); It != MExecutionEvents.end();) {
-      if ((*It)->isCompleted()) {
-        It = MExecutionEvents.erase(It);
-        continue;
-      }
-      ++It;
-    }
+    cleanupExecutionEvents(MSchedulerDependencies);
 
    // Track the event for the update command since execution may be blocked by
    // other scheduler commands
    auto UpdateEvent =
        sycl::detail::Scheduler::getInstance().addCommandGraphUpdate(
            this, Nodes, MQueueImpl, std::move(UpdateRequirements),
-            MExecutionEvents);
+            MSchedulerDependencies);
 
-    MExecutionEvents.push_back(UpdateEvent);
+    MSchedulerDependencies.push_back(UpdateEvent);
 
    if (MContainsHostTask) {
      // If the graph has HostTasks, the update has to be blocking. This is
@@ -1448,17 +1618,11 @@ void exec_graph_impl::update(
   // Rebuild cached requirements and accessor storage for this graph with
   // updated nodes
   MRequirements.clear();
-  MAccessors.clear();
-  for (auto &Node : MNodeStorage) {
-    if (!Node->MCommandGroup)
-      continue;
-    MRequirements.insert(MRequirements.end(),
-                         Node->MCommandGroup->getRequirements().begin(),
-                         Node->MCommandGroup->getRequirements().end());
-    MAccessors.insert(MAccessors.end(),
-                      Node->MCommandGroup->getAccStorage().begin(),
-                      Node->MCommandGroup->getAccStorage().end());
+  for (auto &Partition : MPartitions) {
+    Partition->MRequirements.clear();
+    Partition->MAccessors.clear();
   }
+  buildRequirements();
 }
 
 bool exec_graph_impl::needsScheduledUpdate(
@@ -1500,7 +1664,7 @@ bool exec_graph_impl::needsScheduledUpdate(
 
   // If we have previous execution events do the update through the scheduler to
   // ensure it is ordered correctly.
-  NeedScheduledUpdate |= MExecutionEvents.size() > 0;
+  NeedScheduledUpdate |= MSchedulerDependencies.size() > 0;
 
   return NeedScheduledUpdate;
 }
@@ -1995,10 +2159,11 @@ void executable_command_graph::finalizeImpl() {
 
   auto Device = impl->getGraphImpl()->getDevice();
   for (auto Partition : impl->getPartitions()) {
-    if (!Partition->isHostTask()) {
+    if (!Partition->MIsHostTask) {
       impl->createCommandBuffers(Device, Partition);
     }
   }
+  impl->buildRequirements();
 }
 
 void executable_command_graph::update(
diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp
index 0ef5b506318da..3a4db99196af1 100644
--- a/sycl/source/detail/graph_impl.hpp
+++ b/sycl/source/detail/graph_impl.hpp
@@ -771,16 +771,28 @@ class partition {
   std::unordered_map<sycl::device, ur_exp_command_buffer_handle_t>
       MCommandBuffers;
   /// List of predecessors to this partition.
-  std::vector<std::shared_ptr<partition>> MPredecessors;
+  std::vector<partition *> MPredecessors;
+
+  /// List of successors to this partition.
+  std::vector<partition *> MSuccessors;
+
+  /// List of requirements for this partition.
+  std::vector<sycl::detail::AccessorImplHost *> MRequirements;
+
+  /// Storage for accessors which are used by this partition.
+  std::vector<sycl::detail::AccessorImplPtr> MAccessors;
+
   /// True if the graph of this partition is a single path graph
   /// and in-order optmization can be applied on it.
   bool MIsInOrderGraph = false;
 
-  /// @return True if the partition contains a host task
-  bool isHostTask() const {
-    return (MRoots.size() && ((*MRoots.begin()).lock()->MCGType ==
-                              sycl::detail::CGType::CodeplayHostTask));
-  }
+  /// True if this partition contains only one node which is a host_task.
+  bool MIsHostTask = false;
+
+  // Submission event for the partition. Used during enqueue to define
+  // dependencies between this partition and its successors. This event is
+  // replaced every time the partition is executed.
+  EventImplPtr MEvent;
 
   /// Checks if the graph is single path, i.e. each node has a single successor.
   /// @return True if the graph is a single path
@@ -1330,9 +1342,17 @@ class exec_graph_impl {
   /// execution.
   /// @param Queue Command-queue to schedule execution on.
   /// @param CGData Command-group data provided by the sycl::handler
-  /// @return Event associated with the execution of the graph.
-  sycl::event enqueue(sycl::detail::queue_impl &Queue,
-                      sycl::detail::CG::StorageInitHelper CGData);
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return Returns an event if EventNeeded is true. Returns nullptr
+  /// otherwise.
+  EventImplPtr enqueue(sycl::detail::queue_impl &Queue,
+                       sycl::detail::CG::StorageInitHelper CGData,
+                       bool EventNeeded);
+
+  /// Iterates through all the nodes in the graph to build the list of
+  /// accessor requirements for the whole graph and for each partition.
+  void buildRequirements();
 
   /// Turns the internal graph representation into UR command-buffers for a
   /// device.
@@ -1366,13 +1386,17 @@ class exec_graph_impl {
     return MPartitions;
   }
 
+  /// Query whether the graph contains any host-task nodes.
+  /// @return True if the graph contains any host-task nodes. False otherwise.
+  bool containsHostTask() const { return MContainsHostTask; }
+
   /// Checks if the previous submissions of this graph have been completed
   /// This function checks the status of events associated to the previous graph
   /// submissions.
   /// @return true if all previous submissions have been completed, false
   /// otherwise.
   bool previousSubmissionCompleted() const {
-    for (auto Event : MExecutionEvents) {
+    for (auto Event : MSchedulerDependencies) {
       if (!Event->isCompleted()) {
         return false;
       }
@@ -1447,6 +1471,65 @@ class exec_graph_impl {
       ur_exp_command_buffer_handle_t CommandBuffer,
       std::shared_ptr<node_impl> Node);
 
+  /// Enqueues a host-task partition (i.e. a partition that contains only a
+  /// single node and that node is a host-task).
+  /// @param Partition The partition to enqueue.
+  /// @param Queue Command-queue to schedule execution on.
+  /// @param CGData Command-group data used for initializing the host-task
+  /// command-group.
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return If EventNeeded is true returns the event resulting from enqueueing
+  /// the host-task through the scheduler. Returns nullptr otherwise.
+  EventImplPtr enqueueHostTaskPartition(
+      std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+      sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);
+
+  /// Enqueues a graph partition that contains no host-tasks using the
+  /// scheduler.
+  /// @param Partition The partition to enqueue.
+  /// @param Queue Command-queue to schedule execution on.
+  /// @param CGData Command-group data used for initializing the command-buffer
+  /// command-group.
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return If EventNeeded is true returns the event resulting from enqueueing
+  /// the command-buffer through the scheduler. Returns nullptr otherwise.
+  EventImplPtr enqueuePartitionWithScheduler(
+      std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+      sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);
+
+  /// Enqueues a graph partition that contains no host-tasks by directly calling
+  /// the unified-runtime API (i.e. avoids scheduler overhead).
+  /// @param Partition The partition to enqueue.
+  /// @param Queue Command-queue to schedule execution on.
+  /// @param WaitEvents List of events to wait on. All the events on this list
+  /// must be safe for scheduler bypass. Only events containing a valid UR event
+  /// handle will be waited for.
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return If EventNeeded is true returns the event resulting from enqueueing
+  /// the command-buffer. Returns nullptr otherwise.
+  EventImplPtr enqueuePartitionDirectly(
+      std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+      std::vector<sycl::detail::EventImplPtr> &WaitEvents, bool EventNeeded);
+
+  /// Enqueues all the partitions in a graph.
+  /// @param Queue Command-queue to schedule execution on.
+  /// @param CGData Command-group data that contains the dependencies and
+  /// accessor requirements needed to enqueue this graph.
+  /// @param IsCGDataSafeForSchedulerBypass Whether the dependencies in CGData
+  /// are safe for scheduler bypass, i.e. CGData contains no requirements and
+  /// no events (such as host-task events) that require enqueuing through the
+  /// scheduler.
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return If EventNeeded is true returns the event resulting from enqueueing
+  /// the command-buffer. Returns nullptr otherwise.
+  EventImplPtr enqueuePartitions(sycl::detail::queue_impl &Queue,
+                                 sycl::detail::CG::StorageInitHelper &CGData,
+                                 bool IsCGDataSafeForSchedulerBypass,
+                                 bool EventNeeded);
+
   /// Iterates back through predecessors to find the real dependency.
   /// @param[out] Deps Found dependencies.
   /// @param[in] CurrentNode Node to find dependencies for.
@@ -1541,11 +1624,9 @@ class exec_graph_impl {
   /// List of requirements for enqueueing this command graph, accumulated from
   /// all nodes enqueued to the graph.
   std::vector<sycl::detail::AccessorImplHost *> MRequirements;
-  /// Storage for accessors which are used by this graph, accumulated from
-  /// all nodes enqueued to the graph.
-  std::vector<sycl::detail::AccessorImplPtr> MAccessors;
-  /// List of all execution events returned from command buffer enqueue calls.
-  std::vector<sycl::detail::EventImplPtr> MExecutionEvents;
+  /// List of dependencies that enqueue or update commands need to wait on
+  /// when using the scheduler path.
+  std::vector<sycl::detail::EventImplPtr> MSchedulerDependencies;
   /// List of the partitions that compose the exec graph.
   std::vector<std::shared_ptr<partition>> MPartitions;
   /// Storage for copies of nodes from the original modifiable graph.
@@ -1554,6 +1635,8 @@ class exec_graph_impl {
   std::unordered_map<std::shared_ptr<node_impl>,
                      ur_exp_command_buffer_command_handle_t>
       MCommandMap;
+  /// List of partitions without any predecessors in this exec graph.
+  std::vector<std::shared_ptr<partition>> MRootPartitions;
   /// True if this graph can be updated (set with property::updatable)
   bool MIsUpdatable;
   /// If true, the graph profiling is enabled.
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index f9a9f41450ec4..e940ab1885334 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -336,15 +336,12 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
 
   HandlerImpl->MEventMode = SubmitInfo.EventMode();
 
-  auto isHostTask = Type == CGType::CodeplayHostTask;
-
-  // TODO: this shouldn't be needed but without this
-  // the legacy adapter doesn't synchronize the operations properly
-  // when non-immediate command lists are used.
-  auto isGraphSubmission = Type == CGType::ExecCommandBuffer;
+  auto isHostTask = Type == CGType::CodeplayHostTask ||
+                    (Type == CGType::ExecCommandBuffer &&
+                     HandlerImpl->MExecGraph->containsHostTask());
 
   auto requiresPostProcess = SubmitInfo.PostProcessorFunc() || Streams.size();
-  auto noLastEventPath = !isHostTask && !isGraphSubmission &&
+  auto noLastEventPath = !isHostTask &&
                          MNoLastEventMode.load(std::memory_order_acquire) &&
                          !requiresPostProcess;
 
@@ -363,7 +360,7 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
   } else {
     std::unique_lock<std::mutex> Lock(MMutex);
 
-    if (!isGraphSubmission && trySwitchingToNoEventsMode()) {
+    if (trySwitchingToNoEventsMode()) {
       EventImpl = finalizeHandlerInOrderNoEventsUnlocked(Handler);
     } else {
       EventImpl = finalizeHandlerInOrderWithDepsUnlocked(Handler);
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index ec7aa71196d8d..a2dfee5083095 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -757,7 +757,9 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
   detail::EventImplPtr
   finalizeHandlerInOrderHostTaskUnlocked(HandlerType &Handler) {
     assert(isInOrder());
-    assert(Handler.getType() == CGType::CodeplayHostTask);
+    assert(Handler.getType() == CGType::CodeplayHostTask ||
+           (Handler.getType() == CGType::ExecCommandBuffer &&
+            getSyclObjImpl(Handler)->MExecGraph->containsHostTask()));
 
     auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
                                               : MExtGraphDeps.LastEventPtr;
@@ -791,13 +793,8 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
   finalizeHandlerInOrderWithDepsUnlocked(HandlerType &Handler) {
     // this is handled by finalizeHandlerInOrderHostTask
     assert(Handler.getType() != CGType::CodeplayHostTask);
-
-    if (Handler.getType() == CGType::ExecCommandBuffer && MNoLastEventMode) {
-      // TODO: this shouldn't be needed but without this
-      // the legacy adapter doesn't synchronize the operations properly
-      // when non-immediate command lists are used.
-      Handler.depends_on(insertHelperBarrier(Handler));
-    }
+    assert(!(Handler.getType() == CGType::ExecCommandBuffer &&
+             getSyclObjImpl(Handler)->MExecGraph->containsHostTask()));
 
     auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
                                               : MExtGraphDeps.LastEventPtr;
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 7959442d78dea..203b777a6ade7 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -798,15 +798,20 @@ event handler::finalize() {
               nullptr, impl->MExecGraph, std::move(impl->CGData)));
 
     } else {
-      event GraphCompletionEvent =
-          impl->MExecGraph->enqueue(impl->get_queue(), std::move(impl->CGData));
-
+      detail::queue_impl &Queue = impl->get_queue();
+      bool DiscardEvent = !impl->MEventNeeded &&
+                          Queue.supportsDiscardingPiEvents() &&
+                          !impl->MExecGraph->containsHostTask();
+      detail::EventImplPtr GraphCompletionEvent = impl->MExecGraph->enqueue(
+          Queue, std::move(impl->CGData), !DiscardEvent);
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-      MLastEvent = getSyclObjImpl(GraphCompletionEvent);
+      return GraphCompletionEvent;
 #else
-      MLastEvent = GraphCompletionEvent;
+      return sycl::detail::createSyclObjFromImpl<event>(
+          GraphCompletionEvent
+              ? GraphCompletionEvent
+              : sycl::detail::event_impl::create_discarded_event());
 #endif
-      return MLastEvent;
     }
   } break;
   case detail::CGType::CopyImage:
diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp
new file mode 100644
index 0000000000000..b9751546e2923
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp
@@ -0,0 +1,76 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// Tests the enqueue free function kernel shortcuts.
+
+#include "../graph_common.hpp"
+#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
+#include <sycl/usm.hpp>
+
+int main() {
+  device Device{};
+  context Context{Device};
+
+  queue InOrderQueue{Context, Device, property::queue::in_order{}};
+  queue OtherQueue{Context, Device, property::queue::in_order{}};
+
+  using T = int;
+
+  T *PtrA = malloc_device<T>(Size, InOrderQueue);
+  T *PtrB = malloc_device<T>(Size, InOrderQueue);
+  T *PtrC = malloc_device<T>(Size, InOrderQueue);
+
+  exp_ext::command_graph Graph{InOrderQueue};
+  Graph.begin_recording(InOrderQueue);
+
+  T Pattern = 42;
+  exp_ext::fill(InOrderQueue, PtrA, Pattern, Size);
+
+  exp_ext::single_task(InOrderQueue, [=]() {
+    for (size_t i = 0; i < Size; ++i) {
+      PtrB[i] = i;
+    }
+  });
+
+  exp_ext::parallel_for(
+      InOrderQueue, sycl::range<1>{Size},
+      [=](sycl::item<1> Item) { PtrC[Item] += PtrA[Item] * PtrB[Item]; });
+
+  std::vector<T> Output(Size);
+  exp_ext::copy(InOrderQueue, PtrC, Output.data(), Size);
+
+  Graph.end_recording();
+
+  auto GraphExec = Graph.finalize();
+
+  const size_t MemsetValue = 12;
+  sycl::event Event =
+      exp_ext::submit_with_event(OtherQueue, [&](sycl::handler &CGH) {
+        exp_ext::single_task(CGH, [=]() {
+          for (size_t I = 0; I < Size; ++I)
+            PtrC[I] = MemsetValue;
+        });
+      });
+
+  exp_ext::submit(InOrderQueue, [&](sycl::handler &CGH) {
+    CGH.depends_on(Event);
+    exp_ext::execute_graph(CGH, GraphExec);
+  });
+
+  InOrderQueue.wait_and_throw();
+
+  free(PtrA, InOrderQueue);
+  free(PtrB, InOrderQueue);
+  free(PtrC, InOrderQueue);
+
+  for (size_t i = 0; i < Size; i++) {
+    T Ref = Pattern * i + MemsetValue;
+    assert(check_value(i, Ref, Output[i], "Output"));
+  }
+
+  return 0;
+}
diff --git a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp
index dfb100acec848..fe4731dc31535 100644
--- a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp
+++ b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp
@@ -417,11 +417,11 @@ TEST_F(CommandGraphTest, GraphPartitionsMerging) {
   auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec);
   auto PartitionsList = GraphExecImpl->getPartitions();
   ASSERT_EQ(PartitionsList.size(), 5ul);
-  ASSERT_FALSE(PartitionsList[0]->isHostTask());
-  ASSERT_TRUE(PartitionsList[1]->isHostTask());
-  ASSERT_FALSE(PartitionsList[2]->isHostTask());
-  ASSERT_TRUE(PartitionsList[3]->isHostTask());
-  ASSERT_FALSE(PartitionsList[4]->isHostTask());
+  ASSERT_FALSE(PartitionsList[0]->MIsHostTask);
+  ASSERT_TRUE(PartitionsList[1]->MIsHostTask);
+  ASSERT_FALSE(PartitionsList[2]->MIsHostTask);
+  ASSERT_TRUE(PartitionsList[3]->MIsHostTask);
+  ASSERT_FALSE(PartitionsList[4]->MIsHostTask);
 }
 
 TEST_F(CommandGraphTest, GetNodeFromEvent) {
diff --git a/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp b/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp
index c10cf2033bae2..cb234c687c249 100644
--- a/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp
+++ b/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp
@@ -529,3 +529,159 @@ TEST_F(CommandGraphTest, InOrderQueueMemcpyAndGraph) {
   auto EventGraph = InOrderQueue.submit(
       [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
 }
+
+// Validate that enqueuing a graph with
+// sycl::ext::oneapi::experimental::execute_graph using an in-order queue
+// does not request a signaling event from the UR backend and has no event
+// dependencies.
+TEST_F(CommandGraphTest, InOrderQueueEventless) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  // Finalize main graph and check schedule
+  auto GraphExec = InOrderGraph.finalize();
+
+  auto beforeUrEnqueueCommandBufferExp = [](void *pParams) -> ur_result_t {
+    auto params =
+        *static_cast<ur_enqueue_command_buffer_exp_params_t *>(pParams);
+    EXPECT_TRUE(*params.pnumEventsInWaitList == 0);
+    EXPECT_TRUE(*params.pphEventWaitList == nullptr);
+    EXPECT_TRUE(*params.pphEvent == nullptr);
+
+    return UR_RESULT_SUCCESS;
+  };
+
+  sycl::unittest::UrMock<> Mock;
+  mock::getCallbacks().set_before_callback("urEnqueueCommandBufferExp",
+                                           beforeUrEnqueueCommandBufferExp);
+
+  const size_t Iterations = 5;
+  for (size_t I = 0; I < Iterations; ++I) {
+    sycl::ext::oneapi::experimental::execute_graph(InOrderQueue, GraphExec);
+  }
+}
+
+// Validate that if an event is requested when enqueueing a graph with
+// sycl::ext::oneapi::experimental::submit_with_event using an in-order queue,
+// the implementation requests a signal event but doesn't wait on any event
+// dependencies.
+TEST_F(CommandGraphTest, InOrderQueueRequestEvent) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  // Finalize main graph and check schedule
+  auto GraphExec = InOrderGraph.finalize();
+
+  auto beforeUrEnqueueCommandBufferExp = [](void *pParams) -> ur_result_t {
+    auto params =
+        *static_cast<ur_enqueue_command_buffer_exp_params_t *>(pParams);
+    EXPECT_TRUE(*params.pnumEventsInWaitList == 0);
+    EXPECT_TRUE(*params.pphEventWaitList == nullptr);
+    EXPECT_TRUE(*params.pphEvent != nullptr);
+
+    return UR_RESULT_SUCCESS;
+  };
+
+  sycl::unittest::UrMock<> Mock;
+  mock::getCallbacks().set_before_callback("urEnqueueCommandBufferExp",
+                                           beforeUrEnqueueCommandBufferExp);
+
+  const size_t Iterations = 5;
+  std::vector<sycl::event> OutputEvents;
+
+  for (size_t I = 0; I < Iterations; ++I) {
+    OutputEvents.push_back(sycl::ext::oneapi::experimental::submit_with_event(
+        InOrderQueue,
+        [&](sycl::handler &cgh) { cgh.ext_oneapi_graph(GraphExec); }));
+  }
+}
+
+// Validate that enqueuing a graph using an in-order queue with an event
+// dependency does not request a signaling event from the UR backend and has
+// 1 event dependency.
+TEST_F(CommandGraphTest, InOrderQueueEventlessWithDependency) {
+  device Dev{};
+  context Context{Dev};
+
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Context, Dev, Properties};
+  sycl::queue OtherQueue{Context, Dev, Properties};
+
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  auto GraphExec = InOrderGraph.finalize();
+
+  auto beforeUrEnqueueCommandBufferExp = [](void *pParams) -> ur_result_t {
+    auto params =
+        *static_cast<ur_enqueue_command_buffer_exp_params_t *>(pParams);
+    EXPECT_TRUE(*params.pnumEventsInWaitList == 1);
+    EXPECT_TRUE(*params.pphEvent == nullptr);
+
+    return UR_RESULT_SUCCESS;
+  };
+
+  sycl::unittest::UrMock<> Mock;
+  mock::getCallbacks().set_before_callback("urEnqueueCommandBufferExp",
+                                           beforeUrEnqueueCommandBufferExp);
+
+  sycl::event Event = sycl::ext::oneapi::experimental::submit_with_event(
+      OtherQueue, [&](sycl::handler &CGH) {
+        sycl::ext::oneapi::experimental::single_task<TestKernel<>>(
+            CGH, [=]() {});
+      });
+
+  const size_t Iterations = 5;
+  for (size_t I = 0; I < Iterations; ++I) {
+    sycl::ext::oneapi::experimental::submit(
+        InOrderQueue, [&](sycl::handler &CGH) {
+          CGH.depends_on(Event);
+          sycl::ext::oneapi::experimental::execute_graph(CGH, GraphExec);
+        });
+  }
+}
\ No newline at end of file
diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
index d34cd0c2d5de9..b6380276e5826 100644
--- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
@@ -61,6 +61,8 @@ class LimitedHandler {
     handler_impl(std::shared_ptr<MockQueueImpl> Queue) : MQueue(Queue) {}
     std::shared_ptr<MockQueueImpl> MQueue;
     MockQueueImpl &get_queue() { return *MQueue; }
+    std::shared_ptr<sycl::ext::oneapi::experimental::detail::exec_graph_impl>
+        MExecGraph;
   };
   std::shared_ptr<handler_impl> impl;
   std::shared_ptr<sycl::detail::kernel_impl> MKernel;
diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp
index af058cdde4cf0..0514afdf71669 100644
--- a/unified-runtime/source/adapters/hip/command_buffer.cpp
+++ b/unified-runtime/source/adapters/hip/command_buffer.cpp
@@ -273,6 +273,10 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
 UR_APIEXPORT ur_result_t UR_APICALL
 urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
   if (hCommandBuffer->decrementReferenceCount() == 0) {
+    if (hCommandBuffer->CurrentExecution) {
+      UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait());
+      UR_CHECK_ERROR(urEventRelease(hCommandBuffer->CurrentExecution));
+    }
     delete hCommandBuffer;
   }
   return UR_RESULT_SUCCESS;
@@ -795,22 +799,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp(
     hipStream_t HIPStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
+    if (hCommandBuffer->CurrentExecution) {
+      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1,
+                                       &hCommandBuffer->CurrentExecution));
+      UR_CHECK_ERROR(urEventRelease(hCommandBuffer->CurrentExecution));
+    }
+
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
                                      phEventWaitList));
 
-    if (phEvent) {
-      RetImplEvent = std::make_unique<ur_event_handle_t_>(
-          UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, hQueue, HIPStream,
-          StreamToken);
-      UR_CHECK_ERROR(RetImplEvent->start());
-    }
+    RetImplEvent = std::make_unique<ur_event_handle_t_>(
+        UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, hQueue, HIPStream, StreamToken);
+    UR_CHECK_ERROR(RetImplEvent->start());
 
     // Launch graph
     UR_CHECK_ERROR(hipGraphLaunch(hCommandBuffer->HIPGraphExec, HIPStream));
+    UR_CHECK_ERROR(RetImplEvent->record());
+
+    hCommandBuffer->CurrentExecution = RetImplEvent.release();
 
     if (phEvent) {
-      UR_CHECK_ERROR(RetImplEvent->record());
-      *phEvent = RetImplEvent.release();
+      UR_CHECK_ERROR(urEventRetain(hCommandBuffer->CurrentExecution));
+      *phEvent = hCommandBuffer->CurrentExecution;
     }
   } catch (ur_result_t Err) {
     return Err;
diff --git a/unified-runtime/source/adapters/hip/command_buffer.hpp b/unified-runtime/source/adapters/hip/command_buffer.hpp
index 3d0047adee013..728d97719035b 100644
--- a/unified-runtime/source/adapters/hip/command_buffer.hpp
+++ b/unified-runtime/source/adapters/hip/command_buffer.hpp
@@ -128,6 +128,10 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base {
   // Atomic variable counting the number of reference to this command_buffer
   // using std::atomic prevents data race when incrementing/decrementing.
   std::atomic_uint32_t RefCount;
+  // Track the event of the current graph execution. This extra synchronization
+  // is needed because HIP (unlike CUDA) does not seem to synchronize with other
+  // executions of the same graph during hipGraphLaunch and hipGraphExecDestroy.
+  ur_event_handle_t CurrentExecution = nullptr;
 
   // Ordered map of sync_points to ur_events
   std::map<ur_exp_command_buffer_sync_point_t, ur_event_handle_t> SyncPoints;
diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp
index 6101e3b6d4ea0..a69f23f286dfd 100644
--- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp
+++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp
@@ -1488,34 +1488,33 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer,
   std::scoped_lock<ur_shared_mutex> Guard(CommandBuffer->Mutex);
   const bool UseCopyEngine = false;
   bool MustSignalWaitEvent = true;
-  if (NumEventsInWaitList) {
-    ur_ze_event_list_t TmpWaitList;
-    UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
-        NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
-
-    // Update the WaitList of the Wait Event
-    // Events are appended to the WaitList if the WaitList is not empty
-    if (CommandBuffer->WaitEvent->WaitList.isEmpty())
-      CommandBuffer->WaitEvent->WaitList = TmpWaitList;
-    else
-      CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList);
-
-    if (!CommandBuffer->WaitEvent->WaitList.isEmpty()) {
-      // Create command-list to execute before `CommandListPtr` and will signal
-      // when `EventWaitList` dependencies are complete.
-      ur_command_list_ptr_t WaitCommandList{};
-      UR_CALL(Queue->Context->getAvailableCommandList(
-          Queue, WaitCommandList, false /*UseCopyEngine*/, NumEventsInWaitList,
-          EventWaitList, false /*AllowBatching*/, nullptr /*ForcedCmdQueue*/));
-
-      ZE2UR_CALL(zeCommandListAppendBarrier,
-                 (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
-                  CommandBuffer->WaitEvent->WaitList.Length,
-                  CommandBuffer->WaitEvent->WaitList.ZeEventList));
-      Queue->executeCommandList(WaitCommandList, false /*IsBlocking*/,
-                                false /*OKToBatchCommand*/);
-      MustSignalWaitEvent = false;
-    }
+
+  ur_ze_event_list_t TmpWaitList;
+  UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
+      NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
+
+  // Update the WaitList of the Wait Event
+  // Events are appended to the WaitList if the WaitList is not empty
+  if (CommandBuffer->WaitEvent->WaitList.isEmpty())
+    CommandBuffer->WaitEvent->WaitList = TmpWaitList;
+  else
+    CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList);
+
+  if (!CommandBuffer->WaitEvent->WaitList.isEmpty()) {
+    // Create command-list to execute before `CommandListPtr` and will signal
+    // when `EventWaitList` dependencies are complete.
+    ur_command_list_ptr_t WaitCommandList{};
+    UR_CALL(Queue->Context->getAvailableCommandList(
+        Queue, WaitCommandList, false /*UseCopyEngine*/, NumEventsInWaitList,
+        EventWaitList, false /*AllowBatching*/, nullptr /*ForcedCmdQueue*/));
+
+    ZE2UR_CALL(zeCommandListAppendBarrier,
+               (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
+                CommandBuffer->WaitEvent->WaitList.Length,
+                CommandBuffer->WaitEvent->WaitList.ZeEventList));
+    Queue->executeCommandList(WaitCommandList, false /*IsBlocking*/,
+                              false /*OKToBatchCommand*/);
+    MustSignalWaitEvent = false;
  }
  // Given WaitEvent was created without specifying Counting Events, then this
  // event can be signalled on the host.
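For context, the following is a minimal usage sketch of the submission paths this patch optimizes. It is not part of the patch: the kernel bodies are placeholders, and the header path for the enqueue free functions is assumed from the e2e test above. When a finalized graph has no host-task nodes, no accessor requirements, and only dependencies that are safe for scheduler bypass, exec_graph_impl::enqueue can skip the SYCL scheduler, call urEnqueueCommandBufferExp directly, and (for an eventless submission) request no signal event from the backend.

#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>

namespace exp_ext = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Q{sycl::property::queue::in_order{}};

  // Record a graph with no host-tasks and no buffer accessors, so the
  // finalized graph qualifies for the direct (scheduler-bypass) path.
  exp_ext::command_graph Graph{Q};
  Graph.begin_recording(Q);
  exp_ext::single_task(Q, []() { /* placeholder kernel */ });
  Graph.end_recording();
  auto GraphExec = Graph.finalize();

  // Eventless submission: with this patch the runtime can enqueue the
  // command-buffer directly and pass a null signal event, which is what
  // the InOrderQueueEventless unit test checks.
  exp_ext::execute_graph(Q, GraphExec);

  // Event-returning submission: only the single signal event is requested,
  // matching the InOrderQueueRequestEvent unit test.
  sycl::event E = exp_ext::submit_with_event(
      Q, [&](sycl::handler &CGH) { exp_ext::execute_graph(CGH, GraphExec); });
  E.wait();

  Q.wait_and_throw();
  return 0;
}

Graphs containing host-task partitions still take the multi-partition scheduler path (enqueuePartitions above), since only the scheduler can wait on host-task events.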