From 7a5c5fd17c4ab0db7cc65d80d51619c558f34a86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Mon, 26 May 2025 19:15:25 +0100 Subject: [PATCH 01/20] Optimization enqueue work in progress --- sycl/source/detail/graph_impl.cpp | 483 ++++++++++++------ sycl/source/detail/graph_impl.hpp | 116 ++++- sycl/source/detail/queue_impl.cpp | 13 +- sycl/source/detail/queue_impl.hpp | 13 +- sycl/source/handler.cpp | 16 +- ...pi_enqueue_functions_with_dependencies.cpp | 72 +++ .../adapters/level_zero/command_buffer.cpp | 40 +- 7 files changed, 542 insertions(+), 211 deletions(-) create mode 100644 sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index abb2eaea4bd68..8ac0d64e75ee5 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -281,6 +281,9 @@ void exec_graph_impl::makePartitions() { MPartitionNodes[Node] = PartitionFinalNum; if (isPartitionRoot(Node)) { Partition->MRoots.insert(Node); + if (Node->MCGType == CGType::CodeplayHostTask) { + Partition->MIsHostTask = true; + } } } } @@ -288,6 +291,7 @@ void exec_graph_impl::makePartitions() { Partition->schedule(); Partition->MIsInOrderGraph = Partition->checkIfGraphIsSinglePath(); MPartitions.push_back(Partition); + MRootPartitions.push_back(Partition); PartitionFinalNum++; } } @@ -295,6 +299,7 @@ void exec_graph_impl::makePartitions() { // Add an empty partition if there is no partition, i.e. empty graph if (MPartitions.size() == 0) { MPartitions.push_back(std::make_shared()); + MRootPartitions.push_back(MPartitions[0]); } // Make global schedule list @@ -309,8 +314,9 @@ void exec_graph_impl::makePartitions() { auto RootNode = Root.lock(); for (const auto &Dep : RootNode->MPredecessors) { auto NodeDep = Dep.lock(); - Partition->MPredecessors.push_back( - MPartitions[MPartitionNodes[NodeDep]]); + auto &Predecessor = MPartitions[MPartitionNodes[NodeDep]]; + Partition->MPredecessors.push_back(Predecessor); + Predecessor->MSuccessors.push_back(Partition); } } } @@ -881,6 +887,30 @@ exec_graph_impl::enqueueNode(ur_exp_command_buffer_handle_t CommandBuffer, return Event->getSyncPoint(); } + +void exec_graph_impl::buildRequirements() { + + for (auto &Node : MNodeStorage) { + if (!Node->MCommandGroup) + continue; + + MRequirements.insert(MRequirements.end(), + Node->MCommandGroup->getRequirements().begin(), + Node->MCommandGroup->getRequirements().end()); + + std::shared_ptr &Partition = MPartitions[MPartitionNodes[Node]]; + + Partition->MRequirements.insert( + Partition->MRequirements.end(), + Node->MCommandGroup->getRequirements().begin(), + Node->MCommandGroup->getRequirements().end()); + + Partition->MAccessors.insert(Partition->MAccessors.end(), + Node->MCommandGroup->getAccStorage().begin(), + Node->MCommandGroup->getAccStorage().end()); + } +} + void exec_graph_impl::createCommandBuffers( sycl::device Device, std::shared_ptr &Partition) { ur_exp_command_buffer_handle_t OutCommandBuffer; @@ -920,16 +950,6 @@ void exec_graph_impl::createCommandBuffers( } else { MSyncPoints[Node] = enqueueNode(OutCommandBuffer, Node); } - - // Append Node requirements to overall graph requirements - MRequirements.insert(MRequirements.end(), - Node->MCommandGroup->getRequirements().begin(), - Node->MCommandGroup->getRequirements().end()); - // Also store the actual accessor to make sure they are kept alive when - // commands are submitted - MAccessors.insert(MAccessors.end(), - Node->MCommandGroup->getAccStorage().begin(), - Node->MCommandGroup->getAccStorage().end()); } Res = Adapter @@ -950,7 +970,7 @@ exec_graph_impl::exec_graph_impl(sycl::context Context, sycl::detail::getSyclObjImpl(Context), sycl::async_handler{}, sycl::property_list{})), MDevice(GraphImpl->getDevice()), MContext(Context), MRequirements(), - MExecutionEvents(), + MSchedulerDependencies(), MIsUpdatable(PropList.has_property()), MEnableProfiling( PropList.has_property()), @@ -977,11 +997,6 @@ exec_graph_impl::~exec_graph_impl() { const sycl::detail::AdapterPtr &Adapter = sycl::detail::getSyclObjImpl(MContext)->getAdapter(); MSchedule.clear(); - // We need to wait on all command buffer executions before we can release - // them. - for (auto &Event : MExecutionEvents) { - Event->wait(Event); - } // Clean up any graph-owned allocations that were allocated MGraphImpl->getMemPool().deallocateAndUnmapAll(); @@ -1002,138 +1017,309 @@ exec_graph_impl::~exec_graph_impl() { } } -sycl::event -exec_graph_impl::enqueue(const std::shared_ptr &Queue, - sycl::detail::CG::StorageInitHelper CGData) { - WriteLock Lock(MMutex); +// Clean up any execution events which have finished so we don't pass them +// to the scheduler. +static void cleanupExecutionEvents(std::vector &ExecutionEvents) { - // Map of the partitions to their execution events - std::unordered_map, sycl::detail::EventImplPtr> - PartitionsExecutionEvents; + auto Predicate = [](EventImplPtr &EventPtr) { + return EventPtr->isCompleted(); + }; - auto CreateNewEvent([&]() { + ExecutionEvents.erase( + std::remove_if(ExecutionEvents.begin(), ExecutionEvents.end(), Predicate), + ExecutionEvents.end()); +} + +EventImplPtr exec_graph_impl::enqueueHostTaskPartition( + std::shared_ptr &Partition, + const std::shared_ptr &Queue, + sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) { + + auto NodeImpl = Partition->MSchedule.front(); + auto NodeCommandGroup = + static_cast(NodeImpl->MCommandGroup.get()); + + CGData.MRequirements.insert(CGData.MRequirements.end(), + NodeCommandGroup->getRequirements().begin(), + NodeCommandGroup->getRequirements().end()); + CGData.MAccStorage.insert(CGData.MAccStorage.end(), + NodeCommandGroup->getAccStorage().begin(), + NodeCommandGroup->getAccStorage().end()); + + assert(std::all_of( + NodeCommandGroup->MArgs.begin(), NodeCommandGroup->MArgs.end(), + [](ArgDesc Arg) { + return Arg.MType != sycl::detail::kernel_param_kind_t::kind_std_layout; + })); + + // Create a copy of this node command-group which contains the right + // dependencies for the current execution. + std::unique_ptr CommandGroup = + std::make_unique(sycl::detail::CGHostTask( + NodeCommandGroup->MHostTask, Queue, NodeCommandGroup->MContext, + NodeCommandGroup->MArgs, std::move(CGData), + NodeCommandGroup->getType())); + + EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), Queue, EventNeeded); + + if (EventNeeded) { + return SchedulerEvent; + } + return nullptr; +} + +EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler( + std::shared_ptr &Partition, + const std::shared_ptr &Queue, + sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) { + + if (!Partition->MRequirements.empty()) { + CGData.MRequirements.insert(CGData.MRequirements.end(), + Partition->MRequirements.begin(), + Partition->MRequirements.end()); + CGData.MAccStorage.insert(CGData.MAccStorage.end(), + Partition->MAccessors.begin(), + Partition->MAccessors.end()); + } + + auto CommandBuffer = Partition->MCommandBuffers[Queue->get_device()]; + + std::unique_ptr CommandGroup = + std::make_unique( + CommandBuffer, nullptr, std::move(CGData)); + + EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), Queue, EventNeeded); + + if (EventNeeded) { + SchedulerEvent->setEventFromSubmittedExecCommandBuffer(true); + return SchedulerEvent; + } + + return nullptr; +} + +EventImplPtr exec_graph_impl::enqueuePartitionDirectly( + std::shared_ptr &Partition, + const std::shared_ptr &Queue, + std::vector &WaitEvents, bool EventNeeded) { + + auto CheckURResult = [](ur_result_t UrResult) { + if (UrResult != UR_RESULT_SUCCESS) { + throw sycl::exception( + errc::event, "Failed to enqueue event for command buffer submission"); + } + }; + + // Create a list containing all the UR event handles in WaitEvents. WaitEvents + // is assumed to be safe for scheduler bypass and any host-task events that it + // contains can be ignored. + std::vector UrEventHandles{}; + UrEventHandles.reserve(WaitEvents.size()); + for (auto &SyclWaitEvent : WaitEvents) { + auto URHandle = SyclWaitEvent->getHandle(); + if (URHandle) { + UrEventHandles.push_back(URHandle); + } + } + + auto CommandBuffer = Partition->MCommandBuffers[Queue->get_device()]; + const size_t UrEnqueueWaitListSize = UrEventHandles.size(); + const ur_event_handle_t *UrEnqueueWaitList = + UrEnqueueWaitListSize == 0 ? nullptr : UrEventHandles.data(); + + if (!EventNeeded) { + ur_result_t UrResult = + Queue->getAdapter() + ->call_nocheck( + Queue->getHandleRef(), CommandBuffer, UrEnqueueWaitListSize, + UrEnqueueWaitList, nullptr); + CheckURResult(UrResult); + return nullptr; + } else { auto NewEvent = std::make_shared(Queue); NewEvent->setContextImpl(Queue->getContextImplPtr()); NewEvent->setStateIncomplete(); + NewEvent->setSubmissionTime(); + ur_event_handle_t UrEvent = nullptr; + ur_result_t UrResult = + Queue->getAdapter() + ->call_nocheck( + Queue->getHandleRef(), CommandBuffer, UrEventHandles.size(), + UrEnqueueWaitList, &UrEvent); + CheckURResult(UrResult); + NewEvent->setHandle(UrEvent); + NewEvent->setEventFromSubmittedExecCommandBuffer(true); return NewEvent; - }); - - sycl::detail::EventImplPtr NewEvent; - std::vector BackupCGDataMEvents; - if (MPartitions.size() > 1) { - BackupCGDataMEvents = CGData.MEvents; - } - for (uint32_t currentPartitionsNum = 0; - currentPartitionsNum < MPartitions.size(); currentPartitionsNum++) { - auto CurrentPartition = MPartitions[currentPartitionsNum]; - // restore initial MEvents to add only needed additional depenencies - if (currentPartitionsNum > 0) { - CGData.MEvents = BackupCGDataMEvents; - } - - for (auto const &DepPartition : CurrentPartition->MPredecessors) { - CGData.MEvents.push_back(PartitionsExecutionEvents[DepPartition]); - } - - auto CommandBuffer = CurrentPartition->MCommandBuffers[Queue->get_device()]; - - if (CommandBuffer) { - for (std::vector::iterator It = - MExecutionEvents.begin(); - It != MExecutionEvents.end();) { - EventImplPtr &Event = *It; - if (!Event->isCompleted()) { - const std::vector &AttachedEventsList = - Event->getPostCompleteEvents(); - CGData.MEvents.reserve(CGData.MEvents.size() + - AttachedEventsList.size() + 1); - CGData.MEvents.push_back(Event); - // Add events of the previous execution of all graph partitions. - CGData.MEvents.insert(CGData.MEvents.end(), - AttachedEventsList.begin(), - AttachedEventsList.end()); - ++It; - } else { - // Remove completed events - It = MExecutionEvents.erase(It); - } + } +} + +EventImplPtr exec_graph_impl::enqueuePartitions( + const std::shared_ptr &Queue, + sycl::detail::CG::StorageInitHelper &CGData, + bool IsCGDataSafeForSchedulerBypass, bool EventNeeded) { + + // If EventNeeded is true, this vector is used to keep track of dependencies + // for the returned event. This is used when the graph has multiple end nodes + // which cannot be tracked with a single scheduler event. + std::vector PostCompleteDependencies; + + // This variable represents the returned event. It will always be nullptr if + // EventNeeded is false. + EventImplPtr SignalEvent; + + // CGData.MEvents gets cleared after every partition enqueue. If we need the + // original events, a backup needs to be created now. This is only needed when + // the graph contains more than one root partition. + std::vector BackupCGDataEvents; + if (MRootPartitions.size() > 1) { + BackupCGDataEvents = CGData.MEvents; + } + + for (auto &Partition : MPartitions) { + + if (Partition->MPredecessors.empty() && CGData.MEvents.empty()) { + // If this is a root partition and CGData has been cleared already, we + // need to restore it so that the partition execution waits for the + // dependencies of this graph execution. + CGData.MEvents = BackupCGDataEvents; + } else { + // Partitions can have multiple dependencies from previously executed + // partitions. To enforce this ordering, we need to add these dependencies + // to CGData. + for (auto &Predecessor : Partition->MPredecessors) { + CGData.MEvents.push_back(Predecessor.lock()->MEvent); } + } - NewEvent = CreateNewEvent(); - ur_event_handle_t UREvent = nullptr; - // Merge requirements from the nodes into requirements (if any) from the - // handler. - CGData.MRequirements.insert(CGData.MRequirements.end(), - MRequirements.begin(), MRequirements.end()); - CGData.MAccStorage.insert(CGData.MAccStorage.end(), MAccessors.begin(), - MAccessors.end()); - - // If we have no requirements or dependent events for the command buffer, - // enqueue it directly - if (CGData.MRequirements.empty() && CGData.MEvents.empty()) { - NewEvent->setSubmissionTime(); - ur_result_t Res = - Queue->getAdapter() - ->call_nocheck< - sycl::detail::UrApiKind::urEnqueueCommandBufferExp>( - Queue->getHandleRef(), CommandBuffer, 0, nullptr, &UREvent); - NewEvent->setHandle(UREvent); - if (Res == UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES) { - throw sycl::exception( - make_error_code(errc::invalid), - "Graphs cannot be submitted to a queue which uses " - "immediate command lists. Use " - "sycl::ext::intel::property::queue::no_immediate_" - "command_list to disable them."); - } else if (Res != UR_RESULT_SUCCESS) { - throw sycl::exception( - errc::event, - "Failed to enqueue event for command buffer submission"); - } + bool IsLastPartition = (Partition == MPartitions.back()); + EventImplPtr EnqueueEvent; + + // We always need to request an event to use as dependency between + // partitions executions and between graph executions because the + // scheduler doesn't seem to guarantee the execution order of host-tasks + // without adding explicit event dependencies even when the queue is + // in-order. + constexpr bool RequestEvent = true; + + if (Partition->MIsHostTask) { + // The event returned by a host-task is always needed to synchronize with + // other partitions or to be used by the sycl queue as a dependency for + // further commands. + EnqueueEvent = + enqueueHostTaskPartition(Partition, Queue, CGData, RequestEvent); + } else { + // The scheduler can only be skipped if the partition is a root and is not + // a host-task. This is because all host-tasks need to go through the + // scheduler and, since only the scheduler can wait on host-task events, + // any subsequent partitions that depend on a host-task partition also + // need to use the scheduler. + bool SkipScheduler = Partition->MPredecessors.empty() && + IsCGDataSafeForSchedulerBypass && + Partition->MRequirements.empty(); + if (SkipScheduler) { + EnqueueEvent = enqueuePartitionDirectly(Partition, Queue, + CGData.MEvents, RequestEvent); } else { - std::unique_ptr CommandGroup = - std::make_unique( - CommandBuffer, nullptr, std::move(CGData)); + EnqueueEvent = enqueuePartitionWithScheduler(Partition, Queue, CGData, + RequestEvent); + } + } - NewEvent = sycl::detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), Queue, /*EventNeeded=*/true); + if (!Partition->MSuccessors.empty()) { + // Need to keep track of the EnqueueEvent for this partition so that + // it can be added as a dependency to CGData when successors are executed. + Partition->MEvent = std::move(EnqueueEvent); + } else { + // Unified runtime guarantees the execution order of command-buffers. + // However, since host-tasks have been scheduled, we always need to add a + // dependency for the next graph execution. If we don't the next graph + // execution could end up with the same host-task node executing in + // parallel. + MSchedulerDependencies.push_back(EnqueueEvent); + if (EventNeeded) { + if (IsLastPartition) { + // If we are in the last partition copy the event to SignalEvent, + // so that it can be returned to the user. + SignalEvent = std::move(EnqueueEvent); + } else { + // If it's not the last partition, keep track of the event as a post + // complete dependency. + PostCompleteDependencies.push_back(std::move(EnqueueEvent)); + } } - NewEvent->setEventFromSubmittedExecCommandBuffer(true); - } else if ((CurrentPartition->MSchedule.size() > 0) && - (CurrentPartition->MSchedule.front()->MCGType == - sycl::detail::CGType::CodeplayHostTask)) { + } - auto NodeImpl = CurrentPartition->MSchedule.front(); - // Schedule host task - NodeImpl->MCommandGroup->getEvents().insert( - NodeImpl->MCommandGroup->getEvents().end(), CGData.MEvents.begin(), - CGData.MEvents.end()); - // HostTask CG stores the Queue on which the task was submitted. - // In case of graph, this queue may differ from the actual execution - // queue. We therefore overload this Queue before submitting the task. - static_cast(*NodeImpl->MCommandGroup.get()) - .MQueue = Queue; + // Clear the event list so that unnecessary dependencies are not added on + // future partition executions. + CGData.MEvents.clear(); + } - NewEvent = sycl::detail::Scheduler::getInstance().addCG( - NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true); + if (EventNeeded) { + for (auto &EventFromOtherPartitions : PostCompleteDependencies) { + SignalEvent->attachEventToComplete(EventFromOtherPartitions); } - PartitionsExecutionEvents[CurrentPartition] = NewEvent; } - // Keep track of this execution event so we can make sure it's completed in - // the destructor. - MExecutionEvents.push_back(NewEvent); - // Attach events of previous partitions to ensure that when the returned event - // is complete all execution associated with the graph have been completed. - for (auto const &Elem : PartitionsExecutionEvents) { - if (Elem.second != NewEvent) { - NewEvent->attachEventToComplete(Elem.second); + return SignalEvent; +} + +EventImplPtr +exec_graph_impl::enqueue(const std::shared_ptr &Queue, + sycl::detail::CG::StorageInitHelper CGData, + bool EventNeeded) { + WriteLock Lock(MMutex); + + cleanupExecutionEvents(MSchedulerDependencies); + CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(), + MSchedulerDependencies.end()); + + bool IsCGDataSafeForSchedulerBypass = + detail::Scheduler::areEventsSafeForSchedulerBypass( + CGData.MEvents, Queue->getContextImplPtr()) && + CGData.MRequirements.empty(); + + // This variable represents the returned event. It will always be nullptr if + // EventNeeded is false. + EventImplPtr SignalEvent; + + if (!MContainsHostTask) { + bool SkipScheduler = + IsCGDataSafeForSchedulerBypass && MPartitions[0]->MRequirements.empty(); + if (SkipScheduler) { + SignalEvent = enqueuePartitionDirectly(MPartitions[0], Queue, + CGData.MEvents, EventNeeded); + } else { + bool RequestSchedulerEvent = EventNeeded || MIsUpdatable; + auto SchedulerEvent = enqueuePartitionWithScheduler( + MPartitions[0], Queue, std::move(CGData), RequestSchedulerEvent); + + // If the graph is updatable, and we are going through the scheduler, we + // need to track the execution event to make sure that any future updates + // happen after the graph execution. + // There is no need to track the execution event when updates are not + // allowed because Unified Runtime already guarantees the execution order + // of command-buffers. + if (MIsUpdatable) { + MSchedulerDependencies.push_back( + EventNeeded ? SchedulerEvent : std::move(SchedulerEvent)); + } + + if (EventNeeded) { + SignalEvent = std::move(SchedulerEvent); + } } + } else { + SignalEvent = enqueuePartitions( + Queue, CGData, IsCGDataSafeForSchedulerBypass, EventNeeded); } - NewEvent->setProfilingEnabled(MEnableProfiling); - sycl::event QueueEvent = - sycl::detail::createSyclObjFromImpl(NewEvent); - return QueueEvent; + + if (EventNeeded) { + SignalEvent->setProfilingEnabled(MEnableProfiling); + } + + return SignalEvent; } void exec_graph_impl::duplicateNodes() { @@ -1382,24 +1568,16 @@ void exec_graph_impl::update( std::vector UpdateRequirements; bool NeedScheduledUpdate = needsScheduledUpdate(Nodes, UpdateRequirements); if (NeedScheduledUpdate) { - // Clean up any execution events which have finished so we don't pass them - // to the scheduler. - for (auto It = MExecutionEvents.begin(); It != MExecutionEvents.end();) { - if ((*It)->isCompleted()) { - It = MExecutionEvents.erase(It); - continue; - } - ++It; - } + cleanupExecutionEvents(MSchedulerDependencies); // Track the event for the update command since execution may be blocked by // other scheduler commands auto UpdateEvent = sycl::detail::Scheduler::getInstance().addCommandGraphUpdate( this, Nodes, MQueueImpl, std::move(UpdateRequirements), - MExecutionEvents); + MSchedulerDependencies); - MExecutionEvents.push_back(UpdateEvent); + MSchedulerDependencies.push_back(UpdateEvent); if (MContainsHostTask) { // If the graph has HostTasks, the update has to be blocking. This is @@ -1422,17 +1600,11 @@ void exec_graph_impl::update( // Rebuild cached requirements and accessor storage for this graph with // updated nodes MRequirements.clear(); - MAccessors.clear(); - for (auto &Node : MNodeStorage) { - if (!Node->MCommandGroup) - continue; - MRequirements.insert(MRequirements.end(), - Node->MCommandGroup->getRequirements().begin(), - Node->MCommandGroup->getRequirements().end()); - MAccessors.insert(MAccessors.end(), - Node->MCommandGroup->getAccStorage().begin(), - Node->MCommandGroup->getAccStorage().end()); + for (auto &Partition : MPartitions) { + Partition->MRequirements.clear(); + Partition->MAccessors.clear(); } + buildRequirements(); } bool exec_graph_impl::needsScheduledUpdate( @@ -1474,7 +1646,7 @@ bool exec_graph_impl::needsScheduledUpdate( // If we have previous execution events do the update through the scheduler to // ensure it is ordered correctly. - NeedScheduledUpdate |= MExecutionEvents.size() > 0; + NeedScheduledUpdate |= MSchedulerDependencies.size() > 0; return NeedScheduledUpdate; } @@ -1973,10 +2145,11 @@ void executable_command_graph::finalizeImpl() { auto Device = impl->getGraphImpl()->getDevice(); for (auto Partition : impl->getPartitions()) { - if (!Partition->isHostTask()) { + if (!Partition->MIsHostTask) { impl->createCommandBuffers(Device, Partition); } } + impl->buildRequirements(); } void executable_command_graph::update( diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index b803daa97c6b0..27a619d459d86 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -771,16 +771,28 @@ class partition { std::unordered_map MCommandBuffers; /// List of predecessors to this partition. - std::vector> MPredecessors; + std::vector> MPredecessors; + + /// List of successors to this partition. + std::vector> MSuccessors; + + /// List of requirements for this partition. + std::vector MRequirements; + + /// Storage for accessors which are used by this partition. + std::vector MAccessors; + /// True if the graph of this partition is a single path graph /// and in-order optmization can be applied on it. bool MIsInOrderGraph = false; - /// @return True if the partition contains a host task - bool isHostTask() const { - return (MRoots.size() && ((*MRoots.begin()).lock()->MCGType == - sycl::detail::CGType::CodeplayHostTask)); - } + /// True if this partition contains only one node which is a host_task. + bool MIsHostTask = false; + + // Submission event for the partition. Used during enqueue to define + // dependencies between this partition and its successors. This event is + // replaced every time the partition is executed. + EventImplPtr MEvent; /// Checks if the graph is single path, i.e. each node has a single successor. /// @return True if the graph is a single path @@ -1345,9 +1357,17 @@ class exec_graph_impl { /// execution. /// @param Queue Command-queue to schedule execution on. /// @param CGData Command-group data provided by the sycl::handler - /// @return Event associated with the execution of the graph. - sycl::event enqueue(const std::shared_ptr &Queue, - sycl::detail::CG::StorageInitHelper CGData); + /// @param EventNeeded Whether an event signalling the completion of this + /// operation needs to be returned. + /// @return Returns an event if EventNeeded is true. Returns nullptr + /// otherwise. + EventImplPtr enqueue(const std::shared_ptr &Queue, + sycl::detail::CG::StorageInitHelper CGData, + bool EventNeeded); + + /// Iterates through all the nodes in the graph to build the list of + /// accessor requirements for the whole graph and for each partition. + void buildRequirements(); /// Turns the internal graph representation into UR command-buffers for a /// device. @@ -1381,13 +1401,17 @@ class exec_graph_impl { return MPartitions; } + /// Query whether the graph contains any host-task nodes. + /// @return True if the graph contains any host-task nodes. False otherwise. + bool containsHostTask() const { return MContainsHostTask; } + /// Checks if the previous submissions of this graph have been completed /// This function checks the status of events associated to the previous graph /// submissions. /// @return true if all previous submissions have been completed, false /// otherwise. bool previousSubmissionCompleted() const { - for (auto Event : MExecutionEvents) { + for (auto Event : MSchedulerDependencies) { if (!Event->isCompleted()) { return false; } @@ -1461,6 +1485,68 @@ class exec_graph_impl { ur_exp_command_buffer_handle_t CommandBuffer, std::shared_ptr Node); + /// Enqueues a host-task partition (i.e. a partition that contains only a + /// single node and that node is a host-task). + /// @param Partition The partition to enqueue. + /// @param Queue Command-queue to schedule execution on. + /// @param CGData Command-group data used for initializing the host-task + /// command-group. + /// @param EventNeeded Whether an event signalling the completion of this + /// operation needs to be returned. + /// @return If EventNeeded is true returns the event resulting from enqueueing + /// the host-task through the scheduler. Returns nullptr otherwise. + EventImplPtr enqueueHostTaskPartition( + std::shared_ptr &Partition, + const std::shared_ptr &Queue, + sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded); + + /// Enqueues a graph partition that contains no host-tasks using the + /// scheduler. + /// @param Partition The partition to enqueue. + /// @param Queue Command-queue to schedule execution on. + /// @param CGData Command-group data used for initializing the command-buffer + /// command-group. + /// @param EventNeeded Whether an event signalling the completion of this + /// operation needs to be returned. + /// @return If EventNeeded is true returns the event resulting from enqueueing + /// the command-buffer through the scheduler. Returns nullptr otherwise. + EventImplPtr enqueuePartitionWithScheduler( + std::shared_ptr &Partition, + const std::shared_ptr &Queue, + sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded); + + /// Enqueues a graph partition that contains no host-tasks by directly calling + /// the unified-runtime API (i.e. avoids scheduler overhead). + /// @param Partition The partition to enqueue. + /// @param Queue Command-queue to schedule execution on. + /// @param WaitEvents List of events to wait on. All the events on this list + /// must be safe for scheduler bypass. Only events containing a valid UR event + /// handle will be waited for. + /// @param EventNeeded Whether an event signalling the completion of this + /// operation needs to be returned. + /// @return If EventNeeded is true returns the event resulting from enqueueing + /// the command-buffer. Returns nullptr otherwise. + EventImplPtr enqueuePartitionDirectly( + std::shared_ptr &Partition, + const std::shared_ptr &Queue, + std::vector &WaitEvents, bool EventNeeded); + + /// Enqueues all the partitions in a graph. + /// @param Queue Command-queue to schedule execution on. + /// @param CGData Command-group data that contains the dependencies and + /// accessor requirements needed to enqueue this graph. + /// @param IsCGDataSafeForSchedulerBypass Whether CGData contains any events + /// that require enqueuing through the scheduler (e.g. requirements or + /// host-task events). + /// @param EventNeeded Whether an event signalling the completion of this + /// operation needs to be returned. + /// @return If EventNeeded is true returns the event resulting from enqueueing + /// the command-buffer. Returns nullptr otherwise. + EventImplPtr + enqueuePartitions(const std::shared_ptr &Queue, + sycl::detail::CG::StorageInitHelper &CGData, + bool IsCGDataSafeForSchedulerBypass, bool EventNeeded); + /// Iterates back through predecessors to find the real dependency. /// @param[out] Deps Found dependencies. /// @param[in] CurrentNode Node to find dependencies for. @@ -1555,11 +1641,9 @@ class exec_graph_impl { /// List of requirements for enqueueing this command graph, accumulated from /// all nodes enqueued to the graph. std::vector MRequirements; - /// Storage for accessors which are used by this graph, accumulated from - /// all nodes enqueued to the graph. - std::vector MAccessors; - /// List of all execution events returned from command buffer enqueue calls. - std::vector MExecutionEvents; + /// List of dependencies that enqueue or update commands need to wait on + /// when using the scheduler path. + std::vector MSchedulerDependencies; /// List of the partitions that compose the exec graph. std::vector> MPartitions; /// Storage for copies of nodes from the original modifiable graph. @@ -1568,6 +1652,8 @@ class exec_graph_impl { std::unordered_map, ur_exp_command_buffer_command_handle_t> MCommandMap; + /// List of partition without any predecessors in this exec graph. + std::vector> MRootPartitions; /// True if this graph can be updated (set with property::updatable) bool MIsUpdatable; /// If true, the graph profiling is enabled. diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 8c6ce446a9206..f7e42c7fa0520 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -342,15 +342,12 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, HandlerImpl->MEventMode = SubmitInfo.EventMode(); - auto isHostTask = Type == CGType::CodeplayHostTask; - - // TODO: this shouldn't be needed but without this - // the legacy adapter doesn't synchronize the operations properly - // when non-immediate command lists are used. - auto isGraphSubmission = Type == CGType::ExecCommandBuffer; + auto isHostTask = Type == CGType::CodeplayHostTask || + (Type == CGType::ExecCommandBuffer && + HandlerImpl->MExecGraph->containsHostTask()); auto requiresPostProcess = SubmitInfo.PostProcessorFunc() || Streams.size(); - auto noLastEventPath = !isHostTask && !isGraphSubmission && + auto noLastEventPath = !isHostTask && MNoLastEventMode.load(std::memory_order_acquire) && !requiresPostProcess; @@ -369,7 +366,7 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, } else { std::unique_lock Lock(MMutex); - if (!isGraphSubmission && trySwitchingToNoEventsMode()) { + if (trySwitchingToNoEventsMode()) { EventImpl = finalizeHandlerInOrderNoEventsUnlocked(Handler); } else { EventImpl = finalizeHandlerInOrderWithDepsUnlocked(Handler); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 99490ba2851c4..408db638d20e6 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -747,7 +747,9 @@ class queue_impl : public std::enable_shared_from_this { detail::EventImplPtr finalizeHandlerInOrderHostTaskUnlocked(HandlerType &Handler) { assert(isInOrder()); - assert(Handler.getType() == CGType::CodeplayHostTask); + assert(Handler.getType() == CGType::CodeplayHostTask || + (Handler.getType() == CGType::ExecCommandBuffer && + getSyclObjImpl(Handler)->MExecGraph->containsHostTask())); auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr : MExtGraphDeps.LastEventPtr; @@ -781,13 +783,8 @@ class queue_impl : public std::enable_shared_from_this { finalizeHandlerInOrderWithDepsUnlocked(HandlerType &Handler) { // this is handled by finalizeHandlerInOrderHostTask assert(Handler.getType() != CGType::CodeplayHostTask); - - if (Handler.getType() == CGType::ExecCommandBuffer && MNoLastEventMode) { - // TODO: this shouldn't be needed but without this - // the legacy adapter doesn't synchronize the operations properly - // when non-immediate command lists are used. - Handler.depends_on(insertHelperBarrier(Handler)); - } + assert(!(Handler.getType() == CGType::ExecCommandBuffer && + getSyclObjImpl(Handler)->MExecGraph->containsHostTask())); auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr : MExtGraphDeps.LastEventPtr; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 59cbcb5384b8e..52a3d6f3f7e0d 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -775,15 +775,19 @@ event handler::finalize() { nullptr, impl->MExecGraph, std::move(impl->CGData))); } else { - event GraphCompletionEvent = - impl->MExecGraph->enqueue(MQueue, std::move(impl->CGData)); - + bool DiscardEvent = !impl->MEventNeeded && + MQueue->supportsDiscardingPiEvents() && + !impl->MExecGraph->containsHostTask(); + detail::EventImplPtr GraphCompletionEvent = impl->MExecGraph->enqueue( + MQueue, std::move(impl->CGData), !DiscardEvent); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES - MLastEvent = getSyclObjImpl(GraphCompletionEvent); + return GraphCompletionEvent; #else - MLastEvent = GraphCompletionEvent; + return sycl::detail::createSyclObjFromImpl( + GraphCompletionEvent + ? GraphCompletionEvent + : std::make_shared(MQueue)); #endif - return MLastEvent; } } break; case detail::CGType::CopyImage: diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp new file mode 100644 index 0000000000000..74f4b9ff1c9a3 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp @@ -0,0 +1,72 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests the enqueue free function kernel shortcuts. + +#include "../graph_common.hpp" +#include +#include + +int main() { + queue InOrderQueue{property::queue::in_order{}}; + queue OtherQueue{property::queue::in_order{}}; + + using T = int; + + T *PtrA = malloc_device(Size, InOrderQueue); + T *PtrB = malloc_device(Size, InOrderQueue); + T *PtrC = malloc_device(Size, InOrderQueue); + + exp_ext::command_graph Graph{InOrderQueue}; + Graph.begin_recording(InOrderQueue); + + T Pattern = 42; + exp_ext::fill(InOrderQueue, PtrA, Pattern, Size); + + exp_ext::single_task(InOrderQueue, [=]() { + for (size_t i = 0; i < Size; ++i) { + PtrB[i] = i; + } + }); + + exp_ext::parallel_for( + InOrderQueue, sycl::range<1>{Size}, + [=](sycl::item<1> Item) { PtrC[Item] += PtrA[Item] * PtrB[Item]; }); + + std::vector Output(Size); + exp_ext::copy(InOrderQueue, PtrC, Output.data(), Size); + + Graph.end_recording(); + + auto GraphExec = Graph.finalize(); + + sycl::event Event = + exp_ext::submit_with_event(OtherQueue, [&](sycl::handler &CGH) { + exp_ext::single_task(CGH, [=]() { + for (size_t I = 0; I < Size; ++I) + PtrC[I] = 42; + }); + }); + + exp_ext::submit(InOrderQueue, [&](sycl::handler &CGH) { + CGH.depends_on(Event); + exp_ext::execute_graph(CGH, GraphExec); + }); + + InOrderQueue.wait_and_throw(); + + free(PtrA, InOrderQueue); + free(PtrB, InOrderQueue); + free(PtrC, InOrderQueue); + + for (size_t i = 0; i < Size; i++) { + T Ref = Pattern * i + 42; + assert(check_value(i, Ref, Output[i], "Output")); + } + + return 0; +} diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 020afb90564ff..694ea5640c827 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -714,6 +714,25 @@ ur_result_t appendExecutionWaits(ur_exp_command_buffer_handle_t CommandBuffer) { return UR_RESULT_SUCCESS; } +/** + * Waits for any ongoing executions of the command-buffer to finish. + * @param CommandBuffer The command-buffer to wait for. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { + + if (ur_event_handle_t &CurrentSubmissionEvent = + CommandBuffer->CurrentSubmissionEvent) { + ZE2UR_CALL(zeEventHostSynchronize, + (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); + UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); + CurrentSubmissionEvent = nullptr; + } + + return UR_RESULT_SUCCESS; +} + ur_result_t urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, const ur_exp_command_buffer_desc_t *CommandBufferDesc, @@ -832,7 +851,9 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { if (!CommandBuffer->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; + waitForOngoingExecution(CommandBuffer); CommandBuffer->cleanupCommandBufferResources(); + delete CommandBuffer; return UR_RESULT_SUCCESS; } @@ -1453,25 +1474,6 @@ ur_result_t getZeCommandQueue(ur_queue_handle_t Queue, bool UseCopyEngine, return UR_RESULT_SUCCESS; } -/** - * Waits for any ongoing executions of the command-buffer to finish. - * @param CommandBuffer The command-buffer to wait for. - * @return UR_RESULT_SUCCESS or an error code on failure - */ -ur_result_t -waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { - - if (ur_event_handle_t &CurrentSubmissionEvent = - CommandBuffer->CurrentSubmissionEvent) { - ZE2UR_CALL(zeEventHostSynchronize, - (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); - UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); - CurrentSubmissionEvent = nullptr; - } - - return UR_RESULT_SUCCESS; -} - /** * Waits for the all the dependencies of the command-buffer * @param[in] CommandBuffer The command-buffer. From 0241111786ca07636eaa64f28191398a78ebfd56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Wed, 4 Jun 2025 12:22:13 +0100 Subject: [PATCH 02/20] Fix Unit test failure --- .../unittests/Extensions/CommandGraph/CommandGraph.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp index dfb100acec848..fe4731dc31535 100644 --- a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp @@ -417,11 +417,11 @@ TEST_F(CommandGraphTest, GraphPartitionsMerging) { auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec); auto PartitionsList = GraphExecImpl->getPartitions(); ASSERT_EQ(PartitionsList.size(), 5ul); - ASSERT_FALSE(PartitionsList[0]->isHostTask()); - ASSERT_TRUE(PartitionsList[1]->isHostTask()); - ASSERT_FALSE(PartitionsList[2]->isHostTask()); - ASSERT_TRUE(PartitionsList[3]->isHostTask()); - ASSERT_FALSE(PartitionsList[4]->isHostTask()); + ASSERT_FALSE(PartitionsList[0]->MIsHostTask); + ASSERT_TRUE(PartitionsList[1]->MIsHostTask); + ASSERT_FALSE(PartitionsList[2]->MIsHostTask); + ASSERT_TRUE(PartitionsList[3]->MIsHostTask); + ASSERT_FALSE(PartitionsList[4]->MIsHostTask); } TEST_F(CommandGraphTest, GetNodeFromEvent) { From d3445ef94b399a333a50582d0aebbd8cd720916b Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Thu, 5 Jun 2025 14:21:50 +0100 Subject: [PATCH 03/20] Fix command-buffer dependencies on the legacy adapter when immediate command lists are disabled --- .../adapters/level_zero/command_buffer.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 694ea5640c827..3261f04ee4d0c 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -1489,6 +1489,25 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer, std::scoped_lock Guard(CommandBuffer->Mutex); const bool UseCopyEngine = false; bool MustSignalWaitEvent = true; + + // Level-zero does not allow in-order queue when immediate command-lists are + // not used. For that reason, if the UR queue is in-order, we need to emulate, + // its in-order properties by adding an event dependency on the last command + // executed by the queue. + std::vector WaitList; + if (Queue->isInOrderQueue() && Queue->LastCommandEvent) { + WaitList.reserve(NumEventsInWaitList + 1); + + if (NumEventsInWaitList) { + WaitList.insert(WaitList.end(), EventWaitList, + EventWaitList + NumEventsInWaitList); + } + WaitList.push_back(Queue->LastCommandEvent); + + ++NumEventsInWaitList; + EventWaitList = WaitList.data(); + } + if (NumEventsInWaitList) { ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( From dd8f6d15a5d90b7e61a52ac7936324e0d3ef715b Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Fri, 6 Jun 2025 12:36:29 +0100 Subject: [PATCH 04/20] Fix data race in multiple_exec_graphs test --- sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp index 07bf4c0e4c2eb..b22dae2f12f18 100644 --- a/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp @@ -2,9 +2,10 @@ // graph. #include "../graph_common.hpp" +#include int main() { - queue Queue{}; + queue Queue{{sycl::property::queue::in_order{}}}; using T = int; From 240c952e66c2029ee1e327b4f4c29a297e22b0cf Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Fri, 6 Jun 2025 13:30:54 +0100 Subject: [PATCH 05/20] Let L0 event implementation handler dependencies for in-order queue --- .../adapters/level_zero/command_buffer.cpp | 80 +++++++++---------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 3261f04ee4d0c..f280cfab8197b 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -1494,48 +1494,46 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer, // not used. For that reason, if the UR queue is in-order, we need to emulate, // its in-order properties by adding an event dependency on the last command // executed by the queue. - std::vector WaitList; - if (Queue->isInOrderQueue() && Queue->LastCommandEvent) { - WaitList.reserve(NumEventsInWaitList + 1); - - if (NumEventsInWaitList) { - WaitList.insert(WaitList.end(), EventWaitList, - EventWaitList + NumEventsInWaitList); - } - WaitList.push_back(Queue->LastCommandEvent); - - ++NumEventsInWaitList; - EventWaitList = WaitList.data(); - } + // std::vector WaitList; + // if (Queue->isInOrderQueue() && Queue->LastCommandEvent) { + // WaitList.reserve(NumEventsInWaitList + 1); + // + // if (NumEventsInWaitList) { + // WaitList.insert(WaitList.end(), EventWaitList, + // EventWaitList + NumEventsInWaitList); + // } + // WaitList.push_back(Queue->LastCommandEvent); + // + // ++NumEventsInWaitList; + // EventWaitList = WaitList.data(); + // } + + ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Update the WaitList of the Wait Event + // Events are appended to the WaitList if the WaitList is not empty + if (CommandBuffer->WaitEvent->WaitList.isEmpty()) + CommandBuffer->WaitEvent->WaitList = TmpWaitList; + else + CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList); + + if (!CommandBuffer->WaitEvent->WaitList.isEmpty()) { + // Create command-list to execute before `CommandListPtr` and will signal + // when `EventWaitList` dependencies are complete. + ur_command_list_ptr_t WaitCommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, WaitCommandList, false /*UseCopyEngine*/, NumEventsInWaitList, + EventWaitList, false /*AllowBatching*/, nullptr /*ForcedCmdQueue*/)); - if (NumEventsInWaitList) { - ur_ze_event_list_t TmpWaitList; - UR_CALL(TmpWaitList.createAndRetainUrZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); - - // Update the WaitList of the Wait Event - // Events are appended to the WaitList if the WaitList is not empty - if (CommandBuffer->WaitEvent->WaitList.isEmpty()) - CommandBuffer->WaitEvent->WaitList = TmpWaitList; - else - CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList); - - if (!CommandBuffer->WaitEvent->WaitList.isEmpty()) { - // Create command-list to execute before `CommandListPtr` and will signal - // when `EventWaitList` dependencies are complete. - ur_command_list_ptr_t WaitCommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList( - Queue, WaitCommandList, false /*UseCopyEngine*/, NumEventsInWaitList, - EventWaitList, false /*AllowBatching*/, nullptr /*ForcedCmdQueue*/)); - - ZE2UR_CALL(zeCommandListAppendBarrier, - (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent, - CommandBuffer->WaitEvent->WaitList.Length, - CommandBuffer->WaitEvent->WaitList.ZeEventList)); - Queue->executeCommandList(WaitCommandList, false /*IsBlocking*/, - false /*OKToBatchCommand*/); - MustSignalWaitEvent = false; - } + ZE2UR_CALL(zeCommandListAppendBarrier, + (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent, + CommandBuffer->WaitEvent->WaitList.Length, + CommandBuffer->WaitEvent->WaitList.ZeEventList)); + Queue->executeCommandList(WaitCommandList, false /*IsBlocking*/, + false /*OKToBatchCommand*/); + MustSignalWaitEvent = false; } // Given WaitEvent was created without specifying Counting Events, then this // event can be signalled on the host. From d70fc37fd9f4b1eccd34fce10bd6352e5e98db6c Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Fri, 6 Jun 2025 13:53:28 +0100 Subject: [PATCH 06/20] Wait for command-buffer execution before destroying --- .../source/adapters/level_zero/v2/command_buffer.cpp | 2 ++ unified-runtime/source/adapters/opencl/command_buffer.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index 4281f5e280326..3a62f224eb7c9 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -162,6 +162,8 @@ ur_result_t ur_exp_command_buffer_handle_t_::registerExecutionEventUnlocked( ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { if (currentExecution) { + ZE_CALL_NOCHECK(zeEventHostSynchronize, + (currentExecution->getZeEvent(), UINT64_MAX)); currentExecution->release(); } for (auto &event : syncPoints) { diff --git a/unified-runtime/source/adapters/opencl/command_buffer.cpp b/unified-runtime/source/adapters/opencl/command_buffer.cpp index e048b2d22175c..affc5b5882870 100644 --- a/unified-runtime/source/adapters/opencl/command_buffer.cpp +++ b/unified-runtime/source/adapters/opencl/command_buffer.cpp @@ -115,6 +115,10 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->LastSubmission) { + cl_int RetErr = clWaitForEvents(1, &(hCommandBuffer->LastSubmission)); + CL_RETURN_ON_FAILURE(RetErr); + } delete hCommandBuffer; } From 109bc200f109a1659a7b11c267e7f9e1713b829d Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Fri, 6 Jun 2025 15:49:03 +0100 Subject: [PATCH 07/20] Don't rely on default context being the same for ext_oneapi_enqueue_functions_with_dependencies test --- .../ext_oneapi_enqueue_functions_with_dependencies.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp index 74f4b9ff1c9a3..6ef2dd0123f14 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp @@ -12,8 +12,11 @@ #include int main() { - queue InOrderQueue{property::queue::in_order{}}; - queue OtherQueue{property::queue::in_order{}}; + device Device{}; + context Context{Device}; + + queue InOrderQueue{Context, Device, property::queue::in_order{}}; + queue OtherQueue{Context, Device, property::queue::in_order{}}; using T = int; From 91c84a794376aca8f1bb703e84251f9ec1e16288 Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Fri, 6 Jun 2025 16:55:36 +0100 Subject: [PATCH 08/20] Remove commented code --- sycl/test-e2e/Graph/graph_common.hpp | 2 +- .../adapters/level_zero/command_buffer.cpp | 18 ------------------ 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/sycl/test-e2e/Graph/graph_common.hpp b/sycl/test-e2e/Graph/graph_common.hpp index c99782732ec21..0526160660600 100644 --- a/sycl/test-e2e/Graph/graph_common.hpp +++ b/sycl/test-e2e/Graph/graph_common.hpp @@ -17,7 +17,7 @@ // Test constants. constexpr size_t Size = 1024; // Number of data elements in a buffer. -constexpr unsigned Iterations = 5; // Iterations of graph to execute. +constexpr unsigned Iterations = 10000; // Iterations of graph to execute. constexpr size_t Offset = 100; // Number of offset elements for Buffer accessors // Namespace alias to use in test code. diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index f280cfab8197b..c9e03d80a7567 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -1490,24 +1490,6 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer, const bool UseCopyEngine = false; bool MustSignalWaitEvent = true; - // Level-zero does not allow in-order queue when immediate command-lists are - // not used. For that reason, if the UR queue is in-order, we need to emulate, - // its in-order properties by adding an event dependency on the last command - // executed by the queue. - // std::vector WaitList; - // if (Queue->isInOrderQueue() && Queue->LastCommandEvent) { - // WaitList.reserve(NumEventsInWaitList + 1); - // - // if (NumEventsInWaitList) { - // WaitList.insert(WaitList.end(), EventWaitList, - // EventWaitList + NumEventsInWaitList); - // } - // WaitList.push_back(Queue->LastCommandEvent); - // - // ++NumEventsInWaitList; - // EventWaitList = WaitList.data(); - // } - ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); From da3720aef34e0f6dad904f3804330e76af3175d2 Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Fri, 6 Jun 2025 16:56:29 +0100 Subject: [PATCH 09/20] Revert changes to graph_common --- sycl/test-e2e/Graph/graph_common.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Graph/graph_common.hpp b/sycl/test-e2e/Graph/graph_common.hpp index 0526160660600..c99782732ec21 100644 --- a/sycl/test-e2e/Graph/graph_common.hpp +++ b/sycl/test-e2e/Graph/graph_common.hpp @@ -17,7 +17,7 @@ // Test constants. constexpr size_t Size = 1024; // Number of data elements in a buffer. -constexpr unsigned Iterations = 10000; // Iterations of graph to execute. +constexpr unsigned Iterations = 5; // Iterations of graph to execute. constexpr size_t Offset = 100; // Number of offset elements for Buffer accessors // Namespace alias to use in test code. From 4894acaa37660415bf8c7b49512f16b402907b0a Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Fri, 6 Jun 2025 18:27:13 +0100 Subject: [PATCH 10/20] Try to remove extra sync in V2 adapter --- .../source/adapters/level_zero/v2/command_buffer.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index 3a62f224eb7c9..4281f5e280326 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -162,8 +162,6 @@ ur_result_t ur_exp_command_buffer_handle_t_::registerExecutionEventUnlocked( ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { if (currentExecution) { - ZE_CALL_NOCHECK(zeEventHostSynchronize, - (currentExecution->getZeEvent(), UINT64_MAX)); currentExecution->release(); } for (auto &event : syncPoints) { From 76e58f8b8e4e026346b3c661469776665927765b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Tue, 10 Jun 2025 16:43:18 +0100 Subject: [PATCH 11/20] Add unit-tests for eventless path --- .../Extensions/CommandGraph/InOrderQueue.cpp | 156 ++++++++++++++++++ 1 file changed, 156 insertions(+) diff --git a/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp b/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp index c50b8c1e99371..e317c85b19087 100644 --- a/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp +++ b/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp @@ -529,3 +529,159 @@ TEST_F(CommandGraphTest, InOrderQueueMemcpyAndGraph) { auto EventGraph = InOrderQueue.submit( [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); }); } + +// Validate that enqueuing a graph with +// sycl::ext::oneapi::experimental::execute_graph using an in-order queue, +// does not request a signaling event from the UR backend and has no event +// dependencies. +TEST_F(CommandGraphTest, InOrderQueueEventless) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Dev, Properties}; + + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + // Record in-order queue with three nodes + InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Node2Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Node3Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + InOrderGraph.end_recording(InOrderQueue); + + // Finalize main graph and check schedule + auto GraphExec = InOrderGraph.finalize(); + + auto beforeUrEnqueueCommandBufferExp = [](void *pParams) -> ur_result_t { + auto params = + *static_cast(pParams); + EXPECT_TRUE(*params.pnumEventsInWaitList == 0); + EXPECT_TRUE(*params.pphEventWaitList == nullptr); + EXPECT_TRUE(*params.pphEvent == nullptr); + + return UR_RESULT_SUCCESS; + }; + + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urEnqueueCommandBufferExp", + beforeUrEnqueueCommandBufferExp); + + const size_t Iterations = 5; + for (size_t I = 0; I < Iterations; ++I) { + sycl::ext::oneapi::experimental::execute_graph(InOrderQueue, GraphExec); + } +} + +// Validate that if an event is requested when enqueueing a graph with +// sycl::ext::oneapi::experimental::submit_with_event with an in-order queue, +// the implementation requests a signal event but doesn't wait on any events +// dependencies. +TEST_F(CommandGraphTest, InOrderQueueRequestEvent) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Dev, Properties}; + + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + // Record in-order queue with three nodes + InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Node2Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Node3Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + InOrderGraph.end_recording(InOrderQueue); + + // Finalize main graph and check schedule + auto GraphExec = InOrderGraph.finalize(); + + auto beforeUrEnqueueCommandBufferExp = [](void *pParams) -> ur_result_t { + auto params = + *static_cast(pParams); + EXPECT_TRUE(*params.pnumEventsInWaitList == 0); + EXPECT_TRUE(*params.pphEventWaitList == nullptr); + EXPECT_TRUE(*params.pphEvent != nullptr); + + return UR_RESULT_SUCCESS; + }; + + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urEnqueueCommandBufferExp", + beforeUrEnqueueCommandBufferExp); + + const size_t Iterations = 5; + std::vector OutputEvents; + + for (size_t I = 0; I < Iterations; ++I) { + OutputEvents.push_back(sycl::ext::oneapi::experimental::submit_with_event( + InOrderQueue, + [&](sycl::handler &cgh) { cgh.ext_oneapi_graph(GraphExec); })); + } +} + +// Validate that enqueuing a graph using an in-order queue with an event +// dependency does not request a signaling event from the UR backend and has +// 1 event dependency. +TEST_F(CommandGraphTest, InOrderQueueEventlessWithDependency) { + device Dev{}; + context Context{Dev}; + + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Context, Dev, Properties}; + sycl::queue OtherQueue{Context, Dev, Properties}; + + experimental::command_graph + InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()}; + + // Record in-order queue with three nodes + InOrderGraph.begin_recording(InOrderQueue); + auto Node1Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Node2Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Node3Graph = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + InOrderGraph.end_recording(InOrderQueue); + + auto GraphExec = InOrderGraph.finalize(); + + auto beforeUrEnqueueCommandBufferExp = [](void *pParams) -> ur_result_t { + auto params = + *static_cast(pParams); + EXPECT_TRUE(*params.pnumEventsInWaitList == 1); + EXPECT_TRUE(*params.pphEvent == nullptr); + + return UR_RESULT_SUCCESS; + }; + + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urEnqueueCommandBufferExp", + beforeUrEnqueueCommandBufferExp); + + sycl::event Event = sycl::ext::oneapi::experimental::submit_with_event( + OtherQueue, [&](sycl::handler &CGH) { + sycl::ext::oneapi::experimental::single_task>( + CGH, [=]() {}); + }); + + const size_t Iterations = 5; + for (size_t I = 0; I < Iterations; ++I) { + sycl::ext::oneapi::experimental::submit( + InOrderQueue, [&](sycl::handler &CGH) { + CGH.depends_on(Event); + sycl::ext::oneapi::experimental::execute_graph(CGH, GraphExec); + }); + } +} \ No newline at end of file From 1582ab74ad4574841524a75c1a02c6e7922dbe2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Tue, 10 Jun 2025 17:08:46 +0100 Subject: [PATCH 12/20] Address review comments --- sycl/source/detail/graph_impl.cpp | 29 ++++++------------- ...pi_enqueue_functions_with_dependencies.cpp | 5 ++-- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 8ac0d64e75ee5..5e451ce8ef0ad 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -297,7 +297,7 @@ void exec_graph_impl::makePartitions() { } // Add an empty partition if there is no partition, i.e. empty graph - if (MPartitions.size() == 0) { + if (MPartitions.empty()) { MPartitions.push_back(std::make_shared()); MRootPartitions.push_back(MPartitions[0]); } @@ -1105,13 +1105,6 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( const std::shared_ptr &Queue, std::vector &WaitEvents, bool EventNeeded) { - auto CheckURResult = [](ur_result_t UrResult) { - if (UrResult != UR_RESULT_SUCCESS) { - throw sycl::exception( - errc::event, "Failed to enqueue event for command buffer submission"); - } - }; - // Create a list containing all the UR event handles in WaitEvents. WaitEvents // is assumed to be safe for scheduler bypass and any host-task events that it // contains can be ignored. @@ -1130,12 +1123,10 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( UrEnqueueWaitListSize == 0 ? nullptr : UrEventHandles.data(); if (!EventNeeded) { - ur_result_t UrResult = - Queue->getAdapter() - ->call_nocheck( - Queue->getHandleRef(), CommandBuffer, UrEnqueueWaitListSize, - UrEnqueueWaitList, nullptr); - CheckURResult(UrResult); + Queue->getAdapter() + ->call( + Queue->getHandleRef(), CommandBuffer, UrEnqueueWaitListSize, + UrEnqueueWaitList, nullptr); return nullptr; } else { auto NewEvent = std::make_shared(Queue); @@ -1143,12 +1134,10 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( NewEvent->setStateIncomplete(); NewEvent->setSubmissionTime(); ur_event_handle_t UrEvent = nullptr; - ur_result_t UrResult = - Queue->getAdapter() - ->call_nocheck( - Queue->getHandleRef(), CommandBuffer, UrEventHandles.size(), - UrEnqueueWaitList, &UrEvent); - CheckURResult(UrResult); + Queue->getAdapter() + ->call( + Queue->getHandleRef(), CommandBuffer, UrEventHandles.size(), + UrEnqueueWaitList, &UrEvent); NewEvent->setHandle(UrEvent); NewEvent->setEventFromSubmittedExecCommandBuffer(true); return NewEvent; diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp index 6ef2dd0123f14..b9751546e2923 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp @@ -47,11 +47,12 @@ int main() { auto GraphExec = Graph.finalize(); + const size_t MemsetValue = 12; sycl::event Event = exp_ext::submit_with_event(OtherQueue, [&](sycl::handler &CGH) { exp_ext::single_task(CGH, [=]() { for (size_t I = 0; I < Size; ++I) - PtrC[I] = 42; + PtrC[I] = MemsetValue; }); }); @@ -67,7 +68,7 @@ int main() { free(PtrC, InOrderQueue); for (size_t i = 0; i < Size; i++) { - T Ref = Pattern * i + 42; + T Ref = Pattern * i + MemsetValue; assert(check_value(i, Ref, Output[i], "Output")); } From 597dd413807cd3954b06a670c1693c7b568c78a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Tue, 10 Jun 2025 17:52:06 +0100 Subject: [PATCH 13/20] Update new functions to not use shared_ptr argument for queue --- sycl/source/detail/graph_impl.cpp | 49 +++++++++++++++---------------- sycl/source/detail/graph_impl.hpp | 19 +++++------- sycl/source/handler.cpp | 10 +++---- 3 files changed, 37 insertions(+), 41 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index f3512caf13a1c..e23ef4812d820 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1056,8 +1056,7 @@ static void cleanupExecutionEvents(std::vector &ExecutionEvents) { } EventImplPtr exec_graph_impl::enqueueHostTaskPartition( - std::shared_ptr &Partition, - const std::shared_ptr &Queue, + std::shared_ptr &Partition, sycl::detail::queue_impl &Queue, sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) { auto NodeImpl = Partition->MSchedule.front(); @@ -1081,12 +1080,12 @@ EventImplPtr exec_graph_impl::enqueueHostTaskPartition( // dependencies for the current execution. std::unique_ptr CommandGroup = std::make_unique(sycl::detail::CGHostTask( - NodeCommandGroup->MHostTask, Queue, NodeCommandGroup->MContext, - NodeCommandGroup->MArgs, std::move(CGData), - NodeCommandGroup->getType())); + NodeCommandGroup->MHostTask, Queue.shared_from_this(), + NodeCommandGroup->MContext, NodeCommandGroup->MArgs, + std::move(CGData), NodeCommandGroup->getType())); EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), Queue, EventNeeded); + std::move(CommandGroup), Queue.shared_from_this(), EventNeeded); if (EventNeeded) { return SchedulerEvent; @@ -1095,8 +1094,7 @@ EventImplPtr exec_graph_impl::enqueueHostTaskPartition( } EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler( - std::shared_ptr &Partition, - const std::shared_ptr &Queue, + std::shared_ptr &Partition, sycl::detail::queue_impl &Queue, sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) { if (!Partition->MRequirements.empty()) { @@ -1108,14 +1106,14 @@ EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler( Partition->MAccessors.end()); } - auto CommandBuffer = Partition->MCommandBuffers[Queue->get_device()]; + auto CommandBuffer = Partition->MCommandBuffers[Queue.get_device()]; std::unique_ptr CommandGroup = std::make_unique( CommandBuffer, nullptr, std::move(CGData)); EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), Queue, EventNeeded); + std::move(CommandGroup), Queue.shared_from_this(), EventNeeded); if (EventNeeded) { SchedulerEvent->setEventFromSubmittedExecCommandBuffer(true); @@ -1126,8 +1124,7 @@ EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler( } EventImplPtr exec_graph_impl::enqueuePartitionDirectly( - std::shared_ptr &Partition, - const std::shared_ptr &Queue, + std::shared_ptr &Partition, sycl::detail::queue_impl &Queue, std::vector &WaitEvents, bool EventNeeded) { // Create a list containing all the UR event handles in WaitEvents. WaitEvents @@ -1142,26 +1139,27 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( } } - auto CommandBuffer = Partition->MCommandBuffers[Queue->get_device()]; + auto CommandBuffer = Partition->MCommandBuffers[Queue.get_device()]; const size_t UrEnqueueWaitListSize = UrEventHandles.size(); const ur_event_handle_t *UrEnqueueWaitList = UrEnqueueWaitListSize == 0 ? nullptr : UrEventHandles.data(); if (!EventNeeded) { - Queue->getAdapter() + Queue.getAdapter() ->call( - Queue->getHandleRef(), CommandBuffer, UrEnqueueWaitListSize, + Queue.getHandleRef(), CommandBuffer, UrEnqueueWaitListSize, UrEnqueueWaitList, nullptr); return nullptr; } else { - auto NewEvent = std::make_shared(Queue); - NewEvent->setContextImpl(Queue->getContextImplPtr()); + auto NewEvent = + std::make_shared(Queue.shared_from_this()); + NewEvent->setContextImpl(Queue.getContextImplPtr()); NewEvent->setStateIncomplete(); NewEvent->setSubmissionTime(); ur_event_handle_t UrEvent = nullptr; - Queue->getAdapter() + Queue.getAdapter() ->call( - Queue->getHandleRef(), CommandBuffer, UrEventHandles.size(), + Queue.getHandleRef(), CommandBuffer, UrEventHandles.size(), UrEnqueueWaitList, &UrEvent); NewEvent->setHandle(UrEvent); NewEvent->setEventFromSubmittedExecCommandBuffer(true); @@ -1169,10 +1167,11 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( } } -EventImplPtr exec_graph_impl::enqueuePartitions( - const std::shared_ptr &Queue, - sycl::detail::CG::StorageInitHelper &CGData, - bool IsCGDataSafeForSchedulerBypass, bool EventNeeded) { +EventImplPtr +exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue, + sycl::detail::CG::StorageInitHelper &CGData, + bool IsCGDataSafeForSchedulerBypass, + bool EventNeeded) { // If EventNeeded is true, this vector is used to keep track of dependencies // for the returned event. This is used when the graph has multiple end nodes @@ -1280,7 +1279,7 @@ EventImplPtr exec_graph_impl::enqueuePartitions( } EventImplPtr -exec_graph_impl::enqueue(const std::shared_ptr &Queue, +exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue, sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) { WriteLock Lock(MMutex); @@ -1291,7 +1290,7 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, bool IsCGDataSafeForSchedulerBypass = detail::Scheduler::areEventsSafeForSchedulerBypass( - CGData.MEvents, Queue->getContextImplPtr()) && + CGData.MEvents, Queue.getContextImplPtr()) && CGData.MRequirements.empty(); // This variable represents the returned event. It will always be nullptr if diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 96ce12ae4dd58..87122bbc8417c 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -1346,7 +1346,7 @@ class exec_graph_impl { /// operation needs to be returned. /// @return Returns an event if EventNeeded is true. Returns nullptr /// otherwise. - EventImplPtr enqueue(const std::shared_ptr &Queue, + EventImplPtr enqueue(sycl::detail::queue_impl &Queue, sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded); @@ -1482,8 +1482,7 @@ class exec_graph_impl { /// @return If EventNeeded is true returns the event resulting from enqueueing /// the host-task through the scheduler. Returns nullptr otherwise. EventImplPtr enqueueHostTaskPartition( - std::shared_ptr &Partition, - const std::shared_ptr &Queue, + std::shared_ptr &Partition, sycl::detail::queue_impl &Queue, sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded); /// Enqueues a graph partition that contains no host-tasks using the @@ -1497,8 +1496,7 @@ class exec_graph_impl { /// @return If EventNeeded is true returns the event resulting from enqueueing /// the command-buffer through the scheduler. Returns nullptr otherwise. EventImplPtr enqueuePartitionWithScheduler( - std::shared_ptr &Partition, - const std::shared_ptr &Queue, + std::shared_ptr &Partition, sycl::detail::queue_impl &Queue, sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded); /// Enqueues a graph partition that contains no host-tasks by directly calling @@ -1513,8 +1511,7 @@ class exec_graph_impl { /// @return If EventNeeded is true returns the event resulting from enqueueing /// the command-buffer. Returns nullptr otherwise. EventImplPtr enqueuePartitionDirectly( - std::shared_ptr &Partition, - const std::shared_ptr &Queue, + std::shared_ptr &Partition, sycl::detail::queue_impl &Queue, std::vector &WaitEvents, bool EventNeeded); /// Enqueues all the partitions in a graph. @@ -1528,10 +1525,10 @@ class exec_graph_impl { /// operation needs to be returned. /// @return If EventNeeded is true returns the event resulting from enqueueing /// the command-buffer. Returns nullptr otherwise. - EventImplPtr - enqueuePartitions(const std::shared_ptr &Queue, - sycl::detail::CG::StorageInitHelper &CGData, - bool IsCGDataSafeForSchedulerBypass, bool EventNeeded); + EventImplPtr enqueuePartitions(sycl::detail::queue_impl &Queue, + sycl::detail::CG::StorageInitHelper &CGData, + bool IsCGDataSafeForSchedulerBypass, + bool EventNeeded); /// Iterates back through predecessors to find the real dependency. /// @param[out] Deps Found dependencies. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 97a7cfa47ecd6..baa6c44e407f9 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -769,18 +769,18 @@ event handler::finalize() { nullptr, impl->MExecGraph, std::move(impl->CGData))); } else { + detail::queue_impl &Queue = impl->get_queue(); bool DiscardEvent = !impl->MEventNeeded && - MQueue->supportsDiscardingPiEvents() && + Queue.supportsDiscardingPiEvents() && !impl->MExecGraph->containsHostTask(); detail::EventImplPtr GraphCompletionEvent = impl->MExecGraph->enqueue( - MQueue, std::move(impl->CGData), !DiscardEvent); + Queue, std::move(impl->CGData), !DiscardEvent); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES return GraphCompletionEvent; #else return sycl::detail::createSyclObjFromImpl( - GraphCompletionEvent - ? GraphCompletionEvent - : std::make_shared(MQueue)); + GraphCompletionEvent ? GraphCompletionEvent + : std::make_shared()); #endif } } break; From 9343cf93e576d162818a7bb459a092838d9b02f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Wed, 11 Jun 2025 14:26:00 +0100 Subject: [PATCH 14/20] Address review comments --- sycl/source/detail/graph_impl.cpp | 19 +++++++++++-------- sycl/source/detail/graph_impl.hpp | 4 ++-- .../scheduler/InOrderQueueSyncCheck.cpp | 2 ++ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index e23ef4812d820..758921addf9b8 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -315,8 +315,8 @@ void exec_graph_impl::makePartitions() { for (const auto &Dep : RootNode->MPredecessors) { auto NodeDep = Dep.lock(); auto &Predecessor = MPartitions[MPartitionNodes[NodeDep]]; - Partition->MPredecessors.push_back(Predecessor); - Predecessor->MSuccessors.push_back(Partition); + Partition->MPredecessors.push_back(Predecessor.get()); + Predecessor->MSuccessors.push_back(Partition.get()); } } } @@ -1134,7 +1134,7 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( UrEventHandles.reserve(WaitEvents.size()); for (auto &SyclWaitEvent : WaitEvents) { auto URHandle = SyclWaitEvent->getHandle(); - if (URHandle) { + if (auto URHandle = SyclWaitEvent->getHandle()) { UrEventHandles.push_back(URHandle); } } @@ -1177,6 +1177,10 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue, // for the returned event. This is used when the graph has multiple end nodes // which cannot be tracked with a single scheduler event. std::vector PostCompleteDependencies; + // TODO After refactoring the event class to use enable_shared_from_this, the + // events used in PostCompleteDependencies can become raw pointers as long as + // Event->attachEventToComplete() extends the lifetime of the pointer with + // shared_from_this. // This variable represents the returned event. It will always be nullptr if // EventNeeded is false. @@ -1202,13 +1206,10 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue, // partitions. To enforce this ordering, we need to add these dependencies // to CGData. for (auto &Predecessor : Partition->MPredecessors) { - CGData.MEvents.push_back(Predecessor.lock()->MEvent); + CGData.MEvents.push_back(Predecessor->MEvent); } } - bool IsLastPartition = (Partition == MPartitions.back()); - EventImplPtr EnqueueEvent; - // We always need to request an event to use as dependency between // partitions executions and between graph executions because the // scheduler doesn't seem to guarantee the execution order of host-tasks @@ -1216,6 +1217,7 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue, // in-order. constexpr bool RequestEvent = true; + EventImplPtr EnqueueEvent; if (Partition->MIsHostTask) { // The event returned by a host-task is always needed to synchronize with // other partitions or to be used by the sycl queue as a dependency for @@ -1252,8 +1254,9 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue, // parallel. MSchedulerDependencies.push_back(EnqueueEvent); if (EventNeeded) { + const bool IsLastPartition = (Partition == MPartitions.back()); if (IsLastPartition) { - // If we are in the last partition copy the event to SignalEvent, + // If we are in the last partition move the event to SignalEvent, // so that it can be returned to the user. SignalEvent = std::move(EnqueueEvent); } else { diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 87122bbc8417c..e5dd537361194 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -771,10 +771,10 @@ class partition { std::unordered_map MCommandBuffers; /// List of predecessors to this partition. - std::vector> MPredecessors; + std::vector MPredecessors; /// List of successors to this partition. - std::vector> MSuccessors; + std::vector MSuccessors; /// List of requirements for this partition. std::vector MRequirements; diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp index e2372256e5431..9bc1ff61e66cb 100644 --- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp +++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp @@ -61,6 +61,8 @@ class LimitedHandler { handler_impl(std::shared_ptr Queue) : MQueue(Queue) {} std::shared_ptr MQueue; MockQueueImpl &get_queue() { return *MQueue; } + std::shared_ptr + MExecGraph; }; std::shared_ptr impl; std::shared_ptr MKernel; From c4cafcf9058e4240d66d9cc58decf97229f62e00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Wed, 11 Jun 2025 14:51:11 +0100 Subject: [PATCH 15/20] Fix typo --- sycl/source/detail/graph_impl.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 758921addf9b8..e6a396cda17b7 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1133,7 +1133,6 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( std::vector UrEventHandles{}; UrEventHandles.reserve(WaitEvents.size()); for (auto &SyclWaitEvent : WaitEvents) { - auto URHandle = SyclWaitEvent->getHandle(); if (auto URHandle = SyclWaitEvent->getHandle()) { UrEventHandles.push_back(URHandle); } From 97fa20e8eff03e013866b416be4258c629073e85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Thu, 12 Jun 2025 17:09:02 +0100 Subject: [PATCH 16/20] Revert opencl adapter changes --- unified-runtime/source/adapters/opencl/command_buffer.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unified-runtime/source/adapters/opencl/command_buffer.cpp b/unified-runtime/source/adapters/opencl/command_buffer.cpp index affc5b5882870..e048b2d22175c 100644 --- a/unified-runtime/source/adapters/opencl/command_buffer.cpp +++ b/unified-runtime/source/adapters/opencl/command_buffer.cpp @@ -115,10 +115,6 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { - if (hCommandBuffer->LastSubmission) { - cl_int RetErr = clWaitForEvents(1, &(hCommandBuffer->LastSubmission)); - CL_RETURN_ON_FAILURE(RetErr); - } delete hCommandBuffer; } From 71cc56f5db77d277d155d13f4cb9fa8f5211256e Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Mon, 16 Jun 2025 19:21:05 +0100 Subject: [PATCH 17/20] Workaround HIP limitations --- .../source/adapters/hip/command_buffer.cpp | 25 +++++++++++++------ .../source/adapters/hip/command_buffer.hpp | 2 ++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp index af058cdde4cf0..61270bdb4291f 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.cpp +++ b/unified-runtime/source/adapters/hip/command_buffer.cpp @@ -273,6 +273,10 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->CurrentExecution) { + UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); + UR_CHECK_ERROR(urEventRelease(hCommandBuffer->CurrentExecution)); + } delete hCommandBuffer; } return UR_RESULT_SUCCESS; @@ -788,6 +792,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { try { + if (hCommandBuffer->CurrentExecution) { + UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); + UR_CHECK_ERROR(urEventRelease(hCommandBuffer->CurrentExecution)); + } + std::unique_ptr RetImplEvent{nullptr}; ScopedDevice Active(hQueue->getDevice()); uint32_t StreamToken; @@ -798,19 +807,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList)); - if (phEvent) { - RetImplEvent = std::make_unique( - UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, hQueue, HIPStream, - StreamToken); - UR_CHECK_ERROR(RetImplEvent->start()); - } + RetImplEvent = std::make_unique( + UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, hQueue, HIPStream, StreamToken); + UR_CHECK_ERROR(RetImplEvent->start()); // Launch graph UR_CHECK_ERROR(hipGraphLaunch(hCommandBuffer->HIPGraphExec, HIPStream)); + UR_CHECK_ERROR(RetImplEvent->record()); + + hCommandBuffer->CurrentExecution = RetImplEvent.release(); if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); + UR_CHECK_ERROR(urEventRetain(hCommandBuffer->CurrentExecution)); + *phEvent = hCommandBuffer->CurrentExecution; } } catch (ur_result_t Err) { return Err; diff --git a/unified-runtime/source/adapters/hip/command_buffer.hpp b/unified-runtime/source/adapters/hip/command_buffer.hpp index 3d0047adee013..3dd1417fef427 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.hpp +++ b/unified-runtime/source/adapters/hip/command_buffer.hpp @@ -128,6 +128,8 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; + // Track the event of the current graph execution. + ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events std::map SyncPoints; From 03fe4b68e41bfcb637bfa1e28e0f49b171ee32cd Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Tue, 17 Jun 2025 11:19:05 +0100 Subject: [PATCH 18/20] Update comment for new hip variable --- unified-runtime/source/adapters/hip/command_buffer.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/hip/command_buffer.hpp b/unified-runtime/source/adapters/hip/command_buffer.hpp index 3dd1417fef427..728d97719035b 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.hpp +++ b/unified-runtime/source/adapters/hip/command_buffer.hpp @@ -128,7 +128,9 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; - // Track the event of the current graph execution. + // Track the event of the current graph execution. This extra synchronization + // is needed because HIP (unlike CUDA) does not seem to synchronize with other + // executions of the same graph during hipGraphLaunch and hipExecGraphDestroy. ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events From 758cecf77947fcca733e28b415855d2207f8d485 Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Tue, 17 Jun 2025 16:27:03 +0100 Subject: [PATCH 19/20] [HIP] Enqueue event wait instead of waiting on the host --- .../source/adapters/hip/command_buffer.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp index 61270bdb4291f..0514afdf71669 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.cpp +++ b/unified-runtime/source/adapters/hip/command_buffer.cpp @@ -792,11 +792,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { try { - if (hCommandBuffer->CurrentExecution) { - UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); - UR_CHECK_ERROR(urEventRelease(hCommandBuffer->CurrentExecution)); - } - std::unique_ptr RetImplEvent{nullptr}; ScopedDevice Active(hQueue->getDevice()); uint32_t StreamToken; @@ -804,6 +799,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + if (hCommandBuffer->CurrentExecution) { + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1, + &hCommandBuffer->CurrentExecution)); + UR_CHECK_ERROR(urEventRelease(hCommandBuffer->CurrentExecution)); + } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList)); From a034bb85aaddf023f4d5bc9f08d26405da737119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Tue, 17 Jun 2025 17:04:01 +0100 Subject: [PATCH 20/20] Fix build failures after rebase --- sycl/source/detail/graph_impl.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 971d4f0973219..cea3b1e2a9876 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1084,9 +1084,9 @@ EventImplPtr exec_graph_impl::enqueueHostTaskPartition( // dependencies for the current execution. std::unique_ptr CommandGroup = std::make_unique(sycl::detail::CGHostTask( - NodeCommandGroup->MHostTask, Queue.shared_from_this(), - NodeCommandGroup->MContext, NodeCommandGroup->MArgs, - std::move(CGData), NodeCommandGroup->getType())); + NodeCommandGroup->MHostTask, &Queue, NodeCommandGroup->MContext, + NodeCommandGroup->MArgs, std::move(CGData), + NodeCommandGroup->getType())); EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG( std::move(CommandGroup), Queue.shared_from_this(), EventNeeded);