diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
index acb6bc5540af1..bf4beb5ae83d8 100644
--- a/sycl/source/detail/graph_impl.cpp
+++ b/sycl/source/detail/graph_impl.cpp
@@ -281,6 +281,9 @@ void exec_graph_impl::makePartitions() {
         MPartitionNodes[Node] = PartitionFinalNum;
         if (isPartitionRoot(Node)) {
           Partition->MRoots.insert(Node);
+          if (Node->MCGType == CGType::CodeplayHostTask) {
+            Partition->MIsHostTask = true;
+          }
         }
       }
     }
@@ -288,13 +291,15 @@ void exec_graph_impl::makePartitions() {
       Partition->schedule();
       Partition->MIsInOrderGraph = Partition->checkIfGraphIsSinglePath();
       MPartitions.push_back(Partition);
+      MRootPartitions.push_back(Partition);
       PartitionFinalNum++;
     }
   }
 
   // Add an empty partition if there is no partition, i.e. empty graph
-  if (MPartitions.size() == 0) {
+  if (MPartitions.empty()) {
     MPartitions.push_back(std::make_shared<partition>());
+    MRootPartitions.push_back(MPartitions[0]);
   }
 
   // Make global schedule list
@@ -309,8 +314,9 @@ void exec_graph_impl::makePartitions() {
       auto RootNode = Root.lock();
       for (const auto &Dep : RootNode->MPredecessors) {
         auto NodeDep = Dep.lock();
-        Partition->MPredecessors.push_back(
-            MPartitions[MPartitionNodes[NodeDep]]);
+        auto &Predecessor = MPartitions[MPartitionNodes[NodeDep]];
+        Partition->MPredecessors.push_back(Predecessor.get());
+        Predecessor->MSuccessors.push_back(Partition.get());
       }
     }
   }
@@ -910,6 +916,30 @@ exec_graph_impl::enqueueNode(ur_exp_command_buffer_handle_t CommandBuffer,
   return Event->getSyncPoint();
 }
 
+
+void exec_graph_impl::buildRequirements() {
+
+  for (auto &Node : MNodeStorage) {
+    if (!Node->MCommandGroup)
+      continue;
+
+    MRequirements.insert(MRequirements.end(),
+                         Node->MCommandGroup->getRequirements().begin(),
+                         Node->MCommandGroup->getRequirements().end());
+
+    std::shared_ptr<partition> &Partition = MPartitions[MPartitionNodes[Node]];
+
+    Partition->MRequirements.insert(
+        Partition->MRequirements.end(),
+        Node->MCommandGroup->getRequirements().begin(),
+        Node->MCommandGroup->getRequirements().end());
+
+    Partition->MAccessors.insert(Partition->MAccessors.end(),
+                                 Node->MCommandGroup->getAccStorage().begin(),
+                                 Node->MCommandGroup->getAccStorage().end());
+  }
+}
+
 void exec_graph_impl::createCommandBuffers(
     sycl::device Device, std::shared_ptr<partition> &Partition) {
   ur_exp_command_buffer_handle_t OutCommandBuffer;
@@ -949,16 +979,6 @@ void exec_graph_impl::createCommandBuffers(
     } else {
       MSyncPoints[Node] = enqueueNode(OutCommandBuffer, Node);
     }
-
-    // Append Node requirements to overall graph requirements
-    MRequirements.insert(MRequirements.end(),
-                         Node->MCommandGroup->getRequirements().begin(),
-                         Node->MCommandGroup->getRequirements().end());
-    // Also store the actual accessor to make sure they are kept alive when
-    // commands are submitted
-    MAccessors.insert(MAccessors.end(),
-                      Node->MCommandGroup->getAccStorage().begin(),
-                      Node->MCommandGroup->getAccStorage().end());
   }
 
   Res = Adapter
@@ -979,7 +999,7 @@ exec_graph_impl::exec_graph_impl(sycl::context Context,
           sycl::detail::getSyclObjImpl(Context), sycl::async_handler{},
           sycl::property_list{})),
       MDevice(GraphImpl->getDevice()), MContext(Context), MRequirements(),
-      MExecutionEvents(),
+      MSchedulerDependencies(),
       MIsUpdatable(PropList.has_property<property::graph::updatable>()),
       MEnableProfiling(
          PropList.has_property<property::graph::enable_profiling>()),
@@ -1026,140 +1046,298 @@ exec_graph_impl::~exec_graph_impl() {
   }
 }
 
-sycl::event
-exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
-                         sycl::detail::CG::StorageInitHelper CGData) {
-  WriteLock Lock(MMutex);
+// Clean up any execution events which have finished so we don't pass them
+// to the scheduler.
+static void cleanupExecutionEvents(std::vector<EventImplPtr> &ExecutionEvents) {
+
+  auto Predicate = [](EventImplPtr &EventPtr) {
+    return EventPtr->isCompleted();
+  };
+
+  ExecutionEvents.erase(
+      std::remove_if(ExecutionEvents.begin(), ExecutionEvents.end(), Predicate),
+      ExecutionEvents.end());
+}
+
+EventImplPtr exec_graph_impl::enqueueHostTaskPartition(
+    std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+    sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) {
+
+  auto NodeImpl = Partition->MSchedule.front();
+  auto NodeCommandGroup =
+      static_cast<sycl::detail::CGHostTask *>(NodeImpl->MCommandGroup.get());
+
+  CGData.MRequirements.insert(CGData.MRequirements.end(),
+                              NodeCommandGroup->getRequirements().begin(),
+                              NodeCommandGroup->getRequirements().end());
+  CGData.MAccStorage.insert(CGData.MAccStorage.end(),
+                            NodeCommandGroup->getAccStorage().begin(),
+                            NodeCommandGroup->getAccStorage().end());
+
+  assert(std::all_of(
+      NodeCommandGroup->MArgs.begin(), NodeCommandGroup->MArgs.end(),
+      [](ArgDesc Arg) {
+        return Arg.MType != sycl::detail::kernel_param_kind_t::kind_std_layout;
+      }));
+
+  // Create a copy of this node command-group which contains the right
+  // dependencies for the current execution.
+  std::unique_ptr<sycl::detail::CG> CommandGroup =
+      std::make_unique<sycl::detail::CGHostTask>(sycl::detail::CGHostTask(
+          NodeCommandGroup->MHostTask, &Queue, NodeCommandGroup->MContext,
+          NodeCommandGroup->MArgs, std::move(CGData),
+          NodeCommandGroup->getType()));
+
+  EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG(
+      std::move(CommandGroup), Queue.shared_from_this(), EventNeeded);
+
+  if (EventNeeded) {
+    return SchedulerEvent;
+  }
+  return nullptr;
+}
+
+EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler(
+    std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+    sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) {
+
+  if (!Partition->MRequirements.empty()) {
+    CGData.MRequirements.insert(CGData.MRequirements.end(),
+                                Partition->MRequirements.begin(),
+                                Partition->MRequirements.end());
+    CGData.MAccStorage.insert(CGData.MAccStorage.end(),
+                              Partition->MAccessors.begin(),
+                              Partition->MAccessors.end());
+  }
+
+  auto CommandBuffer = Partition->MCommandBuffers[Queue.get_device()];
+
+  std::unique_ptr<sycl::detail::CG> CommandGroup =
+      std::make_unique<sycl::detail::CGExecCommandBuffer>(
+          CommandBuffer, nullptr, std::move(CGData));
+
+  EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG(
+      std::move(CommandGroup), Queue.shared_from_this(), EventNeeded);
+
+  if (EventNeeded) {
+    SchedulerEvent->setEventFromSubmittedExecCommandBuffer(true);
+    return SchedulerEvent;
+  }
+
+  return nullptr;
+}
+
+EventImplPtr exec_graph_impl::enqueuePartitionDirectly(
+    std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+    std::vector<sycl::detail::EventImplPtr> &WaitEvents, bool EventNeeded) {
+
+  // Create a list containing all the UR event handles in WaitEvents.
+  // WaitEvents is assumed to be safe for scheduler bypass and any host-task
+  // events that it contains can be ignored.
+  std::vector<ur_event_handle_t> UrEventHandles{};
+  UrEventHandles.reserve(WaitEvents.size());
+  for (auto &SyclWaitEvent : WaitEvents) {
+    if (auto URHandle = SyclWaitEvent->getHandle()) {
+      UrEventHandles.push_back(URHandle);
+    }
+  }
 
-  // Map of the partitions to their execution events
-  std::unordered_map<std::shared_ptr<partition>, sycl::detail::EventImplPtr>
-      PartitionsExecutionEvents;
+  auto CommandBuffer = Partition->MCommandBuffers[Queue.get_device()];
+  const size_t UrEnqueueWaitListSize = UrEventHandles.size();
+  const ur_event_handle_t *UrEnqueueWaitList =
+      UrEnqueueWaitListSize == 0 ? nullptr : UrEventHandles.data();
 
-  auto CreateNewEvent([&]() {
+  if (!EventNeeded) {
+    Queue.getAdapter()
+        ->call<sycl::detail::UrApiKind::urEnqueueCommandBufferExp>(
+            Queue.getHandleRef(), CommandBuffer, UrEnqueueWaitListSize,
+            UrEnqueueWaitList, nullptr);
+    return nullptr;
+  } else {
     auto NewEvent = sycl::detail::event_impl::create_device_event(Queue);
     NewEvent->setContextImpl(Queue.getContextImpl());
     NewEvent->setStateIncomplete();
+    NewEvent->setSubmissionTime();
+    ur_event_handle_t UrEvent = nullptr;
+    Queue.getAdapter()
+        ->call<sycl::detail::UrApiKind::urEnqueueCommandBufferExp>(
+            Queue.getHandleRef(), CommandBuffer, UrEventHandles.size(),
+            UrEnqueueWaitList, &UrEvent);
+    NewEvent->setHandle(UrEvent);
+    NewEvent->setEventFromSubmittedExecCommandBuffer(true);
     return NewEvent;
-  });
-
-  sycl::detail::EventImplPtr NewEvent;
-  std::vector<sycl::detail::EventImplPtr> BackupCGDataMEvents;
-  if (MPartitions.size() > 1) {
-    BackupCGDataMEvents = CGData.MEvents;
-  }
-  for (uint32_t currentPartitionsNum = 0;
-       currentPartitionsNum < MPartitions.size(); currentPartitionsNum++) {
-    auto CurrentPartition = MPartitions[currentPartitionsNum];
-    // restore initial MEvents to add only needed additional depenencies
-    if (currentPartitionsNum > 0) {
-      CGData.MEvents = BackupCGDataMEvents;
-    }
-
-    for (auto const &DepPartition : CurrentPartition->MPredecessors) {
-      CGData.MEvents.push_back(PartitionsExecutionEvents[DepPartition]);
-    }
-
-    auto CommandBuffer = CurrentPartition->MCommandBuffers[Queue.get_device()];
-
-    if (CommandBuffer) {
-      for (std::vector<sycl::detail::EventImplPtr>::iterator It =
-               MExecutionEvents.begin();
-           It != MExecutionEvents.end();) {
-        EventImplPtr &Event = *It;
-        if (!Event->isCompleted()) {
-          const std::vector<EventImplPtr> &AttachedEventsList =
-              Event->getPostCompleteEvents();
-          CGData.MEvents.reserve(CGData.MEvents.size() +
-                                 AttachedEventsList.size() + 1);
-          CGData.MEvents.push_back(Event);
-          // Add events of the previous execution of all graph partitions.
-          CGData.MEvents.insert(CGData.MEvents.end(),
-                                AttachedEventsList.begin(),
-                                AttachedEventsList.end());
-          ++It;
-        } else {
-          // Remove completed events
-          It = MExecutionEvents.erase(It);
-        }
+  }
+}
+
+EventImplPtr
+exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
+                                   sycl::detail::CG::StorageInitHelper &CGData,
+                                   bool IsCGDataSafeForSchedulerBypass,
+                                   bool EventNeeded) {
+
+  // If EventNeeded is true, this vector is used to keep track of dependencies
+  // for the returned event. This is used when the graph has multiple end nodes
+  // which cannot be tracked with a single scheduler event.
+  std::vector<EventImplPtr> PostCompleteDependencies;
+  // TODO After refactoring the event class to use enable_shared_from_this, the
+  // events used in PostCompleteDependencies can become raw pointers as long as
+  // Event->attachEventToComplete() extends the lifetime of the pointer with
+  // shared_from_this.
+
+  // This variable represents the returned event. It will always be nullptr if
+  // EventNeeded is false.
+  EventImplPtr SignalEvent;
+
+  // CGData.MEvents gets cleared after every partition enqueue. If we need the
+  // original events, a backup needs to be created now. This is only needed
+  // when the graph contains more than one root partition.
+  std::vector<EventImplPtr> BackupCGDataEvents;
+  if (MRootPartitions.size() > 1) {
+    BackupCGDataEvents = CGData.MEvents;
+  }
+
+  for (auto &Partition : MPartitions) {
+
+    if (Partition->MPredecessors.empty() && CGData.MEvents.empty()) {
+      // If this is a root partition and CGData has been cleared already, we
+      // need to restore it so that the partition execution waits for the
+      // dependencies of this graph execution.
+      CGData.MEvents = BackupCGDataEvents;
+    } else {
+      // Partitions can have multiple dependencies from previously executed
+      // partitions. To enforce this ordering, we need to add these
+      // dependencies to CGData.
+      for (auto &Predecessor : Partition->MPredecessors) {
+        CGData.MEvents.push_back(Predecessor->MEvent);
       }
+    }
 
-      NewEvent = CreateNewEvent();
-      ur_event_handle_t UREvent = nullptr;
-      // Merge requirements from the nodes into requirements (if any) from the
-      // handler.
-      CGData.MRequirements.insert(CGData.MRequirements.end(),
-                                  MRequirements.begin(), MRequirements.end());
-      CGData.MAccStorage.insert(CGData.MAccStorage.end(), MAccessors.begin(),
-                                MAccessors.end());
-
-      // If we have no requirements or dependent events for the command buffer,
-      // enqueue it directly
-      if (CGData.MRequirements.empty() && CGData.MEvents.empty()) {
-        NewEvent->setSubmissionTime();
-        ur_result_t Res =
-            Queue.getAdapter()
-                ->call_nocheck<
                    sycl::detail::UrApiKind::urEnqueueCommandBufferExp>(
-                    Queue.getHandleRef(), CommandBuffer, 0, nullptr, &UREvent);
-        NewEvent->setHandle(UREvent);
-        if (Res == UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES) {
-          throw sycl::exception(
-              make_error_code(errc::invalid),
-              "Graphs cannot be submitted to a queue which uses "
-              "immediate command lists. Use "
-              "sycl::ext::intel::property::queue::no_immediate_"
-              "command_list to disable them.");
-        } else if (Res != UR_RESULT_SUCCESS) {
-          throw sycl::exception(
-              errc::event,
-              "Failed to enqueue event for command buffer submission");
-        }
+    // We always need to request an event to use as dependency between
+    // partition executions and between graph executions because the
+    // scheduler doesn't seem to guarantee the execution order of host-tasks
+    // without adding explicit event dependencies even when the queue is
+    // in-order.
+    constexpr bool RequestEvent = true;
+
+    EventImplPtr EnqueueEvent;
+    if (Partition->MIsHostTask) {
+      // The event returned by a host-task is always needed to synchronize with
+      // other partitions or to be used by the sycl queue as a dependency for
+      // further commands.
+      EnqueueEvent =
+          enqueueHostTaskPartition(Partition, Queue, CGData, RequestEvent);
+    } else {
+      // The scheduler can only be skipped if the partition is a root and is
+      // not a host-task. This is because all host-tasks need to go through the
+      // scheduler and, since only the scheduler can wait on host-task events,
+      // any subsequent partitions that depend on a host-task partition also
+      // need to use the scheduler.
+      bool SkipScheduler = Partition->MPredecessors.empty() &&
+                           IsCGDataSafeForSchedulerBypass &&
+                           Partition->MRequirements.empty();
+      if (SkipScheduler) {
+        EnqueueEvent = enqueuePartitionDirectly(Partition, Queue,
+                                                CGData.MEvents, RequestEvent);
       } else {
-        std::unique_ptr<sycl::detail::CG> CommandGroup =
-            std::make_unique<sycl::detail::CGExecCommandBuffer>(
-                CommandBuffer, nullptr, std::move(CGData));
+        EnqueueEvent = enqueuePartitionWithScheduler(Partition, Queue, CGData,
+                                                     RequestEvent);
+      }
+    }
 
-        NewEvent = sycl::detail::Scheduler::getInstance().addCG(
-            std::move(CommandGroup), Queue.shared_from_this(),
-            /*EventNeeded=*/true);
+    if (!Partition->MSuccessors.empty()) {
+      // Need to keep track of the EnqueueEvent for this partition so that
+      // it can be added as a dependency to CGData when successors are
+      // executed.
+      Partition->MEvent = std::move(EnqueueEvent);
+    } else {
+      // Unified runtime guarantees the execution order of command-buffers.
+      // However, since host-tasks have been scheduled, we always need to add
+      // a dependency for the next graph execution. If we don't, the next
+      // graph execution could end up with the same host-task node executing
+      // in parallel.
+      MSchedulerDependencies.push_back(EnqueueEvent);
+      if (EventNeeded) {
+        const bool IsLastPartition = (Partition == MPartitions.back());
+        if (IsLastPartition) {
+          // If we are in the last partition, move the event to SignalEvent,
+          // so that it can be returned to the user.
+          SignalEvent = std::move(EnqueueEvent);
+        } else {
+          // If it's not the last partition, keep track of the event as a post
+          // complete dependency.
+          PostCompleteDependencies.push_back(std::move(EnqueueEvent));
+        }
       }
-      NewEvent->setEventFromSubmittedExecCommandBuffer(true);
-    } else if ((CurrentPartition->MSchedule.size() > 0) &&
-               (CurrentPartition->MSchedule.front()->MCGType ==
-                sycl::detail::CGType::CodeplayHostTask)) {
+    }
 
-      auto NodeImpl = CurrentPartition->MSchedule.front();
-      // Schedule host task
-      NodeImpl->MCommandGroup->getEvents().insert(
-          NodeImpl->MCommandGroup->getEvents().end(), CGData.MEvents.begin(),
-          CGData.MEvents.end());
-      // HostTask CG stores the Queue on which the task was submitted.
-      // In case of graph, this queue may differ from the actual execution
-      // queue. We therefore overload this Queue before submitting the task.
-      static_cast<sycl::detail::CGHostTask &>(*NodeImpl->MCommandGroup.get())
-          .MQueue = Queue.shared_from_this();
+    // Clear the event list so that unnecessary dependencies are not added on
+    // future partition executions.
+    CGData.MEvents.clear();
+  }
 
-      NewEvent = sycl::detail::Scheduler::getInstance().addCG(
-          NodeImpl->getCGCopy(), Queue.shared_from_this(),
-          /*EventNeeded=*/true);
+  if (EventNeeded) {
+    for (auto &EventFromOtherPartitions : PostCompleteDependencies) {
+      SignalEvent->attachEventToComplete(EventFromOtherPartitions);
     }
-    PartitionsExecutionEvents[CurrentPartition] = NewEvent;
   }
 
-  // Keep track of this execution event so we can make sure it's completed in
-  // the destructor.
-  MExecutionEvents.push_back(NewEvent);
-  // Attach events of previous partitions to ensure that when the returned
-  // event is complete all execution associated with the graph have been
-  // completed.
-  for (auto const &Elem : PartitionsExecutionEvents) {
-    if (Elem.second != NewEvent) {
-      NewEvent->attachEventToComplete(Elem.second);
+  return SignalEvent;
+}
+
+EventImplPtr
+exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
+                         sycl::detail::CG::StorageInitHelper CGData,
+                         bool EventNeeded) {
+  WriteLock Lock(MMutex);
+
+  cleanupExecutionEvents(MSchedulerDependencies);
+  CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(),
+                        MSchedulerDependencies.end());
+
+  bool IsCGDataSafeForSchedulerBypass =
+      detail::Scheduler::areEventsSafeForSchedulerBypass(
+          CGData.MEvents, Queue.getContextImpl()) &&
+      CGData.MRequirements.empty();
+
+  // This variable represents the returned event. It will always be nullptr if
+  // EventNeeded is false.
+  EventImplPtr SignalEvent;
+
+  if (!MContainsHostTask) {
+    bool SkipScheduler =
+        IsCGDataSafeForSchedulerBypass && MPartitions[0]->MRequirements.empty();
+    if (SkipScheduler) {
+      SignalEvent = enqueuePartitionDirectly(MPartitions[0], Queue,
+                                             CGData.MEvents, EventNeeded);
+    } else {
+      bool RequestSchedulerEvent = EventNeeded || MIsUpdatable;
+      auto SchedulerEvent = enqueuePartitionWithScheduler(
+          MPartitions[0], Queue, std::move(CGData), RequestSchedulerEvent);
+
+      // If the graph is updatable, and we are going through the scheduler, we
+      // need to track the execution event to make sure that any future updates
+      // happen after the graph execution.
+      // There is no need to track the execution event when updates are not
+      // allowed because Unified Runtime already guarantees the execution order
+      // of command-buffers.
+      if (MIsUpdatable) {
+        MSchedulerDependencies.push_back(
+            EventNeeded ? SchedulerEvent : std::move(SchedulerEvent));
+      }
+
+      if (EventNeeded) {
+        SignalEvent = std::move(SchedulerEvent);
+      }
     }
+  } else {
+    SignalEvent = enqueuePartitions(
+        Queue, CGData, IsCGDataSafeForSchedulerBypass, EventNeeded);
   }
-  NewEvent->setProfilingEnabled(MEnableProfiling);
-  sycl::event QueueEvent =
-      sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
-  return QueueEvent;
+
+  if (EventNeeded) {
+    SignalEvent->setProfilingEnabled(MEnableProfiling);
+  }
+
+  return SignalEvent;
 }
 
 void exec_graph_impl::duplicateNodes() {
@@ -1408,24 +1586,16 @@ void exec_graph_impl::update(
   std::vector<sycl::detail::AccessorImplHost *> UpdateRequirements;
   bool NeedScheduledUpdate = needsScheduledUpdate(Nodes, UpdateRequirements);
   if (NeedScheduledUpdate) {
-    // Clean up any execution events which have finished so we don't pass them
-    // to the scheduler.
-    for (auto It = MExecutionEvents.begin(); It != MExecutionEvents.end();) {
-      if ((*It)->isCompleted()) {
-        It = MExecutionEvents.erase(It);
-        continue;
-      }
-      ++It;
-    }
+    cleanupExecutionEvents(MSchedulerDependencies);
 
    // Track the event for the update command since execution may be blocked by
    // other scheduler commands
    auto UpdateEvent =
        sycl::detail::Scheduler::getInstance().addCommandGraphUpdate(
            this, Nodes, MQueueImpl, std::move(UpdateRequirements),
-            MExecutionEvents);
+            MSchedulerDependencies);
 
-    MExecutionEvents.push_back(UpdateEvent);
+    MSchedulerDependencies.push_back(UpdateEvent);
 
    if (MContainsHostTask) {
      // If the graph has HostTasks, the update has to be blocking. This is
@@ -1448,17 +1618,11 @@ void exec_graph_impl::update(
   // Rebuild cached requirements and accessor storage for this graph with
   // updated nodes
   MRequirements.clear();
-  MAccessors.clear();
-  for (auto &Node : MNodeStorage) {
-    if (!Node->MCommandGroup)
-      continue;
-    MRequirements.insert(MRequirements.end(),
-                         Node->MCommandGroup->getRequirements().begin(),
-                         Node->MCommandGroup->getRequirements().end());
-    MAccessors.insert(MAccessors.end(),
-                      Node->MCommandGroup->getAccStorage().begin(),
-                      Node->MCommandGroup->getAccStorage().end());
+  for (auto &Partition : MPartitions) {
+    Partition->MRequirements.clear();
+    Partition->MAccessors.clear();
   }
+  buildRequirements();
 }
 
 bool exec_graph_impl::needsScheduledUpdate(
@@ -1500,7 +1664,7 @@ bool exec_graph_impl::needsScheduledUpdate(
 
   // If we have previous execution events do the update through the scheduler to
   // ensure it is ordered correctly.
-  NeedScheduledUpdate |= MExecutionEvents.size() > 0;
+  NeedScheduledUpdate |= MSchedulerDependencies.size() > 0;
 
   return NeedScheduledUpdate;
 }
@@ -1995,10 +2159,11 @@ void executable_command_graph::finalizeImpl() {
 
   auto Device = impl->getGraphImpl()->getDevice();
   for (auto Partition : impl->getPartitions()) {
-    if (!Partition->isHostTask()) {
+    if (!Partition->MIsHostTask) {
       impl->createCommandBuffers(Device, Partition);
     }
   }
+  impl->buildRequirements();
 }
 
 void executable_command_graph::update(
diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp
index 0ef5b506318da..3a4db99196af1 100644
--- a/sycl/source/detail/graph_impl.hpp
+++ b/sycl/source/detail/graph_impl.hpp
@@ -771,16 +771,28 @@ class partition {
   std::unordered_map<sycl::device, ur_exp_command_buffer_handle_t>
       MCommandBuffers;
   /// List of predecessors to this partition.
-  std::vector<std::shared_ptr<partition>> MPredecessors;
+  std::vector<partition *> MPredecessors;
+
+  /// List of successors to this partition.
+  std::vector<partition *> MSuccessors;
+
+  /// List of requirements for this partition.
+  std::vector<sycl::detail::AccessorImplHost *> MRequirements;
+
+  /// Storage for accessors which are used by this partition.
+  std::vector<sycl::detail::AccessorImplPtr> MAccessors;
+
   /// True if the graph of this partition is a single path graph
   /// and in-order optmization can be applied on it.
   bool MIsInOrderGraph = false;
 
-  /// @return True if the partition contains a host task
-  bool isHostTask() const {
-    return (MRoots.size() && ((*MRoots.begin()).lock()->MCGType ==
-                              sycl::detail::CGType::CodeplayHostTask));
-  }
+  /// True if this partition contains only one node which is a host_task.
+  bool MIsHostTask = false;
+
+  // Submission event for the partition. Used during enqueue to define
+  // dependencies between this partition and its successors. This event is
+  // replaced every time the partition is executed.
+  EventImplPtr MEvent;
 
   /// Checks if the graph is single path, i.e. each node has a single successor.
   /// @return True if the graph is a single path
@@ -1330,9 +1342,17 @@ class exec_graph_impl {
   /// execution.
   /// @param Queue Command-queue to schedule execution on.
   /// @param CGData Command-group data provided by the sycl::handler
-  /// @return Event associated with the execution of the graph.
-  sycl::event enqueue(sycl::detail::queue_impl &Queue,
-                      sycl::detail::CG::StorageInitHelper CGData);
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return Returns an event if EventNeeded is true. Returns nullptr
+  /// otherwise.
+  EventImplPtr enqueue(sycl::detail::queue_impl &Queue,
+                       sycl::detail::CG::StorageInitHelper CGData,
+                       bool EventNeeded);
+
+  /// Iterates through all the nodes in the graph to build the list of
+  /// accessor requirements for the whole graph and for each partition.
+  void buildRequirements();
 
   /// Turns the internal graph representation into UR command-buffers for a
   /// device.
@@ -1366,13 +1386,17 @@ class exec_graph_impl {
     return MPartitions;
   }
 
+  /// Query whether the graph contains any host-task nodes.
+  /// @return True if the graph contains any host-task nodes. False otherwise.
+  bool containsHostTask() const { return MContainsHostTask; }
+
   /// Checks if the previous submissions of this graph have been completed
   /// This function checks the status of events associated to the previous graph
   /// submissions.
   /// @return true if all previous submissions have been completed, false
   /// otherwise.
   bool previousSubmissionCompleted() const {
-    for (auto Event : MExecutionEvents) {
+    for (auto Event : MSchedulerDependencies) {
       if (!Event->isCompleted()) {
         return false;
       }
@@ -1447,6 +1471,65 @@ class exec_graph_impl {
       ur_exp_command_buffer_handle_t CommandBuffer,
       std::shared_ptr<node_impl> Node);
 
+  /// Enqueues a host-task partition (i.e. a partition that contains only a
+  /// single node and that node is a host-task).
+  /// @param Partition The partition to enqueue.
+  /// @param Queue Command-queue to schedule execution on.
+  /// @param CGData Command-group data used for initializing the host-task
+  /// command-group.
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return If EventNeeded is true returns the event resulting from enqueueing
+  /// the host-task through the scheduler. Returns nullptr otherwise.
+  EventImplPtr enqueueHostTaskPartition(
+      std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+      sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);
+
+  /// Enqueues a graph partition that contains no host-tasks using the
+  /// scheduler.
+  /// @param Partition The partition to enqueue.
+  /// @param Queue Command-queue to schedule execution on.
+  /// @param CGData Command-group data used for initializing the command-buffer
+  /// command-group.
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return If EventNeeded is true returns the event resulting from enqueueing
+  /// the command-buffer through the scheduler. Returns nullptr otherwise.
+  EventImplPtr enqueuePartitionWithScheduler(
+      std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+      sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);
+
+  /// Enqueues a graph partition that contains no host-tasks by directly calling
+  /// the unified-runtime API (i.e. avoids scheduler overhead).
+  /// @param Partition The partition to enqueue.
+  /// @param Queue Command-queue to schedule execution on.
+  /// @param WaitEvents List of events to wait on. All the events on this list
+  /// must be safe for scheduler bypass. Only events containing a valid UR event
+  /// handle will be waited for.
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return If EventNeeded is true returns the event resulting from enqueueing
+  /// the command-buffer. Returns nullptr otherwise.
+  EventImplPtr enqueuePartitionDirectly(
+      std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
+      std::vector<sycl::detail::EventImplPtr> &WaitEvents, bool EventNeeded);
+
+  /// Enqueues all the partitions in a graph.
+  /// @param Queue Command-queue to schedule execution on.
+  /// @param CGData Command-group data that contains the dependencies and
+  /// accessor requirements needed to enqueue this graph.
+  /// @param IsCGDataSafeForSchedulerBypass Whether the dependencies in CGData
+  /// are safe for scheduler bypass, i.e. CGData contains no requirements and
+  /// no events (such as host-task events) that require enqueuing through the
+  /// scheduler.
+  /// @param EventNeeded Whether an event signalling the completion of this
+  /// operation needs to be returned.
+  /// @return If EventNeeded is true returns the event resulting from enqueueing
+  /// the command-buffer. Returns nullptr otherwise.
+  EventImplPtr enqueuePartitions(sycl::detail::queue_impl &Queue,
+                                 sycl::detail::CG::StorageInitHelper &CGData,
+                                 bool IsCGDataSafeForSchedulerBypass,
+                                 bool EventNeeded);
+
   /// Iterates back through predecessors to find the real dependency.
   /// @param[out] Deps Found dependencies.
   /// @param[in] CurrentNode Node to find dependencies for.
@@ -1541,11 +1624,9 @@ class exec_graph_impl {
   /// List of requirements for enqueueing this command graph, accumulated from
   /// all nodes enqueued to the graph.
   std::vector<sycl::detail::AccessorImplHost *> MRequirements;
-  /// Storage for accessors which are used by this graph, accumulated from
-  /// all nodes enqueued to the graph.
-  std::vector<sycl::detail::AccessorImplPtr> MAccessors;
-  /// List of all execution events returned from command buffer enqueue calls.
-  std::vector<sycl::detail::EventImplPtr> MExecutionEvents;
+  /// List of dependencies that enqueue or update commands need to wait on
+  /// when using the scheduler path.
+  std::vector<sycl::detail::EventImplPtr> MSchedulerDependencies;
   /// List of the partitions that compose the exec graph.
   std::vector<std::shared_ptr<partition>> MPartitions;
   /// Storage for copies of nodes from the original modifiable graph.
@@ -1554,6 +1635,8 @@ class exec_graph_impl {
   std::unordered_map<std::shared_ptr<node_impl>,
                      ur_exp_command_buffer_command_handle_t>
       MCommandMap;
+  /// List of partitions without any predecessors in this exec graph.
+  std::vector<std::shared_ptr<partition>> MRootPartitions;
   /// True if this graph can be updated (set with property::updatable)
   bool MIsUpdatable;
   /// If true, the graph profiling is enabled.
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index f9a9f41450ec4..e940ab1885334 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -336,15 +336,12 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
 
   HandlerImpl->MEventMode = SubmitInfo.EventMode();
 
-  auto isHostTask = Type == CGType::CodeplayHostTask;
-
-  // TODO: this shouldn't be needed but without this
-  // the legacy adapter doesn't synchronize the operations properly
-  // when non-immediate command lists are used.
-  auto isGraphSubmission = Type == CGType::ExecCommandBuffer;
+  auto isHostTask = Type == CGType::CodeplayHostTask ||
+                    (Type == CGType::ExecCommandBuffer &&
+                     HandlerImpl->MExecGraph->containsHostTask());
 
   auto requiresPostProcess = SubmitInfo.PostProcessorFunc() || Streams.size();
-  auto noLastEventPath = !isHostTask && !isGraphSubmission &&
+  auto noLastEventPath = !isHostTask &&
                          MNoLastEventMode.load(std::memory_order_acquire) &&
                          !requiresPostProcess;
 
@@ -363,7 +360,7 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
   } else {
     std::unique_lock<std::mutex> Lock(MMutex);
 
-    if (!isGraphSubmission && trySwitchingToNoEventsMode()) {
+    if (trySwitchingToNoEventsMode()) {
       EventImpl = finalizeHandlerInOrderNoEventsUnlocked(Handler);
     } else {
       EventImpl = finalizeHandlerInOrderWithDepsUnlocked(Handler);
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index ec7aa71196d8d..a2dfee5083095 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -757,7 +757,9 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
   detail::EventImplPtr
   finalizeHandlerInOrderHostTaskUnlocked(HandlerType &Handler) {
     assert(isInOrder());
-    assert(Handler.getType() == CGType::CodeplayHostTask);
+    assert(Handler.getType() == CGType::CodeplayHostTask ||
+           (Handler.getType() == CGType::ExecCommandBuffer &&
+            getSyclObjImpl(Handler)->MExecGraph->containsHostTask()));
 
     auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
                                               : MExtGraphDeps.LastEventPtr;
@@ -791,13 +793,8 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
   finalizeHandlerInOrderWithDepsUnlocked(HandlerType &Handler) {
     // this is handled by finalizeHandlerInOrderHostTask
     assert(Handler.getType() != CGType::CodeplayHostTask);
-
-    if (Handler.getType() == CGType::ExecCommandBuffer && MNoLastEventMode) {
-      // TODO: this shouldn't be needed but without this
-      // the legacy adapter doesn't synchronize the operations properly
-      // when non-immediate command lists are used.
-      Handler.depends_on(insertHelperBarrier(Handler));
-    }
+    assert(!(Handler.getType() == CGType::ExecCommandBuffer &&
+             getSyclObjImpl(Handler)->MExecGraph->containsHostTask()));
 
     auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
                                               : MExtGraphDeps.LastEventPtr;
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 7959442d78dea..203b777a6ade7 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -798,15 +798,20 @@ event handler::finalize() {
               nullptr, impl->MExecGraph, std::move(impl->CGData)));
 
     } else {
-      event GraphCompletionEvent =
-          impl->MExecGraph->enqueue(impl->get_queue(), std::move(impl->CGData));
-
+      detail::queue_impl &Queue = impl->get_queue();
+      bool DiscardEvent = !impl->MEventNeeded &&
+                          Queue.supportsDiscardingPiEvents() &&
+                          !impl->MExecGraph->containsHostTask();
+      detail::EventImplPtr GraphCompletionEvent = impl->MExecGraph->enqueue(
+          Queue, std::move(impl->CGData), !DiscardEvent);
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-      MLastEvent = getSyclObjImpl(GraphCompletionEvent);
+      return GraphCompletionEvent;
 #else
-      MLastEvent = GraphCompletionEvent;
+      return sycl::detail::createSyclObjFromImpl<event>(
+          GraphCompletionEvent
+              ? GraphCompletionEvent
+              : sycl::detail::event_impl::create_discarded_event());
 #endif
-      return MLastEvent;
     }
   } break;
   case detail::CGType::CopyImage:
diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp
new file mode 100644
index 0000000000000..b9751546e2923
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_with_dependencies.cpp
@@ -0,0 +1,76 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// Tests the enqueue free function kernel shortcuts.
+
+#include "../graph_common.hpp"
+#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
+#include <sycl/usm.hpp>
+
+int main() {
+  device Device{};
+  context Context{Device};
+
+  queue InOrderQueue{Context, Device, property::queue::in_order{}};
+  queue OtherQueue{Context, Device, property::queue::in_order{}};
+
+  using T = int;
+
+  T *PtrA = malloc_device<T>(Size, InOrderQueue);
+  T *PtrB = malloc_device<T>(Size, InOrderQueue);
+  T *PtrC = malloc_device<T>(Size, InOrderQueue);
+
+  exp_ext::command_graph Graph{InOrderQueue};
+  Graph.begin_recording(InOrderQueue);
+
+  T Pattern = 42;
+  exp_ext::fill(InOrderQueue, PtrA, Pattern, Size);
+
+  exp_ext::single_task(InOrderQueue, [=]() {
+    for (size_t i = 0; i < Size; ++i) {
+      PtrB[i] = i;
+    }
+  });
+
+  exp_ext::parallel_for(
+      InOrderQueue, sycl::range<1>{Size},
+      [=](sycl::item<1> Item) { PtrC[Item] += PtrA[Item] * PtrB[Item]; });
+
+  std::vector<T> Output(Size);
+  exp_ext::copy(InOrderQueue, PtrC, Output.data(), Size);
+
+  Graph.end_recording();
+
+  auto GraphExec = Graph.finalize();
+
+  const size_t MemsetValue = 12;
+  sycl::event Event =
+      exp_ext::submit_with_event(OtherQueue, [&](sycl::handler &CGH) {
+        exp_ext::single_task(CGH, [=]() {
+          for (size_t I = 0; I < Size; ++I)
+            PtrC[I] = MemsetValue;
+        });
+      });
+
+  exp_ext::submit(InOrderQueue, [&](sycl::handler &CGH) {
+    CGH.depends_on(Event);
+    exp_ext::execute_graph(CGH, GraphExec);
+  });
+
+  InOrderQueue.wait_and_throw();
+
+  free(PtrA, InOrderQueue);
+  free(PtrB, InOrderQueue);
+  free(PtrC, InOrderQueue);
+
+  for (size_t i = 0; i < Size; i++) {
+    T Ref = Pattern * i + MemsetValue;
+    assert(check_value(i, Ref, Output[i], "Output"));
+  }
+
+  return 0;
+}
diff --git a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp
index dfb100acec848..fe4731dc31535 100644
--- a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp
+++ b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp
@@ -417,11 +417,11 @@ TEST_F(CommandGraphTest, GraphPartitionsMerging) {
   auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec);
   auto PartitionsList = GraphExecImpl->getPartitions();
   ASSERT_EQ(PartitionsList.size(), 5ul);
-  ASSERT_FALSE(PartitionsList[0]->isHostTask());
-  ASSERT_TRUE(PartitionsList[1]->isHostTask());
-  ASSERT_FALSE(PartitionsList[2]->isHostTask());
-  ASSERT_TRUE(PartitionsList[3]->isHostTask());
-  ASSERT_FALSE(PartitionsList[4]->isHostTask());
+  ASSERT_FALSE(PartitionsList[0]->MIsHostTask);
+  ASSERT_TRUE(PartitionsList[1]->MIsHostTask);
+  ASSERT_FALSE(PartitionsList[2]->MIsHostTask);
+  ASSERT_TRUE(PartitionsList[3]->MIsHostTask);
+  ASSERT_FALSE(PartitionsList[4]->MIsHostTask);
 }
 
 TEST_F(CommandGraphTest, GetNodeFromEvent) {
diff --git a/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp b/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp
index c10cf2033bae2..cb234c687c249 100644
--- a/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp
+++ b/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp
@@ -529,3 +529,159 @@ TEST_F(CommandGraphTest, InOrderQueueMemcpyAndGraph) {
   auto EventGraph = InOrderQueue.submit(
       [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
 }
+
+// Validate that enqueuing a graph with
+// sycl::ext::oneapi::experimental::execute_graph using an in-order queue
+// does not request a signaling event from the UR backend and has no event
+// dependencies.
+TEST_F(CommandGraphTest, InOrderQueueEventless) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  // Finalize main graph and check schedule
+  auto GraphExec = InOrderGraph.finalize();
+
+  auto beforeUrEnqueueCommandBufferExp = [](void *pParams) -> ur_result_t {
+    auto params =
+        *static_cast<ur_enqueue_command_buffer_exp_params_t *>(pParams);
+    EXPECT_TRUE(*params.pnumEventsInWaitList == 0);
+    EXPECT_TRUE(*params.pphEventWaitList == nullptr);
+    EXPECT_TRUE(*params.pphEvent == nullptr);
+
+    return UR_RESULT_SUCCESS;
+  };
+
+  sycl::unittest::UrMock<> Mock;
+  mock::getCallbacks().set_before_callback("urEnqueueCommandBufferExp",
+                                           beforeUrEnqueueCommandBufferExp);
+
+  const size_t Iterations = 5;
+  for (size_t I = 0; I < Iterations; ++I) {
+    sycl::ext::oneapi::experimental::execute_graph(InOrderQueue, GraphExec);
+  }
+}
+
+// Validate that if an event is requested when enqueueing a graph with
+// sycl::ext::oneapi::experimental::submit_with_event using an in-order queue,
+// the implementation requests a signal event but doesn't wait on any event
+// dependencies.
+TEST_F(CommandGraphTest, InOrderQueueRequestEvent) {
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Dev, Properties};
+
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  // Finalize main graph and check schedule
+  auto GraphExec = InOrderGraph.finalize();
+
+  auto beforeUrEnqueueCommandBufferExp = [](void *pParams) -> ur_result_t {
+    auto params =
+        *static_cast<ur_enqueue_command_buffer_exp_params_t *>(pParams);
+    EXPECT_TRUE(*params.pnumEventsInWaitList == 0);
+    EXPECT_TRUE(*params.pphEventWaitList == nullptr);
+    EXPECT_TRUE(*params.pphEvent != nullptr);
+
+    return UR_RESULT_SUCCESS;
+  };
+
+  sycl::unittest::UrMock<> Mock;
+  mock::getCallbacks().set_before_callback("urEnqueueCommandBufferExp",
+                                           beforeUrEnqueueCommandBufferExp);
+
+  const size_t Iterations = 5;
+  std::vector<sycl::event> OutputEvents;
+
+  for (size_t I = 0; I < Iterations; ++I) {
+    OutputEvents.push_back(sycl::ext::oneapi::experimental::submit_with_event(
+        InOrderQueue,
+        [&](sycl::handler &cgh) { cgh.ext_oneapi_graph(GraphExec); }));
+  }
+}
+
+// Validate that enqueuing a graph using an in-order queue with an event
+// dependency does not request a signaling event from the UR backend and has
+// 1 event dependency.
+TEST_F(CommandGraphTest, InOrderQueueEventlessWithDependency) {
+  device Dev{};
+  context Context{Dev};
+
+  sycl::property_list Properties{sycl::property::queue::in_order()};
+  sycl::queue InOrderQueue{Context, Dev, Properties};
+  sycl::queue OtherQueue{Context, Dev, Properties};
+
+  experimental::command_graph<experimental::graph_state::modifiable>
+      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+
+  // Record in-order queue with three nodes
+  InOrderGraph.begin_recording(InOrderQueue);
+  auto Node1Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node2Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  auto Node3Graph = InOrderQueue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+  InOrderGraph.end_recording(InOrderQueue);
+
+  auto GraphExec = InOrderGraph.finalize();
+
+  auto beforeUrEnqueueCommandBufferExp = [](void *pParams) -> ur_result_t {
+    auto params =
+        *static_cast<ur_enqueue_command_buffer_exp_params_t *>(pParams);
+    EXPECT_TRUE(*params.pnumEventsInWaitList == 1);
+    EXPECT_TRUE(*params.pphEvent == nullptr);
+
+    return UR_RESULT_SUCCESS;
+  };
+
+  sycl::unittest::UrMock<> Mock;
+  mock::getCallbacks().set_before_callback("urEnqueueCommandBufferExp",
+                                           beforeUrEnqueueCommandBufferExp);
+
+  sycl::event Event = sycl::ext::oneapi::experimental::submit_with_event(
+      OtherQueue, [&](sycl::handler &CGH) {
+        sycl::ext::oneapi::experimental::single_task<TestKernel<>>(
+            CGH, [=]() {});
+      });
+
+  const size_t Iterations = 5;
+  for (size_t I = 0; I < Iterations; ++I) {
+    sycl::ext::oneapi::experimental::submit(
+        InOrderQueue, [&](sycl::handler &CGH) {
+          CGH.depends_on(Event);
+          sycl::ext::oneapi::experimental::execute_graph(CGH, GraphExec);
+        });
+  }
+}
\ No newline at end of file
diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
index d34cd0c2d5de9..b6380276e5826 100644
--- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
@@ -61,6 +61,8 @@ class LimitedHandler {
     handler_impl(std::shared_ptr<MockQueueImpl> Queue) : MQueue(Queue) {}
     std::shared_ptr<MockQueueImpl> MQueue;
     MockQueueImpl &get_queue() { return *MQueue; }
+    std::shared_ptr<sycl::ext::oneapi::experimental::detail::exec_graph_impl>
+        MExecGraph;
   };
   std::shared_ptr<handler_impl> impl;
   std::shared_ptr<sycl::detail::kernel_impl> MKernel;
diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp
index af058cdde4cf0..0514afdf71669 100644
--- a/unified-runtime/source/adapters/hip/command_buffer.cpp
+++ b/unified-runtime/source/adapters/hip/command_buffer.cpp
@@ -273,6 +273,10 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
 UR_APIEXPORT ur_result_t UR_APICALL
 urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
   if (hCommandBuffer->decrementReferenceCount() == 0) {
+    if (hCommandBuffer->CurrentExecution) {
+      UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait());
+      UR_CHECK_ERROR(urEventRelease(hCommandBuffer->CurrentExecution));
+    }
     delete hCommandBuffer;
   }
   return UR_RESULT_SUCCESS;
@@ -795,22 +799,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp(
     hipStream_t HIPStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
+    if (hCommandBuffer->CurrentExecution) {
+      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1,
+                                       &hCommandBuffer->CurrentExecution));
+      UR_CHECK_ERROR(urEventRelease(hCommandBuffer->CurrentExecution));
+    }
+
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
                                      phEventWaitList));
 
-    if (phEvent) {
-      RetImplEvent = std::make_unique<ur_event_handle_t_>(
-          UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, hQueue, HIPStream,
-          StreamToken);
-      UR_CHECK_ERROR(RetImplEvent->start());
-    }
+    RetImplEvent = std::make_unique<ur_event_handle_t_>(
+        UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, hQueue, HIPStream, StreamToken);
+    UR_CHECK_ERROR(RetImplEvent->start());
 
     // Launch graph
     UR_CHECK_ERROR(hipGraphLaunch(hCommandBuffer->HIPGraphExec, HIPStream));
+    UR_CHECK_ERROR(RetImplEvent->record());
+
+    hCommandBuffer->CurrentExecution = RetImplEvent.release();
 
     if (phEvent) {
-      UR_CHECK_ERROR(RetImplEvent->record());
-      *phEvent = RetImplEvent.release();
+      UR_CHECK_ERROR(urEventRetain(hCommandBuffer->CurrentExecution));
+      *phEvent = hCommandBuffer->CurrentExecution;
     }
   } catch (ur_result_t Err) {
     return Err;
diff --git a/unified-runtime/source/adapters/hip/command_buffer.hpp b/unified-runtime/source/adapters/hip/command_buffer.hpp
index 3d0047adee013..728d97719035b 100644
--- a/unified-runtime/source/adapters/hip/command_buffer.hpp
+++ b/unified-runtime/source/adapters/hip/command_buffer.hpp
@@ -128,6 +128,10 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base {
   // Atomic variable counting the number of reference to this command_buffer
   // using std::atomic prevents data race when incrementing/decrementing.
   std::atomic_uint32_t RefCount;
+  // Track the event of the current graph execution. This extra synchronization
+  // is needed because HIP (unlike CUDA) does not seem to synchronize with other
+  // executions of the same graph during hipGraphLaunch and hipGraphExecDestroy.
+  ur_event_handle_t CurrentExecution = nullptr;
 
   // Ordered map of sync_points to ur_events
   std::map<ur_exp_command_buffer_sync_point_t, ur_event_handle_t> SyncPoints;
diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp
index 6101e3b6d4ea0..a69f23f286dfd 100644
--- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp
+++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp
@@ -1488,34 +1488,33 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer,
   std::scoped_lock<ur_shared_mutex> Guard(CommandBuffer->Mutex);
   const bool UseCopyEngine = false;
   bool MustSignalWaitEvent = true;
-  if (NumEventsInWaitList) {
-    ur_ze_event_list_t TmpWaitList;
-    UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
-        NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
-
-    // Update the WaitList of the Wait Event
-    // Events are appended to the WaitList if the WaitList is not empty
-    if (CommandBuffer->WaitEvent->WaitList.isEmpty())
-      CommandBuffer->WaitEvent->WaitList = TmpWaitList;
-    else
-      CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList);
-
-    if (!CommandBuffer->WaitEvent->WaitList.isEmpty()) {
-      // Create command-list to execute before `CommandListPtr` and will signal
-      // when `EventWaitList` dependencies are complete.
-      ur_command_list_ptr_t WaitCommandList{};
-      UR_CALL(Queue->Context->getAvailableCommandList(
-          Queue, WaitCommandList, false /*UseCopyEngine*/, NumEventsInWaitList,
-          EventWaitList, false /*AllowBatching*/, nullptr /*ForcedCmdQueue*/));
-
-      ZE2UR_CALL(zeCommandListAppendBarrier,
-                 (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
-                  CommandBuffer->WaitEvent->WaitList.Length,
-                  CommandBuffer->WaitEvent->WaitList.ZeEventList));
-      Queue->executeCommandList(WaitCommandList, false /*IsBlocking*/,
-                                false /*OKToBatchCommand*/);
-      MustSignalWaitEvent = false;
-    }
+
+  ur_ze_event_list_t TmpWaitList;
+  UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
+      NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
+
+  // Update the WaitList of the Wait Event
+  // Events are appended to the WaitList if the WaitList is not empty
+  if (CommandBuffer->WaitEvent->WaitList.isEmpty())
+    CommandBuffer->WaitEvent->WaitList = TmpWaitList;
+  else
+    CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList);
+
+  if (!CommandBuffer->WaitEvent->WaitList.isEmpty()) {
+    // Create command-list to execute before `CommandListPtr` and will signal
+    // when `EventWaitList` dependencies are complete.
+    ur_command_list_ptr_t WaitCommandList{};
+    UR_CALL(Queue->Context->getAvailableCommandList(
+        Queue, WaitCommandList, false /*UseCopyEngine*/, NumEventsInWaitList,
+        EventWaitList, false /*AllowBatching*/, nullptr /*ForcedCmdQueue*/));
+
+    ZE2UR_CALL(zeCommandListAppendBarrier,
+               (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
+                CommandBuffer->WaitEvent->WaitList.Length,
+                CommandBuffer->WaitEvent->WaitList.ZeEventList));
+    Queue->executeCommandList(WaitCommandList, false /*IsBlocking*/,
+                              false /*OKToBatchCommand*/);
+    MustSignalWaitEvent = false;
  }
  // Given WaitEvent was created without specifying Counting Events, then this
  // event can be signalled on the host.
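For context, the following is a minimal usage sketch of the submission paths this patch optimizes. It is not part of the patch: the kernel bodies are placeholders, and the header path for the enqueue free functions is assumed from the e2e test above. When a finalized graph has no host-task nodes, no accessor requirements, and only dependencies that are safe for scheduler bypass, exec_graph_impl::enqueue can skip the SYCL scheduler, call urEnqueueCommandBufferExp directly, and (for an eventless submission) request no signal event from the backend.

#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>

namespace exp_ext = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Q{sycl::property::queue::in_order{}};

  // Record a graph with no host-tasks and no buffer accessors, so the
  // finalized graph qualifies for the direct (scheduler-bypass) path.
  exp_ext::command_graph Graph{Q};
  Graph.begin_recording(Q);
  exp_ext::single_task(Q, []() { /* placeholder kernel */ });
  Graph.end_recording();
  auto GraphExec = Graph.finalize();

  // Eventless submission: with this patch the runtime can enqueue the
  // command-buffer directly and pass a null signal event, which is what
  // the InOrderQueueEventless unit test checks.
  exp_ext::execute_graph(Q, GraphExec);

  // Event-returning submission: only the single signal event is requested,
  // matching the InOrderQueueRequestEvent unit test.
  sycl::event E = exp_ext::submit_with_event(
      Q, [&](sycl::handler &CGH) { exp_ext::execute_graph(CGH, GraphExec); });
  E.wait();

  Q.wait_and_throw();
  return 0;
}

Graphs containing host-task partitions still take the multi-partition scheduler path (enqueuePartitions above), since only the scheduler can wait on host-task events.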