Skip to content

Commit d3b52a9

Browse files
authored
[SYCL] Avoid multiple event_impl allocations on kernel enqueue (#17312)
Reduced the number of event_impl allocations on the kernel enqueue path to one, for the scheduler bypass fast path.
1 parent 0f04f66 commit d3b52a9

File tree

6 files changed

+139
-96
lines changed

6 files changed

+139
-96
lines changed

sycl/source/detail/event_impl.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,17 @@ event_impl::event_impl(const QueueImplPtr &Queue)
178178
MState.store(HES_Complete);
179179
}
180180

181+
void event_impl::setQueue(const QueueImplPtr &Queue) {
182+
MQueue = Queue;
183+
MIsProfilingEnabled = Queue->MIsProfilingEnabled;
184+
MFallbackProfiling = MIsProfilingEnabled && Queue->isProfilingFallback();
185+
186+
// TODO After setting the queue, the event is no longer default
187+
// constructed. Consider a design change which would allow
188+
// for such a change regardless of the construction method.
189+
MIsDefaultConstructed = false;
190+
}
191+
181192
void *event_impl::instrumentationProlog(std::string &Name, int32_t StreamID,
182193
uint64_t &IId) const {
183194
void *TraceEvent = nullptr;

sycl/source/detail/event_impl.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,14 @@ class event_impl {
6868
event_impl(ur_event_handle_t Event, const context &SyclContext);
6969
event_impl(const QueueImplPtr &Queue);
7070

71+
/// Sets a queue associated with the event
72+
///
73+
/// Please note that this function changes the event state
74+
/// as it was constructed with the queue based constructor.
75+
///
76+
/// \param Queue is a queue to be associated with the event
77+
void setQueue(const QueueImplPtr &Queue);
78+
7179
/// Waits for the event.
7280
///
7381
/// Self is needed in order to pass shared_ptr to Scheduler.

sycl/source/detail/queue_impl.cpp

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -376,30 +376,13 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
376376
// Host and interop tasks, however, are not submitted to low-level runtimes
377377
// and require separate dependency management.
378378
const CGType Type = HandlerImpl->MCGType;
379-
event Event = detail::createSyclObjFromImpl<event>(
380-
std::make_shared<detail::event_impl>());
381379
std::vector<StreamImplPtr> Streams;
382380
if (Type == CGType::Kernel)
383381
Streams = std::move(Handler.MStreamStorage);
384382

385383
HandlerImpl->MEventMode = SubmitInfo.EventMode();
386384

387-
if (SubmitInfo.PostProcessorFunc()) {
388-
auto &PostProcess = *SubmitInfo.PostProcessorFunc();
389-
390-
bool IsKernel = Type == CGType::Kernel;
391-
bool KernelUsesAssert = false;
392-
393-
if (IsKernel)
394-
// Kernel only uses assert if it's non interop one
395-
KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
396-
ProgramManager::getInstance().kernelUsesAssert(
397-
Handler.MKernelName.c_str());
398-
finalizeHandler(Handler, Event);
399-
400-
PostProcess(IsKernel, KernelUsesAssert, Event);
401-
} else
402-
finalizeHandler(Handler, Event);
385+
auto Event = finalizeHandler(Handler, SubmitInfo.PostProcessorFunc());
403386

404387
addEvent(Event);
405388

sycl/source/detail/queue_impl.hpp

Lines changed: 111 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -781,79 +781,121 @@ class queue_impl {
781781
return ResEvent;
782782
}
783783

784-
// template is needed for proper unit testing
785784
template <typename HandlerType = handler>
786-
void finalizeHandler(HandlerType &Handler, event &EventRet) {
787-
if (MIsInorder) {
788-
// Accessing and changing of an event isn't atomic operation.
789-
// Hence, here is the lock for thread-safety.
790-
std::lock_guard<std::mutex> Lock{MMutex};
791-
792-
auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
793-
: MExtGraphDeps.LastEventPtr;
794-
795-
// This dependency is needed for the following purposes:
796-
// - host tasks are handled by the runtime and cannot be implicitly
797-
// synchronized by the backend.
798-
// - to prevent the 2nd kernel enqueue when the 1st kernel is blocked
799-
// by a host task. This dependency allows to build the enqueue order in
800-
// the RT but will not be passed to the backend. See getPIEvents in
801-
// Command.
802-
if (EventToBuildDeps) {
803-
// In the case where the last event was discarded and we are to run a
804-
// host_task, we insert a barrier into the queue and use the resulting
805-
// event as the dependency for the host_task.
806-
// Note that host_task events can never be discarded, so this will not
807-
// insert barriers between host_task enqueues.
808-
if (EventToBuildDeps->isDiscarded() &&
809-
getSyclObjImpl(Handler)->MCGType == CGType::CodeplayHostTask)
810-
EventToBuildDeps = insertHelperBarrier(Handler);
811-
812-
if (!EventToBuildDeps->isDiscarded())
813-
Handler.depends_on(EventToBuildDeps);
814-
}
785+
event finalizeHandlerInOrder(HandlerType &Handler) {
786+
// Accessing and changing of an event isn't atomic operation.
787+
// Hence, here is the lock for thread-safety.
788+
std::lock_guard<std::mutex> Lock{MMutex};
789+
790+
auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
791+
: MExtGraphDeps.LastEventPtr;
792+
793+
// This dependency is needed for the following purposes:
794+
// - host tasks are handled by the runtime and cannot be implicitly
795+
// synchronized by the backend.
796+
// - to prevent the 2nd kernel enqueue when the 1st kernel is blocked
797+
// by a host task. This dependency allows to build the enqueue order in
798+
// the RT but will not be passed to the backend. See getPIEvents in
799+
// Command.
800+
if (EventToBuildDeps) {
801+
// In the case where the last event was discarded and we are to run a
802+
// host_task, we insert a barrier into the queue and use the resulting
803+
// event as the dependency for the host_task.
804+
// Note that host_task events can never be discarded, so this will not
805+
// insert barriers between host_task enqueues.
806+
if (EventToBuildDeps->isDiscarded() &&
807+
getSyclObjImpl(Handler)->MCGType == CGType::CodeplayHostTask)
808+
EventToBuildDeps = insertHelperBarrier(Handler);
809+
810+
if (!EventToBuildDeps->isDiscarded())
811+
Handler.depends_on(EventToBuildDeps);
812+
}
813+
814+
// If there is an external event set, add it as a dependency and clear it.
815+
// We do not need to hold the lock as MLastEventMtx will ensure the last
816+
// event reflects the corresponding external event dependence as well.
817+
std::optional<event> ExternalEvent = popExternalEvent();
818+
if (ExternalEvent)
819+
Handler.depends_on(*ExternalEvent);
820+
821+
auto EventRet = Handler.finalize();
822+
EventToBuildDeps = getSyclObjImpl(EventRet);
823+
824+
return EventRet;
825+
}
826+
827+
template <typename HandlerType = handler>
828+
event finalizeHandlerOutOfOrder(HandlerType &Handler) {
829+
const CGType Type = getSyclObjImpl(Handler)->MCGType;
830+
std::lock_guard<std::mutex> Lock{MMutex};
831+
// The following code supports barrier synchronization if host task is
832+
// involved in the scenario. Native barriers cannot handle host task
833+
// dependency so in the case where some commands were not enqueued
834+
// (blocked), we track them to prevent barrier from being enqueued
835+
// earlier.
836+
{
837+
std::lock_guard<std::mutex> RequestLock(MMissedCleanupRequestsMtx);
838+
for (auto &UpdatedGraph : MMissedCleanupRequests)
839+
doUnenqueuedCommandCleanup(UpdatedGraph);
840+
MMissedCleanupRequests.clear();
841+
}
842+
auto &Deps = MGraph.expired() ? MDefaultGraphDeps : MExtGraphDeps;
843+
if (Type == CGType::Barrier && !Deps.UnenqueuedCmdEvents.empty()) {
844+
Handler.depends_on(Deps.UnenqueuedCmdEvents);
845+
}
846+
if (Deps.LastBarrier &&
847+
(Type == CGType::CodeplayHostTask || (!Deps.LastBarrier->isEnqueued())))
848+
Handler.depends_on(Deps.LastBarrier);
849+
850+
auto EventRet = Handler.finalize();
851+
EventImplPtr EventRetImpl = getSyclObjImpl(EventRet);
852+
if (Type == CGType::CodeplayHostTask)
853+
Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
854+
else if (Type == CGType::Barrier || Type == CGType::BarrierWaitlist) {
855+
Deps.LastBarrier = EventRetImpl;
856+
Deps.UnenqueuedCmdEvents.clear();
857+
} else if (!EventRetImpl->isEnqueued()) {
858+
Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
859+
}
860+
861+
return EventRet;
862+
}
863+
864+
template <typename HandlerType = handler>
865+
event finalizeHandlerPostProcess(
866+
HandlerType &Handler,
867+
const optional<SubmitPostProcessF> &PostProcessorFunc) {
868+
auto HandlerImpl = detail::getSyclObjImpl(Handler);
869+
const CGType Type = HandlerImpl->MCGType;
870+
871+
bool IsKernel = Type == CGType::Kernel;
872+
bool KernelUsesAssert = false;
873+
874+
if (IsKernel)
875+
// Kernel only uses assert if it's non interop one
876+
KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
877+
ProgramManager::getInstance().kernelUsesAssert(
878+
Handler.MKernelName.c_str());
879+
880+
auto Event = MIsInorder ? finalizeHandlerInOrder(Handler)
881+
: finalizeHandlerOutOfOrder(Handler);
882+
883+
auto &PostProcess = *PostProcessorFunc;
815884

816-
// If there is an external event set, add it as a dependency and clear it.
817-
// We do not need to hold the lock as MLastEventMtx will ensure the last
818-
// event reflects the corresponding external event dependence as well.
819-
std::optional<event> ExternalEvent = popExternalEvent();
820-
if (ExternalEvent)
821-
Handler.depends_on(*ExternalEvent);
885+
PostProcess(IsKernel, KernelUsesAssert, Event);
822886

823-
EventRet = Handler.finalize();
824-
EventToBuildDeps = getSyclObjImpl(EventRet);
887+
return Event;
888+
}
889+
890+
// template is needed for proper unit testing
891+
template <typename HandlerType = handler>
892+
event finalizeHandler(HandlerType &Handler,
893+
const optional<SubmitPostProcessF> &PostProcessorFunc) {
894+
if (PostProcessorFunc) {
895+
return finalizeHandlerPostProcess(Handler, PostProcessorFunc);
825896
} else {
826-
const CGType Type = getSyclObjImpl(Handler)->MCGType;
827-
std::lock_guard<std::mutex> Lock{MMutex};
828-
// The following code supports barrier synchronization if host task is
829-
// involved in the scenario. Native barriers cannot handle host task
830-
// dependency so in the case where some commands were not enqueued
831-
// (blocked), we track them to prevent barrier from being enqueued
832-
// earlier.
833-
{
834-
std::lock_guard<std::mutex> RequestLock(MMissedCleanupRequestsMtx);
835-
for (auto &UpdatedGraph : MMissedCleanupRequests)
836-
doUnenqueuedCommandCleanup(UpdatedGraph);
837-
MMissedCleanupRequests.clear();
838-
}
839-
auto &Deps = MGraph.expired() ? MDefaultGraphDeps : MExtGraphDeps;
840-
if (Type == CGType::Barrier && !Deps.UnenqueuedCmdEvents.empty()) {
841-
Handler.depends_on(Deps.UnenqueuedCmdEvents);
842-
}
843-
if (Deps.LastBarrier && (Type == CGType::CodeplayHostTask ||
844-
(!Deps.LastBarrier->isEnqueued())))
845-
Handler.depends_on(Deps.LastBarrier);
846-
847-
EventRet = Handler.finalize();
848-
EventImplPtr EventRetImpl = getSyclObjImpl(EventRet);
849-
if (Type == CGType::CodeplayHostTask)
850-
Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
851-
else if (Type == CGType::Barrier || Type == CGType::BarrierWaitlist) {
852-
Deps.LastBarrier = EventRetImpl;
853-
Deps.UnenqueuedCmdEvents.clear();
854-
} else if (!EventRetImpl->isEnqueued()) {
855-
Deps.UnenqueuedCmdEvents.push_back(EventRetImpl);
856-
}
897+
return MIsInorder ? finalizeHandlerInOrder(Handler)
898+
: finalizeHandlerOutOfOrder(Handler);
857899
}
858900
}
859901

sycl/source/handler.cpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -532,11 +532,11 @@ event handler::finalize() {
532532

533533
if (DiscardEvent) {
534534
EnqueueKernel();
535-
auto EventImpl = std::make_shared<detail::event_impl>(
536-
detail::event_impl::HES_Discarded);
537-
MLastEvent = detail::createSyclObjFromImpl<event>(std::move(EventImpl));
535+
const auto &EventImpl = detail::getSyclObjImpl(MLastEvent);
536+
EventImpl->setStateDiscarded();
538537
} else {
539-
NewEvent = std::make_shared<detail::event_impl>(MQueue);
538+
NewEvent = detail::getSyclObjImpl(MLastEvent);
539+
NewEvent->setQueue(MQueue);
540540
NewEvent->setWorkerQueue(MQueue);
541541
NewEvent->setContextImpl(MQueue->getContextImplPtr());
542542
NewEvent->setStateIncomplete();
@@ -549,8 +549,6 @@ event handler::finalize() {
549549
NewEvent->getPreparedDepsEvents() = impl->CGData.MEvents;
550550
NewEvent->cleanDepEventsThroughOneLevel();
551551
}
552-
553-
MLastEvent = detail::createSyclObjFromImpl<event>(std::move(NewEvent));
554552
}
555553
return MLastEvent;
556554
}

sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ class LimitedHandler {
5252
sycl::detail::CGType MCGType;
5353
std::shared_ptr<MockQueueImpl> MQueue;
5454
std::shared_ptr<sycl::detail::handler_impl> impl;
55+
std::shared_ptr<detail::kernel_impl> MKernel;
56+
detail::string MKernelName;
5557
};
5658

5759
// Needed to use EXPECT_CALL to verify depends_on that originally appends lst
@@ -80,17 +82,16 @@ TEST_F(SchedulerTest, InOrderQueueSyncCheck) {
8082

8183
// Check that tasks submitted to an in-order queue implicitly depend_on the
8284
// previous task, this is needed to properly sync blocking & blocked tasks.
83-
sycl::event Event;
8485
{
8586
LimitedHandlerSimulation MockCGH{detail::CGType::CodeplayHostTask, Queue};
8687
EXPECT_CALL(MockCGH, depends_on(An<const sycl::detail::EventImplPtr &>()))
8788
.Times(0);
88-
Queue->finalizeHandler<LimitedHandlerSimulation>(MockCGH, Event);
89+
Queue->finalizeHandler<LimitedHandlerSimulation>(MockCGH, std::nullopt);
8990
}
9091
{
9192
LimitedHandlerSimulation MockCGH{detail::CGType::CodeplayHostTask, Queue};
9293
EXPECT_CALL(MockCGH, depends_on(An<const sycl::detail::EventImplPtr &>()))
9394
.Times(1);
94-
Queue->finalizeHandler<LimitedHandlerSimulation>(MockCGH, Event);
95+
Queue->finalizeHandler<LimitedHandlerSimulation>(MockCGH, std::nullopt);
9596
}
9697
}

0 commit comments

Comments
 (0)