Skip to content

Commit c6b7b8e

Browse files
[SYCL] Add mode where only last command in each batch yields a host-visible event (#5354)
1 parent 159a516 commit c6b7b8e

File tree

3 files changed

+151
-107
lines changed

3 files changed

+151
-107
lines changed

sycl/doc/EnvironmentVariables.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ variables in production code.</span>
145145
| `SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST` | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. |
146146
| `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE` | Any(\*) | This environment variable enables users to control use of copy engines for copy operations. If the value is an integer, it will allow the use of copy engines, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The value of this environment variable can also be a pair of the form "lower_index:upper_index" where the indices point to copy engines in a list of all available copy engines. The default is 1. |
147147
| `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY` (experimental) | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin for device to device copy operations. The default is 0. This option is experimental and will be removed once heuristics are added to make a decision about use of copy engine for device to device copy operations. |
148-
| `SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS` | Any(\*) | Enable support of device-scope events whose state is not visible to the host. If enabled the Level Zero plugin would create all events having device-scope only and create proxy host-visible events for them when their status is needed (wait/query) on the host. The default is 0, meaning all events are host-visible. |
148+
| `SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS` | Any(\*) | Enable support of device-scope events whose state is not visible to the host. If set to 1, the Level Zero plugin creates all events with device scope only and creates proxy host-visible events for them when their status is needed (wait/query) on the host. If set to 2, the Level Zero plugin creates all events with device scope and adds a proxy host-visible event at the end of each command-list submission. The default is 0, meaning all events are host-visible. |
149149

150150
## Debugging variables for CUDA Plugin
151151

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 136 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ extern "C" {
3232
// Forward declarations.
3333
static pi_result EventRelease(pi_event Event, pi_queue LockedQueue);
3434
static pi_result QueueRelease(pi_queue Queue, pi_queue LockedQueue);
35+
static pi_result EventCreate(pi_context Context, bool HostVisible,
36+
pi_event *RetEvent);
3537
}
3638

3739
namespace {
@@ -186,12 +188,31 @@ static void zePrint(const char *Format, ...) {
186188
}
187189
}
188190

189-
// Controls whether device-scope events are used.
190-
static const bool ZeAllHostVisibleEvents = [] {
191+
// Controls whether device-scope events are used, and how.
192+
static const enum EventsScope {
193+
// All events are created host-visible (the default mode)
194+
AllHostVisible,
195+
// All events are created with device-scope, and only when the
196+
// host waits on them or queries their status is a proxy
197+
// host-visible event created and set to signal after the
198+
// original event signals.
199+
OnDemandHostVisibleProxy,
200+
// All events are created with device-scope, and only
201+
// when a batch of commands is submitted for execution is a
202+
// last command added to that batch to signal host-visible
203+
// completion of each command in this batch.
204+
LastCommandInBatchHostVisible
205+
} EventsScope = [] {
191206
const auto DeviceEventsStr =
192207
std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS");
193-
bool result = (DeviceEventsStr ? (std::atoi(DeviceEventsStr) == 0) : true);
194-
return result;
208+
209+
switch (DeviceEventsStr ? std::atoi(DeviceEventsStr) : 0) {
210+
case 1:
211+
return OnDemandHostVisibleProxy;
212+
case 2:
213+
return LastCommandInBatchHostVisible;
214+
}
215+
return AllHostVisible;
195216
}();
196217

197218
// Maximum number of events that can be present in an event ZePool is captured
@@ -415,14 +436,11 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool,
415436
ze_event_pool_flag_t ZePoolFlag = {};
416437
std::list<ze_event_pool_handle_t> *ZePoolCache;
417438

418-
if (ZeAllHostVisibleEvents) {
419-
ZePoolFlag = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
420-
ZePoolCache = &ZeEventPoolCache;
421-
} else if (HostVisible) {
439+
if (HostVisible) {
422440
ZePoolFlag = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
423441
ZePoolCache = &ZeHostVisibleEventPoolCache;
424442
} else {
425-
ZePoolCache = &ZeEventPoolCache;
443+
ZePoolCache = &ZeDeviceScopeEventPoolCache;
426444
}
427445

428446
// Remove full pool from the cache.
@@ -468,30 +486,24 @@ pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) {
468486
return PI_SUCCESS;
469487
}
470488

489+
std::list<ze_event_pool_handle_t> *ZePoolCache;
490+
if (Event->IsHostVisible()) {
491+
ZePoolCache = &ZeHostVisibleEventPoolCache;
492+
} else {
493+
ZePoolCache = &ZeDeviceScopeEventPoolCache;
494+
}
495+
471496
// Put the empty pool to the cache of the pools.
472497
std::lock_guard<std::mutex> Lock(ZeEventPoolCacheMutex);
473498
if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0)
474499
die("Invalid event release: event pool doesn't have unreleased events");
475500
if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) {
476-
if (ZeEventPoolCache.front() != Event->ZeEventPool) {
477-
ZeEventPoolCache.push_back(Event->ZeEventPool);
501+
if (ZePoolCache->front() != Event->ZeEventPool) {
502+
ZePoolCache->push_back(Event->ZeEventPool);
478503
}
479504
NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool;
480505
}
481506

482-
if (Event->ZeHostVisibleEventPool) {
483-
if (NumEventsUnreleasedInEventPool[Event->ZeHostVisibleEventPool] == 0)
484-
die("Invalid host visible event release: host visible event pool doesn't "
485-
"have unreleased events");
486-
if (--NumEventsUnreleasedInEventPool[Event->ZeHostVisibleEventPool] == 0) {
487-
if (ZeHostVisibleEventPoolCache.front() !=
488-
Event->ZeHostVisibleEventPool) {
489-
ZeHostVisibleEventPoolCache.push_back(Event->ZeHostVisibleEventPool);
490-
}
491-
NumEventsAvailableInEventPool[Event->ZeHostVisibleEventPool] =
492-
MaxNumEventsPerPool;
493-
}
494-
}
495507
return PI_SUCCESS;
496508
}
497509

@@ -788,12 +800,12 @@ pi_result _pi_context::finalize() {
788800
// For example, event pool caches would be still alive.
789801
{
790802
std::lock_guard<std::mutex> Lock(ZeEventPoolCacheMutex);
791-
for (auto &ZePool : ZeEventPoolCache)
803+
for (auto &ZePool : ZeDeviceScopeEventPoolCache)
792804
ZE_CALL(zeEventPoolDestroy, (ZePool));
793805
for (auto &ZePool : ZeHostVisibleEventPoolCache)
794806
ZE_CALL(zeEventPoolDestroy, (ZePool));
795807

796-
ZeEventPoolCache.clear();
808+
ZeDeviceScopeEventPoolCache.clear();
797809
ZeHostVisibleEventPoolCache.clear();
798810
}
799811

@@ -1321,6 +1333,39 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
13211333
KernelsToBeSubmitted.clear();
13221334
}
13231335

1336+
// In this mode all inner-batch events have device visibility only,
1337+
// and we want the last command in the batch to signal a host-visible
1338+
// event that anybody waiting for any event in the batch will
1339+
// really be using.
1340+
//
1341+
if (EventsScope == LastCommandInBatchHostVisible) {
1342+
// Create a "proxy" host-visible event.
1343+
//
1344+
pi_event HostVisibleEvent;
1345+
PI_CALL(EventCreate(Context, true, &HostVisibleEvent));
1346+
1347+
// Update each command's event in the command-list to "see" this
1348+
// proxy event as a host-visible counterpart.
1349+
for (auto &Event : CommandList->second.EventList) {
1350+
Event->HostVisibleEvent = HostVisibleEvent;
1351+
PI_CALL(piEventRetain(HostVisibleEvent));
1352+
}
1353+
1354+
// Decrement the reference count by 1 so all the remaining references
1355+
// are from the other commands in this batch. This host-visible event
1356+
// will be destroyed after all events in the batch are gone.
1357+
PI_CALL(piEventRelease(HostVisibleEvent));
1358+
// Indicate no cleanup is needed for this PI event as it is special.
1359+
HostVisibleEvent->CleanedUp = true;
1360+
1361+
// Finally set to signal the host-visible event at the end of the
1362+
// command-list.
1363+
// TODO: see if we need a barrier here (or explicit wait for all events in
1364+
// the batch).
1365+
ZE_CALL(zeCommandListAppendSignalEvent,
1366+
(CommandList->first, HostVisibleEvent->ZeEvent));
1367+
}
1368+
13241369
// Close the command list and have it ready for dispatch.
13251370
ZE_CALL(zeCommandListClose, (CommandList->first));
13261371
// Offload command list to the GPU for asynchronous execution
@@ -1504,9 +1549,10 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList(
15041549
auto ZeEvent = EventList[I]->ZeEvent;
15051550

15061551
// Poll the host-visible events.
1507-
auto ZeEventHostVisible = EventList[I]->getHostVisibleEvent();
1508-
if (FilterEventWaitList && ZeEventHostVisible) {
1509-
auto Res = ZE_CALL_NOCHECK(zeEventQueryStatus, (ZeEventHostVisible));
1552+
auto HostVisibleEvent = EventList[I]->HostVisibleEvent;
1553+
if (FilterEventWaitList && HostVisibleEvent) {
1554+
auto Res =
1555+
ZE_CALL_NOCHECK(zeEventQueryStatus, (HostVisibleEvent->ZeEvent));
15101556
if (Res == ZE_RESULT_SUCCESS) {
15111557
// Event has already completed, don't put it into the list
15121558
continue;
@@ -1792,8 +1838,11 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
17921838
if (NumPlatforms)
17931839
*NumPlatforms = PiPlatformsCache->size();
17941840

1795-
zePrint("Using %s events\n",
1796-
ZeAllHostVisibleEvents ? "all host-visible" : "device-only");
1841+
zePrint("Using events scope: %s\n",
1842+
EventsScope == AllHostVisible ? "all host-visible"
1843+
: EventsScope == OnDemandHostVisibleProxy
1844+
? "on demand host-visible proxy"
1845+
: "only last command in a batch is host-visible");
17971846
return PI_SUCCESS;
17981847
}
17991848

@@ -4724,45 +4773,16 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
47244773
//
47254774
// Events
47264775
//
4727-
ze_event_handle_t _pi_event::getHostVisibleEvent() const {
4728-
if (ZeAllHostVisibleEvents) {
4729-
return ZeEvent;
4730-
} else if (ZeHostVisibleEvent) {
4731-
return ZeHostVisibleEvent;
4732-
} else {
4733-
return nullptr;
4734-
}
4735-
}
4736-
47374776
pi_result
4738-
_pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent) {
4777+
_pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &ZeHostVisibleEvent) {
47394778

4740-
if (ZeAllHostVisibleEvents) {
4741-
HostVisibleEvent = ZeEvent;
4742-
} else if (ZeHostVisibleEvent) {
4743-
HostVisibleEvent = ZeHostVisibleEvent;
4744-
} else {
4745-
size_t Index;
4746-
ze_event_pool_handle_t ZeEventPool = {};
4747-
if (auto Res =
4748-
Context->getFreeSlotInExistingOrNewPool(ZeEventPool, Index, true))
4749-
return Res;
4779+
if (!HostVisibleEvent) {
4780+
if (EventsScope != OnDemandHostVisibleProxy)
4781+
die("getOrCreateHostVisibleEvent: missing host-visible event");
47504782

4751-
// Create a "proxy" host-visible event.
4752-
//
4753-
// TODO: consider creating just single host-visible proxy event to
4754-
// represent multiple device-scope events. E.g. have a host-visible
4755-
// event at the end of each command-list to represent device-scope
4756-
// events from every command in that command-list.
4757-
//
4758-
ZeStruct<ze_event_desc_t> ZeEventDesc;
4759-
ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
4760-
ZeEventDesc.wait = 0;
4761-
ZeEventDesc.index = Index;
4762-
4763-
ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeHostVisibleEvent));
4764-
ZeHostVisibleEventPool = ZeEventPool;
4765-
HostVisibleEvent = ZeHostVisibleEvent;
4783+
// Create a "proxy" host-visible event on demand.
4784+
PI_CALL(EventCreate(Context, true, &HostVisibleEvent));
4785+
HostVisibleEvent->CleanedUp = true;
47664786

47674787
// Submit the command(s) signalling the proxy event to the queue.
47684788
// We have to first submit a wait for the device-only event for which this
@@ -4783,36 +4803,41 @@ _pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent) {
47834803
ZE_CALL(zeCommandListAppendWaitOnEvents,
47844804
(CommandList->first, 1, &ZeEvent));
47854805
ZE_CALL(zeCommandListAppendSignalEvent,
4786-
(CommandList->first, ZeHostVisibleEvent));
4806+
(CommandList->first, HostVisibleEvent->ZeEvent));
47874807

47884808
if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch))
47894809
return Res;
47904810
}
47914811
}
4812+
4813+
ZeHostVisibleEvent = HostVisibleEvent->ZeEvent;
47924814
return PI_SUCCESS;
47934815
}
47944816

4795-
pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
4817+
static pi_result EventCreate(pi_context Context, bool HostVisible,
4818+
pi_event *RetEvent) {
47964819
size_t Index = 0;
47974820
ze_event_pool_handle_t ZeEventPool = {};
4798-
if (auto Res = Context->getFreeSlotInExistingOrNewPool(ZeEventPool, Index))
4821+
if (auto Res = Context->getFreeSlotInExistingOrNewPool(ZeEventPool, Index,
4822+
HostVisible))
47994823
return Res;
48004824

48014825
ze_event_handle_t ZeEvent;
48024826
ZeStruct<ze_event_desc_t> ZeEventDesc;
48034827
ZeEventDesc.index = Index;
48044828
ZeEventDesc.wait = 0;
4805-
//
4806-
// Set the scope to "device" for every event. This is sufficient for global
4807-
// device access and peer device access. If needed to be waited on the host
4808-
// we are doing special handling, see piEventsWait.
4809-
//
4810-
// TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be
4811-
// used in some circumstances.
4812-
//
4813-
if (ZeAllHostVisibleEvents) {
4829+
4830+
if (HostVisible) {
48144831
ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
48154832
} else {
4833+
//
4834+
// Set the scope to "device" for every event. This is sufficient for global
4835+
// device access and peer device access. If needed to be seen on the host
4836+
// we are doing special handling, see EventsScope options.
4837+
//
4838+
// TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be
4839+
// used in some circumstances.
4840+
//
48164841
ZeEventDesc.signal = 0;
48174842
}
48184843

@@ -4828,9 +4853,17 @@ pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
48284853
} catch (...) {
48294854
return PI_ERROR_UNKNOWN;
48304855
}
4856+
4857+
if (HostVisible)
4858+
(*RetEvent)->HostVisibleEvent = *RetEvent;
4859+
48314860
return PI_SUCCESS;
48324861
}
48334862

4863+
pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
4864+
return EventCreate(Context, EventsScope == AllHostVisible, RetEvent);
4865+
}
4866+
48344867
pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName,
48354868
size_t ParamValueSize, void *ParamValue,
48364869
size_t *ParamValueSizeRet) {
@@ -4860,10 +4893,11 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName,
48604893
// Make sure that we query a host-visible event only.
48614894
// If one wasn't yet created then don't create it here as well, and
48624895
// just conservatively return that event is not yet completed.
4863-
auto ZeHostVisibleEvent = Event->getHostVisibleEvent();
4864-
if (ZeHostVisibleEvent) {
4896+
auto HostVisibleEvent = Event->HostVisibleEvent;
4897+
if (HostVisibleEvent) {
48654898
ze_result_t ZeResult;
4866-
ZeResult = ZE_CALL_NOCHECK(zeEventQueryStatus, (ZeHostVisibleEvent));
4899+
ZeResult =
4900+
ZE_CALL_NOCHECK(zeEventQueryStatus, (HostVisibleEvent->ZeEvent));
48674901
if (ZeResult == ZE_RESULT_SUCCESS) {
48684902
return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet,
48694903
pi_int32{CL_COMPLETE}); // Untie from OpenCL
@@ -5072,15 +5106,17 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
50725106
if (NumEvents && !EventList) {
50735107
return PI_INVALID_EVENT;
50745108
}
5075-
// Make sure to add all host-visible "proxy" event signals if needed.
5076-
// This ensures that all signalling commands are submitted below and
5077-
// thus proxy events can be waited without a deadlock.
5078-
//
5079-
for (uint32_t I = 0; I < NumEvents; I++) {
5080-
ze_event_handle_t ZeHostVisibleEvent;
5081-
if (auto Res =
5082-
EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent))
5083-
return Res;
5109+
if (EventsScope == OnDemandHostVisibleProxy) {
5110+
// Make sure to add all host-visible "proxy" event signals if needed.
5111+
// This ensures that all signalling commands are submitted below and
5112+
// thus proxy events can be waited without a deadlock.
5113+
//
5114+
for (uint32_t I = 0; I < NumEvents; I++) {
5115+
ze_event_handle_t ZeHostVisibleEvent;
5116+
if (auto Res =
5117+
EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent))
5118+
return Res;
5119+
}
50845120
}
50855121
// Submit dependent open command lists for execution, if any
50865122
for (uint32_t I = 0; I < NumEvents; I++) {
@@ -5096,10 +5132,11 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
50965132
}
50975133
}
50985134
for (uint32_t I = 0; I < NumEvents; I++) {
5099-
ze_event_handle_t ZeEvent = EventList[I]->getHostVisibleEvent();
5100-
if (!ZeEvent)
5135+
auto HostVisibleEvent = EventList[I]->HostVisibleEvent;
5136+
if (!HostVisibleEvent)
51015137
die("The host-visible proxy event missing");
51025138

5139+
ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent;
51035140
zePrint("ZeEvent = %#lx\n", pi_cast<std::uintptr_t>(ZeEvent));
51045141
ZE_CALL(zeHostSynchronize, (ZeEvent));
51055142

@@ -5159,8 +5196,12 @@ static pi_result EventRelease(pi_event Event, pi_queue LockedQueue) {
51595196
if (Event->OwnZeEvent) {
51605197
ZE_CALL(zeEventDestroy, (Event->ZeEvent));
51615198
}
5162-
if (Event->ZeHostVisibleEvent) {
5163-
ZE_CALL(zeEventDestroy, (Event->ZeHostVisibleEvent));
5199+
// It is possible that host-visible event was never created.
5200+
// If it was, and it is a different object than this event itself,
5201+
// release a reference to it.
5202+
if (Event->HostVisibleEvent && Event->HostVisibleEvent != Event) {
5203+
// Decrement ref-count of the host-visible proxy event.
5204+
PI_CALL(piEventRelease(Event->HostVisibleEvent));
51645205
}
51655206

51665207
auto Context = Event->Context;

0 commit comments

Comments
 (0)