Skip to content

Commit 7a2caca

Browse files
authored
Merge pull request #2117 from pbalcer/fix-filter-out-same-cmdlists
Fix urEnqueueEventsWaitWithBarrier when used with interop events
2 parents 1d1808a + 96f66e0 commit 7a2caca

File tree

3 files changed

+170
-44
lines changed

3 files changed

+170
-44
lines changed

source/adapters/level_zero/event.cpp

Lines changed: 60 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -171,48 +171,63 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
171171
std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);
172172

173173
// Helper function for appending a barrier to a command list.
174-
auto insertBarrierIntoCmdList =
175-
[&Queue](ur_command_list_ptr_t CmdList,
176-
const _ur_ze_event_list_t &EventWaitList,
177-
ur_event_handle_t &Event, bool IsInternal) {
178-
UR_CALL(createEventAndAssociateQueue(
179-
Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList,
180-
IsInternal, false));
181-
182-
Event->WaitList = EventWaitList;
183-
184-
// For in-order queue we don't need a real barrier, just wait for
185-
// requested events in potentially different queues and add a "barrier"
186-
// event signal because it is already guaranteed that previous commands
187-
// in this queue are completed when the signal is started.
188-
//
189-
// Only consideration here is that when profiling is used, signalEvent
190-
// cannot be used if EventWaitList.Lenght == 0. In those cases, we need
191-
// to fallback directly to barrier to have correct timestamps. See here:
192-
// https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t
193-
//
194-
// TODO: this and other special handling of in-order queues to be
195-
// updated when/if Level Zero adds native support for in-order queues.
196-
//
197-
if (Queue->isInOrderQueue() && InOrderBarrierBySignal &&
198-
!Queue->isProfilingEnabled()) {
199-
// If we are using driver in order lists, then append wait on events
200-
// is unnecessary and we can signal the event created.
201-
if (EventWaitList.Length && !CmdList->second.IsInOrderList) {
202-
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
203-
(CmdList->first, EventWaitList.Length,
204-
EventWaitList.ZeEventList));
174+
auto insertBarrierIntoCmdList = [&Queue](ur_command_list_ptr_t CmdList,
175+
_ur_ze_event_list_t &EventWaitList,
176+
ur_event_handle_t &Event,
177+
bool IsInternal) {
178+
UR_CALL(createEventAndAssociateQueue(Queue, &Event,
179+
UR_COMMAND_EVENTS_WAIT_WITH_BARRIER,
180+
CmdList, IsInternal, false));
181+
182+
Event->WaitList = EventWaitList;
183+
184+
// For in-order queue we don't need a real barrier, just wait for
185+
// requested events in potentially different queues and add a "barrier"
186+
// event signal because it is already guaranteed that previous commands
187+
// in this queue are completed when the signal is started.
188+
//
189+
// Only consideration here is that when profiling is used, signalEvent
190+
// cannot be used if EventWaitList.Lenght == 0. In those cases, we need
191+
// to fallback directly to barrier to have correct timestamps. See here:
192+
// https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t
193+
//
194+
// TODO: this and other special handling of in-order queues to be
195+
// updated when/if Level Zero adds native support for in-order queues.
196+
//
197+
if (Queue->isInOrderQueue() && InOrderBarrierBySignal &&
198+
!Queue->isProfilingEnabled()) {
199+
// If we are using driver in order lists, then append wait on events
200+
// is unnecessary IF the cmdlists match.
201+
if (EventWaitList.Length) {
202+
if (CmdList->second.IsInOrderList) {
203+
for (unsigned i = EventWaitList.Length; i-- < 0;) {
204+
// if the events is from the same cmdlist, we can remove it
205+
// from the waitlist.
206+
if (EventWaitList.UrEventList[i]->CommandList == CmdList) {
207+
EventWaitList.Length--;
208+
if (EventWaitList.Length != i) {
209+
std::swap(EventWaitList.UrEventList[i],
210+
EventWaitList.UrEventList[EventWaitList.Length]);
211+
std::swap(EventWaitList.ZeEventList[i],
212+
EventWaitList.ZeEventList[EventWaitList.Length]);
213+
}
214+
}
205215
}
206-
ZE2UR_CALL(zeCommandListAppendSignalEvent,
207-
(CmdList->first, Event->ZeEvent));
208-
} else {
209-
ZE2UR_CALL(zeCommandListAppendBarrier,
210-
(CmdList->first, Event->ZeEvent, EventWaitList.Length,
211-
EventWaitList.ZeEventList));
212216
}
217+
ZE2UR_CALL(
218+
zeCommandListAppendWaitOnEvents,
219+
(CmdList->first, EventWaitList.Length, EventWaitList.ZeEventList));
220+
}
221+
ZE2UR_CALL(zeCommandListAppendSignalEvent,
222+
(CmdList->first, Event->ZeEvent));
223+
} else {
224+
ZE2UR_CALL(zeCommandListAppendBarrier,
225+
(CmdList->first, Event->ZeEvent, EventWaitList.Length,
226+
EventWaitList.ZeEventList));
227+
}
213228

214-
return UR_RESULT_SUCCESS;
215-
};
229+
return UR_RESULT_SUCCESS;
230+
};
216231

217232
// If the queue is in-order then each command in it effectively acts as a
218233
// barrier, so we don't need to do anything except if we were requested
@@ -349,9 +364,9 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
349364
// command-lists.
350365
std::vector<ur_event_handle_t> EventWaitVector(CmdLists.size());
351366
for (size_t I = 0; I < CmdLists.size(); ++I) {
352-
UR_CALL(insertBarrierIntoCmdList(CmdLists[I], _ur_ze_event_list_t{},
353-
EventWaitVector[I],
354-
true /*IsInternal*/));
367+
_ur_ze_event_list_t waitlist;
368+
UR_CALL(insertBarrierIntoCmdList(
369+
CmdLists[I], waitlist, EventWaitVector[I], true /*IsInternal*/));
355370
}
356371
// If there were multiple queues we need to create a "convergence" event to
357372
// be our active barrier. This convergence event is signalled by a barrier
@@ -376,8 +391,9 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
376391
// If there is only a single queue then insert a barrier and the single
377392
// result event can be used as our active barrier and used as the return
378393
// event. Take into account whether output event is discarded or not.
379-
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{},
380-
ResultEvent, IsInternal));
394+
_ur_ze_event_list_t waitlist;
395+
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], waitlist, ResultEvent,
396+
IsInternal));
381397
}
382398

383399
// Execute each command list so the barriers can be encountered.

test/adapters/level_zero/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ if(UR_BUILD_ADAPTER_L0)
1515
SOURCES
1616
urProgramLink.cpp
1717
urKernelCreateWithNativeHandle.cpp
18+
urEventCreateWithNativeHandle.cpp
1819
ENVIRONMENT
1920
"UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero>\""
2021
)
test/adapters/level_zero/urEventCreateWithNativeHandle.cpp

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
// Copyright (C) 2024 Intel Corporation
2+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
3+
// See LICENSE.TXT
4+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5+
6+
#include "ur_api.h"
7+
#include "uur/checks.h"
8+
#include "ze_api.h"
9+
#include <cstring>
10+
#include <thread>
11+
#include <uur/fixtures.h>
12+
13+
using namespace std::chrono_literals;
14+
using urLevelZeroEventNativeHandleTest = uur::urQueueTest;
15+
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEventNativeHandleTest);
16+
17+
#define TEST_MEMCPY_SIZE 4096
18+
19+
// Checks that urEnqueueEventsWaitWithBarrier honours an event created from a
// native Level Zero handle: a memcpy that depends on the barrier must not run
// until the native event has been signalled from the host.
TEST_P(urLevelZeroEventNativeHandleTest, WaitForNative) {
    // Create a host-visible Level Zero event pool with a single event.
    ze_event_pool_desc_t desc{};
    desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
    desc.pNext = nullptr;
    desc.count = 1;
    desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;

    ur_native_handle_t nativeContext;
    ASSERT_SUCCESS(urContextGetNativeHandle(context, &nativeContext));

    ur_native_handle_t nativeDevice;
    ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &nativeDevice));

    ze_event_pool_handle_t pool = nullptr;

    ASSERT_EQ(zeEventPoolCreate((ze_context_handle_t)nativeContext, &desc, 1,
                                (ze_device_handle_t *)&nativeDevice, &pool),
              ZE_RESULT_SUCCESS);

    ze_event_desc_t eventDesc{};
    eventDesc.pNext = nullptr;
    eventDesc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
    eventDesc.index = 0;
    eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
    eventDesc.wait = 0;

    ze_event_handle_t zeEvent;
    ASSERT_EQ(zeEventCreate(pool, &eventDesc, &zeEvent), ZE_RESULT_SUCCESS);

    // Wrap the native event in a UR event; the test keeps ownership of the
    // native handle and destroys it itself at the end.
    ur_event_native_properties_t pprops{};
    pprops.isNativeHandleOwned = false;
    pprops.pNext = nullptr;
    pprops.stype = UR_STRUCTURE_TYPE_EVENT_NATIVE_PROPERTIES;

    ur_event_handle_t urEvent;
    ASSERT_SUCCESS(urEventCreateWithNativeHandle((ur_native_handle_t)zeEvent,
                                                 context, &pprops, &urEvent));

    int *src = (int *)malloc(TEST_MEMCPY_SIZE);
    ASSERT_NE(src, nullptr);
    memset(src, 0xc, TEST_MEMCPY_SIZE);

    int *dst = (int *)malloc(TEST_MEMCPY_SIZE);
    ASSERT_NE(dst, nullptr);
    memset(dst, 0, TEST_MEMCPY_SIZE);

    int *dst2 = (int *)malloc(TEST_MEMCPY_SIZE);
    ASSERT_NE(dst2, nullptr);
    // Clear dst2 itself; previously dst was cleared a second time and dst2
    // was left uninitialised.
    memset(dst2, 0, TEST_MEMCPY_SIZE);

    ur_event_handle_t memcpyEvent2;
    ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst2, src, TEST_MEMCPY_SIZE,
                                      0, nullptr, &memcpyEvent2));

    ur_event_handle_t memcpyEvent3;
    ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst2, src, TEST_MEMCPY_SIZE,
                                      0, nullptr, &memcpyEvent3));

    // just to make wait lists contain more than 1 event
    ur_event_handle_t events[] = {memcpyEvent2, urEvent, memcpyEvent3};

    ur_event_handle_t waitEvent;
    ASSERT_SUCCESS(
        urEnqueueEventsWaitWithBarrier(queue, 3, events, &waitEvent));

    ur_event_handle_t memcpyEvent;
    ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst, src, TEST_MEMCPY_SIZE,
                                      1, &waitEvent, &memcpyEvent));

    // urQueueFinish would hang, so we flush and then wait
    // some time to make sure the gpu had plenty of time
    // to do the memcpy.
    ASSERT_SUCCESS(urQueueFlush(queue));
    std::this_thread::sleep_for(500ms);

    // The native event has not been signalled yet, so the copy into dst must
    // still be blocked behind the barrier.
    ASSERT_NE(memcmp(src, dst, TEST_MEMCPY_SIZE), 0);

    ASSERT_EQ(zeEventHostSignal(zeEvent), ZE_RESULT_SUCCESS);

    ASSERT_SUCCESS(urQueueFinish(queue));

    // With the native event signalled the barrier is released and the copy
    // must have completed.
    ASSERT_EQ(memcmp(src, dst, TEST_MEMCPY_SIZE), 0);

    free(src);
    free(dst);
    free(dst2);
    EXPECT_EQ(urEventRelease(urEvent), UR_RESULT_SUCCESS);
    EXPECT_EQ(urEventRelease(waitEvent), UR_RESULT_SUCCESS);
    EXPECT_EQ(urEventRelease(memcpyEvent), UR_RESULT_SUCCESS);
    EXPECT_EQ(urEventRelease(memcpyEvent2), UR_RESULT_SUCCESS);
    EXPECT_EQ(urEventRelease(memcpyEvent3), UR_RESULT_SUCCESS);
    EXPECT_EQ(zeEventDestroy(zeEvent), ZE_RESULT_SUCCESS);
    EXPECT_EQ(zeEventPoolDestroy(pool), ZE_RESULT_SUCCESS);
}

0 commit comments

Comments
 (0)