Skip to content

Commit 99489ad

Browse files
Merge pull request #1880 from hdelan/l0-native-enqueue
[L0] L0 impl for enqueue native command
2 parents 3e762e0 + 3f13f69 commit 99489ad

File tree

8 files changed

+250
-5
lines changed

8 files changed

+250
-5
lines changed

source/adapters/level_zero/device.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -877,7 +877,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
877877
}
878878
case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: {
879879
// L0 supports enqueueing native work through urEnqueueNativeCommandExp
880-
return ReturnValue(static_cast<ur_bool_t>(false));
880+
return ReturnValue(static_cast<ur_bool_t>(true));
881881
}
882882

883883
case UR_DEVICE_INFO_ESIMD_SUPPORT: {

source/adapters/level_zero/enqueue_native.cpp

Lines changed: 65 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,72 @@
1010

1111
#include <ur_api.h>
1212

13+
#include "logger/ur_logger.hpp"
1314
#include "queue.hpp"
15+
#include "ur_level_zero.hpp"
1416

1517
ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp(
16-
ur_exp_enqueue_native_command_function_t, void *, uint32_t,
17-
const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *,
18-
uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
19-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
18+
ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data,
19+
uint32_t, const ur_mem_handle_t *,
20+
const ur_exp_enqueue_native_command_properties_t *,
21+
uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventList,
22+
ur_event_handle_t *phEvent) {
23+
auto Queue = this;
24+
std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);
25+
26+
bool UseCopyEngine = false;
27+
28+
// Please note that the following code should be run before the
29+
// subsequent getAvailableCommandList() call so that there is no
30+
// dead-lock from waiting unsubmitted events in an open batch.
31+
// The createAndRetainUrZeEventList() has the proper side-effect
32+
// of submitting batches with dependent events.
33+
//
34+
_ur_ze_event_list_t TmpWaitList;
35+
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
36+
NumEventsInWaitList, phEventList, Queue, UseCopyEngine));
37+
38+
// Get a new command list to be used on this call
39+
ur_command_list_ptr_t CommandList{};
40+
// TODO: Change UseCopyEngine argument to 'true' once L0 backend
41+
// support is added
42+
UR_CALL(Queue->Context->getAvailableCommandList(
43+
Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventList));
44+
45+
// TODO: do we need to create a unique command type for this?
46+
ze_event_handle_t ZeEvent = nullptr;
47+
ur_event_handle_t InternalEvent;
48+
bool IsInternal = phEvent == nullptr;
49+
ur_event_handle_t *Event = phEvent ? phEvent : &InternalEvent;
50+
UR_CALL(createEventAndAssociateQueue(Queue, Event,
51+
UR_COMMAND_ENQUEUE_NATIVE_EXP,
52+
CommandList, IsInternal, false));
53+
ZeEvent = (*Event)->ZeEvent;
54+
(*Event)->WaitList = TmpWaitList;
55+
56+
const auto &WaitList = (*Event)->WaitList;
57+
if (WaitList.Length) {
58+
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
59+
(CommandList->first, WaitList.Length, WaitList.ZeEventList));
60+
}
61+
62+
UR_CALL(Queue->executeCommandList(CommandList, false, false));
63+
UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
64+
UseCopyEngine, 0, nullptr));
65+
66+
{
67+
ScopedCommandList Active{Queue, CommandList->first};
68+
69+
// Call interop func which enqueues native async work
70+
pfnNativeEnqueue(Queue, data);
71+
}
72+
73+
UR_CALL(Queue->executeCommandList(CommandList, false, false));
74+
UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
75+
UseCopyEngine, 0, nullptr));
76+
77+
ZE2UR_CALL(zeCommandListAppendSignalEvent, (CommandList->first, ZeEvent));
78+
79+
UR_CALL(Queue->executeCommandList(CommandList, false));
80+
return UR_RESULT_SUCCESS;
2081
}

source/adapters/level_zero/queue.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,15 @@ ur_result_t ur_queue_handle_legacy_t_::queueGetNativeHandle(
705705
) {
706706
auto Queue = this;
707707

708+
// Needed for EnqueueNativeCommandExp, so that the native queue 'got' in the
709+
// interop func is the same as the native queue used to manage dependencies
710+
// before the interop func invocation
711+
if (Queue->getThreadLocalCommandList() != ze_command_list_handle_t{0}) {
712+
auto ZeCmdList = ur_cast<ze_command_list_handle_t *>(NativeQueue);
713+
*ZeCmdList = Queue->getThreadLocalCommandList();
714+
return UR_RESULT_SUCCESS;
715+
}
716+
708717
// Lock automatically releases when this goes out of scope.
709718
std::shared_lock<ur_shared_mutex> lock(Queue->Mutex);
710719

source/adapters/level_zero/queue.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,12 @@ struct ur_queue_handle_legacy_t_ : _ur_object, public ur_queue_handle_t_ {
423423
uint32_t, const ur_event_handle_t *,
424424
ur_event_handle_t *) override;
425425

426+
// Thread local command list will be used if ScopedCommandList is active
427+
static ze_command_list_handle_t &getThreadLocalCommandList() {
428+
static thread_local ze_command_list_handle_t CommandList{0};
429+
return CommandList;
430+
}
431+
426432
using queue_type = ur_device_handle_t_::queue_group_info_t::type;
427433
// PI queue is in general a one to many mapping to L0 native queues.
428434
struct ur_queue_group_t {
@@ -941,3 +947,23 @@ ur_result_t setSignalEvent(ur_queue_handle_legacy_t Queue, bool UseCopyEngine,
941947
ur_result_t CleanupEventListFromResetCmdList(
942948
std::vector<ur_event_handle_t> &EventListToCleanup,
943949
bool QueueLocked = false);
950+
951+
// RAII object to make hQueue command list getter methods all return the same
952+
// command list within the lifetime of this object.
953+
//
954+
// This is useful for urEnqueueNativeCommandExp where we want guarantees that
955+
// the user submitted native calls will be dispatched to a known command list,
956+
// which must be "got" within the user submitted function.
957+
class ScopedCommandList {
958+
ur_queue_handle_legacy_t hQueue;
959+
960+
public:
961+
ScopedCommandList(ur_queue_handle_legacy_t hQueue,
962+
ze_command_list_handle_t CommandList)
963+
: hQueue{hQueue} {
964+
hQueue->getThreadLocalCommandList() = CommandList;
965+
}
966+
~ScopedCommandList() {
967+
hQueue->getThreadLocalCommandList() = ze_command_list_handle_t{0};
968+
}
969+
};

test/conformance/exp_enqueue_native/CMakeLists.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,21 @@ if (UR_BUILD_ADAPTER_CUDA)
1515
target_link_libraries(test-exp_enqueue_native PRIVATE cudadrv)
1616
endif()
1717

18+
if (UR_BUILD_ADAPTER_L0)
19+
add_conformance_test_with_kernels_environment(
20+
exp_enqueue_native
21+
enqueue_native_level_zero.cpp
22+
)
23+
target_link_libraries(test-exp_enqueue_native PRIVATE
24+
LevelZeroLoader
25+
LevelZeroLoader-Headers
26+
)
27+
28+
target_include_directories(test-exp_enqueue_native PRIVATE
29+
${PROJECT_SOURCE_DIR}/source
30+
${PROJECT_SOURCE_DIR}/source/adapters/level_zero
31+
LevelZeroLoader-Headers
32+
)
33+
endif()
34+
1835
# TODO: Add more tests for different triples
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
// Copyright (C) 2024 Intel Corporation
2+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
3+
// See LICENSE.TXT
4+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5+
6+
#include "ze_api.h"
7+
8+
#include <uur/fixtures.h>
9+
#include <vector>
10+
11+
using T = uint32_t;
12+
13+
struct urLevelZeroEnqueueNativeCommandTest : uur::urQueueTest {
14+
void SetUp() {
15+
UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::SetUp());
16+
17+
host_vec = std::vector<T>(global_size, 0);
18+
ASSERT_EQ(host_vec.size(), global_size);
19+
ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr,
20+
allocation_size, &device_ptr));
21+
ASSERT_NE(device_ptr, nullptr);
22+
}
23+
static constexpr T val = 42;
24+
static constexpr uint32_t global_size = 1e7;
25+
std::vector<T> host_vec;
26+
void *device_ptr = nullptr;
27+
static constexpr size_t allocation_size = sizeof(val) * global_size;
28+
};
29+
30+
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEnqueueNativeCommandTest);
31+
32+
struct InteropData1 {
33+
void *fill_ptr;
34+
};
35+
36+
// Fill a device ptr with the pattern val
37+
void interop_func_1(ur_queue_handle_t hQueue, void *data) {
38+
ze_command_list_handle_t CommandList;
39+
ASSERT_SUCCESS(urQueueGetNativeHandle(hQueue, nullptr,
40+
(ur_native_handle_t *)&CommandList));
41+
InteropData1 *func_data = reinterpret_cast<InteropData1 *>(data);
42+
43+
// If L0 interop becomes a real use case we should make a new UR entry point
44+
// to propagate events into and out of the interop func.
45+
zeCommandListAppendMemoryFill(
46+
CommandList, func_data->fill_ptr,
47+
&urLevelZeroEnqueueNativeCommandTest::val,
48+
sizeof(urLevelZeroEnqueueNativeCommandTest::val),
49+
urLevelZeroEnqueueNativeCommandTest::allocation_size, nullptr, 0,
50+
nullptr);
51+
}
52+
53+
struct InteropData2 {
54+
void *from, *to;
55+
};
56+
57+
// Read from device ptr to host ptr
58+
void interop_func_2(ur_queue_handle_t hQueue, void *data) {
59+
ze_command_list_handle_t CommandList;
60+
ASSERT_SUCCESS(urQueueGetNativeHandle(hQueue, nullptr,
61+
(ur_native_handle_t *)&CommandList));
62+
InteropData2 *func_data = reinterpret_cast<InteropData2 *>(data);
63+
64+
// If L0 interop becomes a real use case we should make a new UR entry point
65+
// to propagate events into and out of the interop func.
66+
zeCommandListAppendMemoryCopy(
67+
CommandList, func_data->to, func_data->from,
68+
urLevelZeroEnqueueNativeCommandTest::allocation_size, nullptr, 0,
69+
nullptr);
70+
}
71+
72+
TEST_P(urLevelZeroEnqueueNativeCommandTest, Success) {
73+
InteropData1 data_1{device_ptr};
74+
ur_event_handle_t event_1;
75+
ASSERT_SUCCESS(urEnqueueNativeCommandExp(
76+
queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
77+
nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));
78+
}
79+
80+
TEST_P(urLevelZeroEnqueueNativeCommandTest, Dependencies) {
81+
ur_event_handle_t event_1, event_2;
82+
83+
InteropData1 data_1{device_ptr};
84+
ASSERT_SUCCESS(urEnqueueNativeCommandExp(
85+
queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
86+
nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));
87+
88+
InteropData2 data_2{device_ptr, host_vec.data()};
89+
ASSERT_SUCCESS(urEnqueueNativeCommandExp(
90+
queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/,
91+
nullptr /*pProperties=*/, 1, &event_1, &event_2));
92+
urQueueFinish(queue);
93+
for (auto &i : host_vec) {
94+
ASSERT_EQ(i, val);
95+
}
96+
}
97+
98+
TEST_P(urLevelZeroEnqueueNativeCommandTest, DependenciesURBefore) {
99+
ur_event_handle_t event_1, event_2;
100+
101+
ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptr, sizeof(val), &val,
102+
allocation_size, 0,
103+
nullptr /*phEventWaitList=*/, &event_1));
104+
105+
InteropData2 data_2{device_ptr, host_vec.data()};
106+
ASSERT_SUCCESS(urEnqueueNativeCommandExp(
107+
queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/,
108+
nullptr /*pProperties=*/, 1, &event_1, &event_2));
109+
urQueueFinish(queue);
110+
for (auto &i : host_vec) {
111+
ASSERT_EQ(i, val);
112+
}
113+
}
114+
115+
TEST_P(urLevelZeroEnqueueNativeCommandTest, DependenciesURAfter) {
116+
ur_event_handle_t event_1;
117+
118+
InteropData1 data_1{device_ptr};
119+
ASSERT_SUCCESS(urEnqueueNativeCommandExp(
120+
queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
121+
nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));
122+
123+
urEnqueueUSMMemcpy(queue, /*blocking*/ true, host_vec.data(), device_ptr,
124+
allocation_size, 1, &event_1, nullptr);
125+
for (auto &i : host_vec) {
126+
ASSERT_EQ(i, val);
127+
}
128+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
urLevelZeroEnqueueNativeCommandTest.Success{{.*}}
2+
urLevelZeroEnqueueNativeCommandTest.Dependencies{{.*}}
3+
urLevelZeroEnqueueNativeCommandTest.DependenciesURBefore{{.*}}
4+
urLevelZeroEnqueueNativeCommandTest.DependenciesURAfter{{.*}}

test/conformance/exp_enqueue_native/exp_enqueue_native_adapter_level_zero.match

Whitespace-only changes.

0 commit comments

Comments
 (0)