Skip to content

Commit f2afed2

Browse files
author
Hugh Delaney
committed
Try L0 impl for enqueue native command
Draft impl for discussion.
1 parent 4763308 commit f2afed2

File tree

3 files changed

+195
-4
lines changed

3 files changed

+195
-4
lines changed

source/adapters/level_zero/enqueue_native.cpp

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,57 @@
1010

1111
#include <ur_api.h>
1212

13+
#include "logger/ur_logger.hpp"
1314
#include "queue.hpp"
15+
#include "ur_level_zero.hpp"
1416

1517
// Draft Level Zero implementation of the experimental native-command enqueue.
// The removed lines (marked "-") are the old stub that ignored every argument
// and returned UR_RESULT_ERROR_UNSUPPORTED_FEATURE. The added lines (marked
// "+") build a wait list and an output event, then drain the queue, invoke the
// user callback, and drain the queue again before returning
// UR_RESULT_SUCCESS.
//
// Parameters of the new version:
//   pfnNativeEnqueue    - user callback, invoked with this queue and `data`.
//   data                - opaque pointer forwarded unchanged to the callback.
//   (unnamed uint32_t / ur_mem_handle_t* / properties*) - currently unused; see
//                         the phMemList TODO below.
//   NumEventsInWaitList / phEventWaitList - events this command depends on.
//   phEvent             - optional output event; may be null.
ur_result_t ur_queue_handle_legacy_t_::enqueueNativeCommandExp(
16-
ur_exp_enqueue_native_command_function_t, void *, uint32_t,
17-
const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *,
18-
uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
19-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
18+
ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data,
19+
uint32_t, const ur_mem_handle_t *,
20+
const ur_exp_enqueue_native_command_properties_t *,
21+
uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList,
22+
ur_event_handle_t *phEvent) {
23+
auto Queue = this;
24+
25+
// TODO: Do I need this lock?
// NOTE(review): the lock is held until the function returns, i.e. across both
// queueFinish() calls and the user callback. Confirm queueFinish() does not
// try to acquire Queue->Mutex itself, which would self-deadlock.
26+
std::scoped_lock<ur_shared_mutex> Lock(Queue->Mutex);
27+
28+
// TODO: What do I need to do with phMemList? Will a ur_mem_handle_t always
29+
// be usable as a native arg from within pfnNativeEnqueue, or should some
30+
// mem migration happen?
31+
32+
// The compute engine is always requested; the same flag is forwarded to every
// helper below.
bool UseCopyEngine = false;
33+
_ur_ze_event_list_t TmpWaitList;
34+
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
35+
NumEventsInWaitList, phEventWaitList, Queue, UseCopyEngine));
36+
37+
// Get a new command list to be used on this call
38+
ur_command_list_ptr_t CommandList{};
39+
UR_CALL(Queue->Context->getAvailableCommandList(
40+
Queue, CommandList, UseCopyEngine, NumEventsInWaitList, phEventWaitList,
41+
true /* AllowBatching */));
42+
43+
ze_event_handle_t ZeEvent = nullptr;
44+
ur_event_handle_t InternalEvent{};
45+
// When the caller passes no output event, a local handle is used instead and
// the event is created with the "internal" flag set.
bool IsInternal = phEvent == nullptr;
46+
ur_event_handle_t *Event = phEvent ? phEvent : &InternalEvent;
47+
48+
UR_CALL(createEventAndAssociateQueue(Queue, Event,
49+
UR_COMMAND_ENQUEUE_NATIVE_EXP,
50+
CommandList, IsInternal, false));
51+
UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
52+
NumEventsInWaitList, phEventWaitList,
53+
CommandList->second.ZeQueue));
54+
// Hand the wait list built above over to the event object.
(*Event)->WaitList = TmpWaitList;
55+
56+
// FIXME: blocking synchronization. Make this faster
// NOTE(review): the result of queueFinish() is discarded here and below,
// unlike the UR_CALL-wrapped calls above - confirm whether failures should
// be propagated.
57+
Queue->queueFinish();
58+
59+
// Execute interop func
// NOTE(review): nothing visible in this function appends work to CommandList
// or submits it, and ZeEvent is never used after setSignalEvent() - confirm
// how the output event is expected to be signalled.
60+
pfnNativeEnqueue(Queue, data);
61+
62+
// FIXME: blocking synchronization. Make this faster
63+
Queue->queueFinish();
64+
65+
return UR_RESULT_SUCCESS;
2066
}

test/conformance/exp_enqueue_native/CMakeLists.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,21 @@ if (UR_BUILD_ADAPTER_CUDA)
1515
target_link_libraries(test-exp_enqueue_native PRIVATE cudadrv)
1616
endif()
1717

18+
# Build the Level Zero flavour of the exp_enqueue_native conformance test when
# the L0 adapter is enabled.
# NOTE(review): this uses the same test name (exp_enqueue_native) as the target
# linked against cudadrv in the CUDA section above - confirm that enabling both
# adapters does not try to define test-exp_enqueue_native twice.
if (UR_BUILD_ADAPTER_L0)
19+
add_conformance_test_with_kernels_environment(
20+
exp_enqueue_native
21+
enqueue_native_level_zero.cpp
22+
)
23+
# The test calls Level Zero entry points directly, so it links the loader.
target_link_libraries(test-exp_enqueue_native PRIVATE
24+
LevelZeroLoader
25+
LevelZeroLoader-Headers
26+
)
27+
28+
# NOTE(review): LevelZeroLoader-Headers looks like a target name rather than a
# directory path; it is already linked above - confirm whether listing it as an
# include directory is intended.
target_include_directories(test-exp_enqueue_native PRIVATE
29+
${PROJECT_SOURCE_DIR}/source
30+
${PROJECT_SOURCE_DIR}/source/adapters/level_zero
31+
LevelZeroLoader-Headers
32+
)
33+
endif()
34+
1835
# TODO: Add more tests for different triples
test/conformance/exp_enqueue_native/enqueue_native_level_zero.cpp

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
// Copyright (C) 2024 Intel Corporation
2+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
3+
// See LICENSE.TXT
4+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5+
6+
#include "ze_api.h"
7+
8+
#include <uur/fixtures.h>
9+
#include <vector>
10+
11+
// Element type of the host vector and of the fill pattern used by the tests
// in this file.
using T = uint32_t;
12+
13+
struct urLevelZeroEnqueueNativeCommandTest : uur::urQueueTest {
14+
void SetUp() {
15+
UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::SetUp());
16+
17+
host_vec = std::vector<T>(global_size, 0);
18+
ASSERT_EQ(host_vec.size(), global_size);
19+
ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr,
20+
allocation_size, &device_ptr));
21+
ASSERT_NE(device_ptr, nullptr);
22+
}
23+
static constexpr T val = 42;
24+
static constexpr uint32_t global_size = 1e7;
25+
std::vector<T> host_vec;
26+
void *device_ptr = nullptr;
27+
static constexpr size_t allocation_size = sizeof(val) * global_size;
28+
};
29+
30+
// Instantiates the TEST_P cases below for this fixture.
// NOTE(review): presumably once per device under test - confirm against the
// macro definition in the uur fixtures header.
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEnqueueNativeCommandTest);
31+
32+
// Payload handed to interop_func_1 through the opaque `data` pointer.
struct InteropData1 {
    // Destination of the fill; null until the caller sets it.
    void *fill_ptr = nullptr;
};
35+
36+
// Fill a device ptr with the pattern val
37+
void interop_func_1(ur_queue_handle_t hQueue, void *data) {
38+
ze_command_list_handle_t CommandList;
39+
ASSERT_SUCCESS(urQueueGetNativeHandle(hQueue, nullptr,
40+
(ur_native_handle_t *)&CommandList));
41+
InteropData1 *func_data = reinterpret_cast<InteropData1 *>(data);
42+
43+
// If L0 interop becomes a real use case we should make a new UR entry point
44+
// to propagate events into and out of the the interop func.
45+
zeCommandListAppendMemoryFill(
46+
CommandList, func_data->fill_ptr,
47+
&urLevelZeroEnqueueNativeCommandTest::val,
48+
sizeof(urLevelZeroEnqueueNativeCommandTest::val),
49+
urLevelZeroEnqueueNativeCommandTest::allocation_size, nullptr, 0,
50+
nullptr);
51+
}
52+
53+
// Payload handed to interop_func_2 through the opaque `data` pointer.
// Member order matters: callers use positional aggregate initialisation
// `{from, to}`.
struct InteropData2 {
    // Source of the copy.
    void *from = nullptr;
    // Destination of the copy.
    void *to = nullptr;
};
56+
57+
// Read from device ptr to host ptr
58+
void interop_func_2(ur_queue_handle_t hQueue, void *data) {
59+
ze_command_list_handle_t CommandList;
60+
ASSERT_SUCCESS(urQueueGetNativeHandle(hQueue, nullptr,
61+
(ur_native_handle_t *)&CommandList));
62+
InteropData2 *func_data = reinterpret_cast<InteropData2 *>(data);
63+
64+
// If L0 interop becomes a real use case we should make a new UR entry point
65+
// to propagate events into and out of the the interop func.
66+
zeCommandListAppendMemoryCopy(
67+
CommandList, func_data->to, func_data->from,
68+
urLevelZeroEnqueueNativeCommandTest::allocation_size, nullptr, 0,
69+
nullptr);
70+
}
71+
72+
// A single native fill command with no dependencies enqueues successfully.
TEST_P(urLevelZeroEnqueueNativeCommandTest, Success) {
    InteropData1 data_1{device_ptr};
    ur_event_handle_t event_1 = nullptr;
    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
        queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
        nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));

    // Wait for the command so `data_1` and the device buffer are no longer in
    // use, then release the event so it is not leaked.
    ASSERT_SUCCESS(urQueueFinish(queue));
    ASSERT_SUCCESS(urEventRelease(event_1));
}
79+
80+
// Two native commands chained by an event: a device fill followed by a
// read-back into host memory that waits on the fill.
TEST_P(urLevelZeroEnqueueNativeCommandTest, Dependencies) {
    ur_event_handle_t event_1 = nullptr;
    ur_event_handle_t event_2 = nullptr;

    InteropData1 data_1{device_ptr};
    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
        queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
        nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));

    InteropData2 data_2{device_ptr, host_vec.data()};
    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
        queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/,
        nullptr /*pProperties=*/, 1, &event_1, &event_2));
    ASSERT_SUCCESS(urQueueFinish(queue));

    // Both events are owned by the test; release them so they are not leaked.
    ASSERT_SUCCESS(urEventRelease(event_1));
    ASSERT_SUCCESS(urEventRelease(event_2));

    for (const auto &element : host_vec) {
        ASSERT_EQ(element, val);
    }
}
97+
98+
// A regular UR fill followed by a native read-back that waits on the fill's
// event.
TEST_P(urLevelZeroEnqueueNativeCommandTest, DependenciesURBefore) {
    ur_event_handle_t event_1 = nullptr;
    ur_event_handle_t event_2 = nullptr;

    ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptr, sizeof(val), &val,
                                    allocation_size, 0,
                                    nullptr /*phEventWaitList=*/, &event_1));

    InteropData2 data_2{device_ptr, host_vec.data()};
    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
        queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/,
        nullptr /*pProperties=*/, 1, &event_1, &event_2));
    ASSERT_SUCCESS(urQueueFinish(queue));

    // Both events are owned by the test; release them so they are not leaked.
    ASSERT_SUCCESS(urEventRelease(event_1));
    ASSERT_SUCCESS(urEventRelease(event_2));

    for (const auto &element : host_vec) {
        ASSERT_EQ(element, val);
    }
}
114+
115+
// A native fill followed by a blocking UR memcpy back to the host that waits
// on the native command's event.
TEST_P(urLevelZeroEnqueueNativeCommandTest, DependenciesURAfter) {
    ur_event_handle_t event_1 = nullptr;

    InteropData1 data_1{device_ptr};
    ASSERT_SUCCESS(urEnqueueNativeCommandExp(
        queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/,
        nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1));

    // Check the copy's result so a failed read-back is reported as such and
    // not as a data mismatch in the loop below.
    ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, /*blocking*/ true,
                                      host_vec.data(), device_ptr,
                                      allocation_size, 1, &event_1, nullptr));

    // The event is owned by the test; release it so it is not leaked.
    ASSERT_SUCCESS(urEventRelease(event_1));

    for (const auto &element : host_vec) {
        ASSERT_EQ(element, val);
    }
}

0 commit comments

Comments
 (0)