Skip to content

Commit 507001c

Browse files
authored
[UR][L0] Fix L0 teardown checks for stability (#17818)
- Address the race conditions with L0 Loader teardown timing so that L0 teardown is verified before handle destruction in all cases, using an L0 loader API to verify stability. --------- Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
1 parent 84fac44 commit 507001c

24 files changed

+143
-168
lines changed

unified-runtime/cmake/FetchLevelZero.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR)
4343
set(UR_LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git")
4444
endif()
4545
if (UR_LEVEL_ZERO_LOADER_TAG STREQUAL "")
46-
set(UR_LEVEL_ZERO_LOADER_TAG v1.21.1)
46+
set(UR_LEVEL_ZERO_LOADER_TAG ecfe375b30cc04265b20ac1b7996a85d0910f3ed)
4747
endif()
4848

4949
# Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104

unified-runtime/source/adapters/level_zero/command_buffer.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -447,16 +447,16 @@ void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() {
447447

448448
// Release the memory allocated to the CommandList stored in the
449449
// command_buffer
450-
if (ZeComputeCommandList) {
450+
if (ZeComputeCommandList && checkL0LoaderTeardown()) {
451451
ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeComputeCommandList));
452452
}
453-
if (useCopyEngine() && ZeCopyCommandList) {
453+
if (useCopyEngine() && ZeCopyCommandList && checkL0LoaderTeardown()) {
454454
ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCopyCommandList));
455455
}
456456

457457
// Release the memory allocated to the CommandListResetEvents stored in the
458458
// command_buffer
459-
if (ZeCommandListResetEvents) {
459+
if (ZeCommandListResetEvents && checkL0LoaderTeardown()) {
460460
ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListResetEvents));
461461
}
462462

@@ -504,7 +504,9 @@ void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() {
504504
// Release fences allocated to command-buffer
505505
for (auto &ZeFencePair : ZeFencesMap) {
506506
auto &ZeFence = ZeFencePair.second;
507-
ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence));
507+
if (checkL0LoaderTeardown()) {
508+
ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence));
509+
}
508510
}
509511

510512
auto ReleaseIndirectMem = [](ur_kernel_handle_t Kernel) {

unified-runtime/source/adapters/level_zero/common.hpp

Lines changed: 8 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <unistd.h>
2626
#endif
2727

28+
#include <loader/ze_loader.h>
2829
#include <ur/ur.hpp>
2930
#include <ur_ddi.h>
3031
#include <ze_api.h>
@@ -38,65 +39,15 @@
3839
struct _ur_platform_handle_t;
3940

4041
[[maybe_unused]] static bool checkL0LoaderTeardown() {
41-
bool loaderStable = true;
42-
#ifdef _WIN32
43-
uint32_t ZeDriverCount = 0;
44-
HMODULE zeLoader = LoadLibrary("ze_loader.dll");
45-
if (zeLoader) {
46-
typedef ze_result_t (*zeDriverGet_t)(uint32_t *, ze_driver_handle_t *);
47-
zeDriverGet_t zeDriverGetLoader =
48-
(zeDriverGet_t)GetProcAddress(zeLoader, "zeDriverGet");
49-
if (zeDriverGetLoader) {
50-
ze_result_t result = zeDriverGetLoader(&ZeDriverCount, nullptr);
51-
logger::debug(
52-
"ZE ---> checkL0LoaderTeardown result = {} driver count = {}", result,
53-
ZeDriverCount);
54-
if (result != ZE_RESULT_SUCCESS || ZeDriverCount == 0) {
55-
loaderStable = false;
56-
}
57-
} else {
58-
logger::debug("ZE ---> checkL0LoaderTeardown: Failed to get address of "
59-
"zeDriverGet");
60-
loaderStable = false;
61-
}
62-
FreeLibrary(zeLoader);
63-
} else {
64-
logger::debug(
65-
"ZE ---> checkL0LoaderTeardown: Failed to load ze_loader.dll");
66-
loaderStable = false;
67-
}
68-
#else
69-
uint32_t ZeDriverCount = 0;
70-
void *zeLoader = dlopen("libze_loader.so.1", RTLD_LAZY);
71-
if (zeLoader) {
72-
typedef ze_result_t (*zeDriverGet_t)(uint32_t *, ze_driver_handle_t *);
73-
zeDriverGet_t zeDriverGetLoader =
74-
(zeDriverGet_t)dlsym(zeLoader, "zeDriverGet");
75-
if (zeDriverGetLoader) {
76-
ze_result_t result = zeDriverGetLoader(&ZeDriverCount, nullptr);
77-
logger::debug(
78-
"ZE ---> checkL0LoaderTeardown result = {} driver count = {}", result,
79-
ZeDriverCount);
80-
if (result != ZE_RESULT_SUCCESS || ZeDriverCount == 0) {
81-
loaderStable = false;
82-
}
83-
} else {
84-
logger::debug("ZE ---> checkL0LoaderTeardown: Failed to get address of "
85-
"zeDriverGet");
86-
loaderStable = false;
42+
try {
43+
if (!zelCheckIsLoaderInTearDown()) {
44+
return true;
8745
}
88-
dlclose(zeLoader);
89-
} else {
90-
logger::debug(
91-
"ZE ---> checkL0LoaderTeardown: Failed to load libze_loader.so.1");
92-
loaderStable = false;
46+
} catch (...) {
9347
}
94-
#endif
95-
if (!loaderStable) {
96-
logger::debug(
97-
"ZE ---> checkL0LoaderTeardown: Loader is not stable, returning false");
98-
}
99-
return loaderStable;
48+
logger::debug(
49+
"ZE ---> checkL0LoaderTeardown: Loader is in teardown or is unstable");
50+
return false;
10051
}
10152

10253
// Controls UR L0 calls tracing.
@@ -329,9 +280,6 @@ struct _ur_object {
329280
// Indicates if we own the native handle or it came from interop that
330281
// asked to not transfer the ownership to SYCL RT.
331282
bool OwnNativeHandle = false;
332-
333-
// Indicates if this object is an interop handle.
334-
bool IsInteropNativeHandle = false;
335283
};
336284

337285
// Record for a memory allocation. This structure is used to keep information

unified-runtime/source/adapters/level_zero/context.cpp

Lines changed: 46 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,6 @@ ur_result_t urContextCreateWithNativeHandle(
152152
ur_context_handle_t_ *UrContext = new ur_context_handle_t_(
153153
ZeContext, NumDevices, Devices, OwnNativeHandle);
154154
UrContext->initialize();
155-
UrContext->IsInteropNativeHandle = true;
156155
*Context = reinterpret_cast<ur_context_handle_t>(UrContext);
157156
} catch (const std::bad_alloc &) {
158157
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
@@ -265,11 +264,8 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) {
265264
Contexts.erase(It);
266265
}
267266
ze_context_handle_t DestroyZeContext =
268-
((Context->OwnNativeHandle && !Context->IsInteropNativeHandle) ||
269-
(Context->OwnNativeHandle && Context->IsInteropNativeHandle &&
270-
checkL0LoaderTeardown()))
271-
? Context->ZeContext
272-
: nullptr;
267+
(Context->OwnNativeHandle && checkL0LoaderTeardown()) ? Context->ZeContext
268+
: nullptr;
273269

274270
// Clean up any live memory associated with Context
275271
ur_result_t Result = Context->finalize();
@@ -286,8 +282,12 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) {
286282
if (DestroyZeContext) {
287283
auto ZeResult = ZE_CALL_NOCHECK(zeContextDestroy, (DestroyZeContext));
288284
// Gracefully handle the case that L0 was already unloaded.
289-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
285+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
286+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
290287
return ze2urResult(ZeResult);
288+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
289+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
290+
}
291291
}
292292

293293
return Result;
@@ -308,12 +308,15 @@ ur_result_t ur_context_handle_t_::finalize() {
308308
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
309309
for (auto &EventCache : EventCaches) {
310310
for (auto &Event : EventCache) {
311-
if (!Event->IsInteropNativeHandle ||
312-
(Event->IsInteropNativeHandle && checkL0LoaderTeardown())) {
311+
if (checkL0LoaderTeardown()) {
313312
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
314313
// Gracefully handle the case that L0 was already unloaded.
315-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
314+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
315+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
316316
return ze2urResult(ZeResult);
317+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
318+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
319+
}
317320
}
318321
Event->ZeEvent = nullptr;
319322
delete Event;
@@ -325,41 +328,61 @@ ur_result_t ur_context_handle_t_::finalize() {
325328
std::scoped_lock<ur_mutex> Lock(ZeEventPoolCacheMutex);
326329
for (auto &ZePoolCache : ZeEventPoolCache) {
327330
for (auto &ZePool : ZePoolCache) {
328-
auto ZeResult = ZE_CALL_NOCHECK(zeEventPoolDestroy, (ZePool));
329-
// Gracefully handle the case that L0 was already unloaded.
330-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
331-
return ze2urResult(ZeResult);
331+
if (checkL0LoaderTeardown()) {
332+
auto ZeResult = ZE_CALL_NOCHECK(zeEventPoolDestroy, (ZePool));
333+
// Gracefully handle the case that L0 was already unloaded.
334+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
335+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
336+
return ze2urResult(ZeResult);
337+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
338+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
339+
}
340+
}
332341
}
333342
ZePoolCache.clear();
334343
}
335344
}
336345

337-
// Destroy the command list used for initializations
338-
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListInit));
339-
// Gracefully handle the case that L0 was already unloaded.
340-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
341-
return ze2urResult(ZeResult);
346+
if (checkL0LoaderTeardown()) {
347+
// Destroy the command list used for initializations
348+
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListInit));
349+
// Gracefully handle the case that L0 was already unloaded.
350+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
351+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
352+
return ze2urResult(ZeResult);
353+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
354+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
355+
}
356+
}
342357

343358
std::scoped_lock<ur_mutex> Lock(ZeCommandListCacheMutex);
344359
for (auto &List : ZeComputeCommandListCache) {
345360
for (auto &Item : List.second) {
346361
ze_command_list_handle_t ZeCommandList = Item.first;
347-
if (ZeCommandList) {
362+
if (ZeCommandList && checkL0LoaderTeardown()) {
348363
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
349364
// Gracefully handle the case that L0 was already unloaded.
350-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
365+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
366+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
351367
return ze2urResult(ZeResult);
368+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
369+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
370+
}
352371
}
353372
}
354373
}
355374
for (auto &List : ZeCopyCommandListCache) {
356375
for (auto &Item : List.second) {
357376
ze_command_list_handle_t ZeCommandList = Item.first;
358-
if (ZeCommandList) {
377+
if (ZeCommandList && checkL0LoaderTeardown()) {
359378
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
360379
// Gracefully handle the case that L0 was already unloaded.
361-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
380+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
381+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
362382
return ze2urResult(ZeResult);
383+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
384+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
385+
}
363386
}
364387
}
365388
}

unified-runtime/source/adapters/level_zero/device.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1536,7 +1536,6 @@ ur_result_t urDeviceCreateWithNativeHandle(
15361536
if (Dev == nullptr)
15371537
return UR_RESULT_ERROR_INVALID_VALUE;
15381538

1539-
Dev->IsInteropNativeHandle = true;
15401539
*Device = Dev;
15411540
return UR_RESULT_SUCCESS;
15421541
}

unified-runtime/source/adapters/level_zero/event.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,6 @@ ur_result_t urEventCreateWithNativeHandle(
985985
UREvent->CleanedUp = true;
986986

987987
*Event = reinterpret_cast<ur_event_handle_t>(UREvent);
988-
UREvent->IsInteropNativeHandle = true;
989988

990989
return UR_RESULT_SUCCESS;
991990
}
@@ -1074,7 +1073,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
10741073
* leaks or resource mismanagement.
10751074
*/
10761075
ur_event_handle_t_::~ur_event_handle_t_() {
1077-
if (this->ZeEvent && this->Completed) {
1076+
if (this->ZeEvent && this->Completed && checkL0LoaderTeardown()) {
10781077
if (this->UrQueue && !this->UrQueue->isDiscardEvents())
10791078
ZE_CALL_NOCHECK(zeEventDestroy, (this->ZeEvent));
10801079
}
@@ -1105,12 +1104,15 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
11051104
}
11061105
if (Event->OwnNativeHandle) {
11071106
if (DisableEventsCaching) {
1108-
if (!Event->IsInteropNativeHandle ||
1109-
(Event->IsInteropNativeHandle && checkL0LoaderTeardown())) {
1107+
if (checkL0LoaderTeardown()) {
11101108
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
11111109
// Gracefully handle the case that L0 was already unloaded.
1112-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
1110+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
1111+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
11131112
return ze2urResult(ZeResult);
1113+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
1114+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
1115+
}
11141116
}
11151117
Event->ZeEvent = nullptr;
11161118
auto Context = Event->Context;

unified-runtime/source/adapters/level_zero/image.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,9 @@ ur_result_t urBindlessImagesUnsampledImageHandleDestroyExp(
313313
auto item = hDevice->ZeOffsetToImageHandleMap.find(hImage);
314314

315315
if (item != hDevice->ZeOffsetToImageHandleMap.end()) {
316-
ZE2UR_CALL(zeImageDestroy, (item->second));
316+
if (checkL0LoaderTeardown()) {
317+
ZE2UR_CALL(zeImageDestroy, (item->second));
318+
}
317319
hDevice->ZeOffsetToImageHandleMap.erase(item);
318320
} else {
319321
return UR_RESULT_ERROR_INVALID_NULL_HANDLE;

unified-runtime/source/adapters/level_zero/kernel.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -940,12 +940,15 @@ ur_result_t urKernelRelease(
940940
auto KernelProgram = Kernel->Program;
941941
if (Kernel->OwnNativeHandle) {
942942
for (auto &ZeKernel : Kernel->ZeKernels) {
943-
if (!Kernel->IsInteropNativeHandle ||
944-
(Kernel->IsInteropNativeHandle && checkL0LoaderTeardown())) {
943+
if (checkL0LoaderTeardown()) {
945944
auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (ZeKernel));
946945
// Gracefully handle the case that L0 was already unloaded.
947-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
946+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
947+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
948948
return ze2urResult(ZeResult);
949+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
950+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
951+
}
949952
}
950953
}
951954
}
@@ -1157,7 +1160,6 @@ ur_result_t urKernelCreateWithNativeHandle(
11571160
}
11581161

11591162
Kernel->Program = Program;
1160-
Kernel->IsInteropNativeHandle = true;
11611163

11621164
UR_CALL(Kernel->initialize());
11631165

unified-runtime/source/adapters/level_zero/memory.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1563,7 +1563,6 @@ ur_result_t urMemImageCreateWithNativeHandle(
15631563
auto OwnNativeHandle = Properties ? Properties->isNativeHandleOwned : false;
15641564
UR_CALL(createUrMemFromZeImage(Context, ZeHImage, OwnNativeHandle,
15651565
ZeImageDesc, Mem));
1566-
(*Mem)->IsInteropNativeHandle = true;
15671566

15681567
return UR_RESULT_SUCCESS;
15691568
}
@@ -1663,13 +1662,16 @@ ur_result_t urMemRelease(
16631662
if (Image->OwnNativeHandle) {
16641663
UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only,
16651664
nullptr, nullptr, 0u));
1666-
if (!Image->IsInteropNativeHandle ||
1667-
(Image->IsInteropNativeHandle && checkL0LoaderTeardown())) {
1665+
if (checkL0LoaderTeardown()) {
16681666
auto ZeResult = ZE_CALL_NOCHECK(
16691667
zeImageDestroy, (ur_cast<ze_image_handle_t>(ZeHandleImage)));
16701668
// Gracefully handle the case that L0 was already unloaded.
1671-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
1669+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
1670+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
16721671
return ze2urResult(ZeResult);
1672+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
1673+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
1674+
}
16731675
}
16741676
}
16751677
delete Image;
@@ -1776,7 +1778,6 @@ ur_result_t urMemBufferCreateWithNativeHandle(
17761778
Buffer = new _ur_buffer(Context, Size, Device, ur_cast<char *>(NativeMem),
17771779
OwnNativeHandle);
17781780
*Mem = reinterpret_cast<ur_mem_handle_t>(Buffer);
1779-
(*Mem)->IsInteropNativeHandle = true;
17801781
} catch (const std::bad_alloc &) {
17811782
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
17821783
} catch (...) {

unified-runtime/source/adapters/level_zero/physical_mem.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,10 @@ ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) {
5050
if (!hPhysicalMem->RefCount.decrementAndTest())
5151
return UR_RESULT_SUCCESS;
5252

53-
ZE2UR_CALL(zePhysicalMemDestroy, (hPhysicalMem->Context->getZeHandle(),
54-
hPhysicalMem->ZePhysicalMem));
53+
if (checkL0LoaderTeardown()) {
54+
ZE2UR_CALL(zePhysicalMemDestroy, (hPhysicalMem->Context->getZeHandle(),
55+
hPhysicalMem->ZePhysicalMem));
56+
}
5557
delete hPhysicalMem;
5658

5759
return UR_RESULT_SUCCESS;

0 commit comments

Comments
 (0)