Skip to content

Commit 3984e84

Browse files
committed
POC: use UMF CUDA provider
Signed-off-by: Lukasz Dorau <lukasz.dorau@intel.com>
1 parent 68aed2d commit 3984e84

File tree

5 files changed

+129
-11
lines changed

5 files changed

+129
-11
lines changed

source/adapters/cuda/device.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111

1212
#include <ur/ur.hpp>
1313

14+
#include <umf/providers/provider_cuda.h>
15+
1416
#include "common.hpp"
1517

1618
struct ur_device_handle_t_ {
@@ -23,6 +25,7 @@ struct ur_device_handle_t_ {
2325
std::atomic_uint32_t RefCount;
2426
ur_platform_handle_t Platform;
2527
uint32_t DeviceIndex;
28+
umf_memory_provider_handle_t umfCUDAprovider[UMF_MEMORY_TYPE_SHARED];
2629

2730
static constexpr uint32_t MaxWorkItemDimensions = 3u;
2831
size_t MaxWorkItemSizes[MaxWorkItemDimensions];
@@ -115,6 +118,16 @@ struct ur_device_handle_t_ {
115118

116119
uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; };
117120

121+
/// Stores the UMF CUDA memory provider handle associated with a USM memory
/// type on this device.
///
/// Providers are kept in the `umfCUDAprovider` member array, indexed by
/// `memType - 1`.
///
/// @param memType          USM memory type the provider serves.
/// @param _umfCUDAprovider Provider handle to remember for that memory type.
///
/// A `memType` that does not map to a slot of the array (0, or a value past
/// the last slot) is ignored instead of writing outside the array.
void setUmfCUDAprovider(umf_usm_memory_type_t memType,
                        umf_memory_provider_handle_t _umfCUDAprovider) {
  constexpr size_t NumProviders =
      sizeof(umfCUDAprovider) / sizeof(umfCUDAprovider[0]);

  // Unsigned arithmetic on purpose: a memType of 0 wraps around to a huge
  // value, so the single comparison below rejects both "too small" and
  // "too large".
  const size_t Index = static_cast<size_t>(memType) - 1;
  if (Index >= NumProviders) {
    return;
  }

  umfCUDAprovider[Index] = _umfCUDAprovider;
}
125+
126+
/// Looks up the UMF CUDA memory provider handle stored for a USM memory type
/// on this device.
///
/// Providers are kept in the `umfCUDAprovider` member array, indexed by
/// `memType - 1`.
///
/// @param memType USM memory type whose provider is requested.
/// @return The handle stored in the slot for `memType`, or `nullptr` when
///         `memType` does not map to a slot of the array (0, or a value past
///         the last slot).
umf_memory_provider_handle_t
getUmfCUDAprovider(umf_usm_memory_type_t memType) {
  constexpr size_t NumProviders =
      sizeof(umfCUDAprovider) / sizeof(umfCUDAprovider[0]);

  // Unsigned arithmetic on purpose: a memType of 0 wraps around to a huge
  // value, so the single comparison below rejects both "too small" and
  // "too large".
  const size_t Index = static_cast<size_t>(memType) - 1;
  if (Index >= NumProviders) {
    return nullptr;
  }

  return umfCUDAprovider[Index];
}
130+
118131
// bookkeeping for mipmappedArray leaks in Mapping external Memory
119132
std::map<CUarray, CUmipmappedArray> ChildCuarrayFromMipmapMap;
120133
};

source/adapters/cuda/memory.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "context.hpp"
1515
#include "enqueue.hpp"
1616
#include "memory.hpp"
17+
#include "umf_helpers.hpp"
1718

1819
/// Creates a UR Memory object using a CUDA memory allocation.
1920
/// Can trigger a manual copy depending on the mode.
@@ -49,7 +50,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
4950
cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP));
5051
AllocMode = BufferMem::AllocMode::UseHostPtr;
5152
} else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) {
52-
UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size));
53+
// UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size));
54+
std::vector<ur_device_handle_t> Devices = hContext->getDevices();
55+
ur_device_handle_t Device0 = Devices[0];
56+
umf_memory_provider_handle_t umfCUDAprovider =
57+
Device0->getUmfCUDAprovider(
58+
umf_usm_memory_type_t::UMF_MEMORY_TYPE_HOST);
59+
umf_result_t umf_result =
60+
umfMemoryProviderAlloc(umfCUDAprovider, size, 0, &HostPtr);
61+
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
5362
AllocMode = BufferMem::AllocMode::AllocHostPtr;
5463
} else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) {
5564
AllocMode = BufferMem::AllocMode::CopyIn;
@@ -440,7 +449,13 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
440449
CU_MEMHOSTALLOC_DEVICEMAP));
441450
UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0));
442451
} else {
443-
UR_CHECK_ERROR(cuMemAlloc(&DevPtr, Buffer.Size));
452+
// UR_CHECK_ERROR(cuMemAlloc(&DevPtr, Buffer.Size));
453+
umf_memory_provider_handle_t umfCUDAprovider =
454+
hDevice->getUmfCUDAprovider(
455+
umf_usm_memory_type_t::UMF_MEMORY_TYPE_DEVICE);
456+
umf_result_t umf_result = umfMemoryProviderAlloc(
457+
umfCUDAprovider, Buffer.Size, 0, (void **)&DevPtr);
458+
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
444459
}
445460
} else {
446461
CUarray ImageArray{};

source/adapters/cuda/platform.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "common.hpp"
1414
#include "context.hpp"
1515
#include "device.hpp"
16+
#include "umf_helpers.hpp"
1617

1718
#include <cassert>
1819
#include <cuda.h>
@@ -115,6 +116,67 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
115116
Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
116117
throw;
117118
}
119+
120+
try {
121+
umf_cuda_memory_provider_params_handle_t cu_memory_provider_params =
122+
nullptr;
123+
umf_result_t umf_result =
124+
umfCUDAMemoryProviderParamsCreate(&cu_memory_provider_params);
125+
if (umf_result != UMF_RESULT_SUCCESS) {
126+
Result = umf::umf2urResult(umf_result);
127+
throw Result;
128+
}
129+
130+
for (int i = 0; i < NumDevices; ++i) {
131+
ur_device_handle_t_ *device_handle = Platform.Devices[i].get();
132+
CUdevice device = device_handle->get();
133+
CUcontext context = device_handle->getNativeContext();
134+
135+
for (int memType = UMF_MEMORY_TYPE_HOST;
136+
memType <= UMF_MEMORY_TYPE_SHARED; memType++) {
137+
umf_result = umfCUDAMemoryProviderParamsSetContext(
138+
cu_memory_provider_params, context);
139+
if (umf_result != UMF_RESULT_SUCCESS) {
140+
Result = umf::umf2urResult(umf_result);
141+
throw Result;
142+
}
143+
144+
umf_result = umfCUDAMemoryProviderParamsSetDevice(
145+
cu_memory_provider_params, device);
146+
if (umf_result != UMF_RESULT_SUCCESS) {
147+
Result = umf::umf2urResult(umf_result);
148+
throw Result;
149+
}
150+
151+
umf_result = umfCUDAMemoryProviderParamsSetMemoryType(
152+
cu_memory_provider_params, (umf_usm_memory_type_t)memType);
153+
if (umf_result != UMF_RESULT_SUCCESS) {
154+
Result = umf::umf2urResult(umf_result);
155+
throw Result;
156+
}
157+
158+
umf_memory_provider_handle_t umfCUDAprovider = nullptr;
159+
umf_result = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(),
160+
cu_memory_provider_params,
161+
&umfCUDAprovider);
162+
if (umf_result != UMF_RESULT_SUCCESS) {
163+
Result = umf::umf2urResult(umf_result);
164+
throw Result;
165+
}
166+
167+
device_handle->setUmfCUDAprovider(
168+
(umf_usm_memory_type_t)memType, umfCUDAprovider);
169+
}
170+
}
171+
172+
umfCUDAMemoryProviderParamsDestroy(cu_memory_provider_params);
173+
} catch (ur_result_t Err) {
174+
Result = Err;
175+
throw Err;
176+
} catch (...) {
177+
Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
178+
throw;
179+
}
118180
},
119181
Result);
120182

source/adapters/cuda/usm.cpp

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
102102
return UR_RESULT_SUCCESS;
103103
}
104104

105-
ur_result_t USMFreeImpl(ur_context_handle_t, void *Pointer) {
105+
ur_result_t USMFreeImpl(ur_context_handle_t hContext, void *Pointer) {
106106
ur_result_t Result = UR_RESULT_SUCCESS;
107107
try {
108108
unsigned int IsManaged;
@@ -114,14 +114,25 @@ ur_result_t USMFreeImpl(ur_context_handle_t, void *Pointer) {
114114
(CUdeviceptr)Pointer));
115115
UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST,
116116
UR_RESULT_ERROR_INVALID_MEM_OBJECT);
117+
118+
std::vector<ur_device_handle_t> Devices = hContext->getDevices();
119+
ur_device_handle_t Device0 = Devices[0];
120+
umf_memory_provider_handle_t umfCUDAprovider;
121+
117122
if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) {
118123
// Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
119124
// with cuMemFree
120-
UR_CHECK_ERROR(cuMemFree((CUdeviceptr)Pointer));
125+
umfCUDAprovider = Device0->getUmfCUDAprovider(
126+
umf_usm_memory_type_t::UMF_MEMORY_TYPE_DEVICE);
121127
} else {
122128
// Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
123-
UR_CHECK_ERROR(cuMemFreeHost(Pointer));
129+
umfCUDAprovider = Device0->getUmfCUDAprovider(
130+
umf_usm_memory_type_t::UMF_MEMORY_TYPE_HOST);
124131
}
132+
133+
umf_result_t umf_result = umfMemoryProviderFree(umfCUDAprovider, Pointer,
134+
0 /* size is unknown */);
135+
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
125136
} catch (ur_result_t Err) {
126137
Result = Err;
127138
}
@@ -143,7 +154,12 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t,
143154
uint32_t Alignment) {
144155
try {
145156
ScopedContext Active(Device);
146-
UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size));
157+
// UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size));
158+
umf_memory_provider_handle_t umfCUDAprovider = Device->getUmfCUDAprovider(
159+
umf_usm_memory_type_t::UMF_MEMORY_TYPE_DEVICE);
160+
umf_result_t umf_result =
161+
umfMemoryProviderAlloc(umfCUDAprovider, Size, Alignment, ResultPtr);
162+
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
147163
} catch (ur_result_t Err) {
148164
return Err;
149165
}
@@ -164,8 +180,13 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
164180
uint32_t Alignment) {
165181
try {
166182
ScopedContext Active(Device);
167-
UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size,
168-
CU_MEM_ATTACH_GLOBAL));
183+
// UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size,
184+
// CU_MEM_ATTACH_GLOBAL));
185+
umf_memory_provider_handle_t umfCUDAprovider = Device->getUmfCUDAprovider(
186+
umf_usm_memory_type_t::UMF_MEMORY_TYPE_SHARED);
187+
umf_result_t umf_result =
188+
umfMemoryProviderAlloc(umfCUDAprovider, Size, Alignment, ResultPtr);
189+
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
169190
} catch (ur_result_t Err) {
170191
return Err;
171192
}
@@ -179,11 +200,18 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
179200
return UR_RESULT_SUCCESS;
180201
}
181202

182-
ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t,
203+
ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t hContext,
183204
ur_usm_host_mem_flags_t, size_t Size,
184205
uint32_t Alignment) {
185206
try {
186-
UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size));
207+
// UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size));
208+
std::vector<ur_device_handle_t> Devices = hContext->getDevices();
209+
ur_device_handle_t Device0 = Devices[0];
210+
umf_memory_provider_handle_t umfCUDAprovider = Device0->getUmfCUDAprovider(
211+
umf_usm_memory_type_t::UMF_MEMORY_TYPE_HOST);
212+
umf_result_t umf_result =
213+
umfMemoryProviderAlloc(umfCUDAprovider, Size, Alignment, ResultPtr);
214+
UR_CHECK_ERROR(umf::umf2urResult(umf_result));
187215
} catch (ur_result_t Err) {
188216
return Err;
189217
}

source/common/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ else()
6464
set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "Build UMF examples")
6565
set(UMF_BUILD_SHARED_LIBRARY ${UMF_BUILD_SHARED_LIBRARY} CACHE INTERNAL "Build UMF shared library")
6666
set(UMF_BUILD_LIBUMF_POOL_DISJOINT ON CACHE INTERNAL "Build Disjoint Pool")
67-
set(UMF_BUILD_CUDA_PROVIDER OFF CACHE INTERNAL "Build UMF CUDA provider")
67+
set(UMF_BUILD_CUDA_PROVIDER ON CACHE INTERNAL "Build UMF CUDA provider")
6868

6969
FetchContent_MakeAvailable(unified-memory-framework)
7070
FetchContent_GetProperties(unified-memory-framework)

0 commit comments

Comments
 (0)