Skip to content

Commit 5695973

Browse files
author
Hugh Delaney
committed
Change to calculating device index per ctx
The modulo operator is not sufficient to calculate a device's index within a context. Instead, we must find the index of the device in the context's Devices vector. We could also use a map for this, but since the context-local index is used quite frequently, I was hesitant to convert all the vectors to std::map because of the possible performance implications of switching to a different data structure.
1 parent 7d51b62 commit 5695973

File tree

6 files changed

+105
-91
lines changed

6 files changed

+105
-91
lines changed

source/adapters/cuda/context.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,13 @@ struct ur_context_handle_t_ {
116116
return Devices;
117117
}
118118

119+
// Gets the index of the device relative to other devices in the context
120+
int getDeviceIndex(ur_device_handle_t hDevice) {
121+
auto It = std::find(Devices.begin(), Devices.end(), hDevice);
122+
assert(It != Devices.end());
123+
return std::distance(Devices.begin(), It);
124+
}
125+
119126
uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
120127

121128
uint32_t decrementReferenceCount() noexcept { return --RefCount; }

source/adapters/cuda/memory.cpp

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -429,11 +429,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
429429
ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
430430
const ur_device_handle_t hDevice) {
431431
ScopedContext Active(hDevice);
432+
auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice);
432433
ur_lock LockGuard(Mem->MemoryAllocationMutex);
433434

434435
if (Mem->isBuffer()) {
435436
auto &Buffer = std::get<BufferMem>(Mem->Mem);
436-
auto &DevPtr = Buffer.Ptrs[hDevice->getIndex() % Buffer.Ptrs.size()];
437+
auto &DevPtr = Buffer.Ptrs[DeviceIdx];
437438

438439
// Allocation has already been made
439440
if (DevPtr != BufferMem::native_type{0}) {
@@ -456,11 +457,11 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
456457
try {
457458
auto &Image = std::get<SurfaceMem>(Mem->Mem);
458459
// Allocation has already been made
459-
if (Image.Arrays[hDevice->getIndex() % Image.Arrays.size()]) {
460+
if (Image.Arrays[DeviceIdx]) {
460461
return UR_RESULT_SUCCESS;
461462
}
462463
UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &Image.ArrayDesc));
463-
Image.Arrays[hDevice->getIndex() % Image.Arrays.size()] = ImageArray;
464+
Image.Arrays[DeviceIdx] = ImageArray;
464465

465466
// CUDA_RESOURCE_DESC is a union of different structs, shown here
466467
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
@@ -475,7 +476,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
475476
ImageResDesc.flags = 0;
476477

477478
UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc));
478-
Image.SurfObjs[hDevice->getIndex() % Image.SurfObjs.size()] = Surface;
479+
Image.SurfObjs[DeviceIdx] = Surface;
479480
} catch (ur_result_t Err) {
480481
if (ImageArray) {
481482
UR_CHECK_ERROR(cuArrayDestroy(ImageArray));
@@ -590,9 +591,8 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
590591
UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
591592
// Device allocation has already been initialized with most up to date
592593
// data in buffer
593-
if (Mem->HaveMigratedToDeviceSinceLastWrite
594-
[hDevice->getIndex() %
595-
Mem->HaveMigratedToDeviceSinceLastWrite.size()]) {
594+
if (Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex(
595+
hDevice)]) {
596596
return UR_RESULT_SUCCESS;
597597
}
598598

@@ -603,8 +603,35 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
603603
UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
604604
}
605605

606-
Mem->HaveMigratedToDeviceSinceLastWrite
607-
[hDevice->getIndex() % Mem->HaveMigratedToDeviceSinceLastWrite.size()] =
608-
true;
606+
Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex(
607+
hDevice)] = true;
609608
return UR_RESULT_SUCCESS;
610609
}
610+
611+
BufferMem::native_type
BufferMem::getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) {
  // Make sure a device allocation exists before computing an address into it.
  if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
      Err != UR_RESULT_SUCCESS) {
    throw Err;
  }
  // Index the per-device pointer table by the device's position in the
  // context, then apply the byte offset.
  const auto Idx = OuterMemStruct->getContext()->getDeviceIndex(Device);
  auto *Base = reinterpret_cast<uint8_t *>(Ptrs[Idx]);
  return reinterpret_cast<native_type>(Base + Offset);
}
622+
623+
CUarray SurfaceMem::getArray(const ur_device_handle_t Device) {
  // Lazily create the device-side image array if it does not exist yet.
  if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
      Err != UR_RESULT_SUCCESS) {
    throw Err;
  }
  const auto Idx = OuterMemStruct->getContext()->getDeviceIndex(Device);
  return Arrays[Idx];
}
630+
631+
CUsurfObject SurfaceMem::getSurface(const ur_device_handle_t Device) {
  // Lazily create the device-side surface object if it does not exist yet.
  if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
      Err != UR_RESULT_SUCCESS) {
    throw Err;
  }
  const auto Idx = OuterMemStruct->getContext()->getDeviceIndex(Device);
  return SurfObjs[Idx];
}

source/adapters/cuda/memory.hpp

Lines changed: 9 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,6 @@
2020
#include "device.hpp"
2121
#include "event.hpp"
2222

23-
ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
24-
const ur_device_handle_t);
25-
ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
26-
const ur_device_handle_t);
27-
2823
// Handler for plain, pointer-based CUDA allocations
2924
struct BufferMem {
3025

@@ -97,16 +92,7 @@ struct BufferMem {
9792

9893
BufferMem(const BufferMem &Buffer) = default;
9994

100-
native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) {
101-
if (ur_result_t Err =
102-
allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
103-
Err != UR_RESULT_SUCCESS) {
104-
throw Err;
105-
}
106-
return reinterpret_cast<native_type>(
107-
reinterpret_cast<uint8_t *>(Ptrs[Device->getIndex() % Ptrs.size()]) +
108-
Offset);
109-
}
95+
native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset);
11096

11197
native_type getPtr(const ur_device_handle_t Device) {
11298
return getPtrWithOffset(Device, 0);
@@ -269,23 +255,10 @@ struct SurfaceMem {
269255
}
270256

271257
// Will allocate a new array on device if not already allocated
272-
CUarray getArray(const ur_device_handle_t Device) {
273-
if (ur_result_t Err =
274-
allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
275-
Err != UR_RESULT_SUCCESS) {
276-
throw Err;
277-
}
278-
return Arrays[Device->getIndex() % Arrays.size()];
279-
}
258+
CUarray getArray(const ur_device_handle_t Device);
259+
280260
// Will allocate a new surface on device if not already allocated
281-
CUsurfObject getSurface(const ur_device_handle_t Device) {
282-
if (ur_result_t Err =
283-
allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
284-
Err != UR_RESULT_SUCCESS) {
285-
throw Err;
286-
}
287-
return SurfObjs[Device->getIndex() % SurfObjs.size()];
288-
}
261+
CUsurfObject getSurface(const ur_device_handle_t Device);
289262

290263
ur_mem_type_t getType() { return ImageDesc.type; }
291264

@@ -515,9 +488,11 @@ struct ur_mem_handle_t_ {
515488
for (const auto &Device : Context->getDevices()) {
516489
// This event is never an interop event so will always have an associated
517490
// queue
518-
HaveMigratedToDeviceSinceLastWrite
519-
[Device->getIndex() % HaveMigratedToDeviceSinceLastWrite.size()] =
520-
Device == NewEvent->getQueue()->getDevice();
491+
HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] =
492+
Device == NewEvent->getQueue()->getDevice();
521493
}
522494
}
523495
};
496+
497+
ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
498+
const ur_device_handle_t);

source/adapters/hip/context.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,13 @@ struct ur_context_handle_t_ {
112112
return Devices;
113113
}
114114

115+
// Gets the index of the device relative to other devices in the context
116+
int getDeviceIndex(ur_device_handle_t hDevice) {
117+
auto It = std::find(Devices.begin(), Devices.end(), hDevice);
118+
assert(It != Devices.end());
119+
return std::distance(Devices.begin(), It);
120+
}
121+
115122
uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
116123

117124
uint32_t decrementReferenceCount() noexcept { return --RefCount; }

source/adapters/hip/memory.cpp

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -455,12 +455,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) {
455455
ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
456456
const ur_device_handle_t hDevice) {
457457
ScopedContext Active(hDevice);
458+
auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice);
458459
ur_lock LockGuard(Mem->MemoryAllocationMutex);
459460

460461
if (Mem->isBuffer()) {
461462
auto &Buffer = std::get<BufferMem>(Mem->Mem);
462-
hipDeviceptr_t &DevPtr =
463-
Buffer.Ptrs[hDevice->getIndex() % Buffer.Ptrs.size()];
463+
hipDeviceptr_t &DevPtr = Buffer.Ptrs[DeviceIdx];
464464

465465
// Allocation has already been made
466466
if (DevPtr != BufferMem::native_type{0}) {
@@ -483,12 +483,12 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
483483
try {
484484
auto &Image = std::get<SurfaceMem>(Mem->Mem);
485485
// Allocation has already been made
486-
if (Image.Arrays[hDevice->getIndex() % Image.Arrays.size()]) {
486+
if (Image.Arrays[DeviceIdx]) {
487487
return UR_RESULT_SUCCESS;
488488
}
489489
UR_CHECK_ERROR(hipArray3DCreate(
490490
reinterpret_cast<hipCUarray *>(&ImageArray), &Image.ArrayDesc));
491-
Image.Arrays[hDevice->getIndex() % Image.Arrays.size()] = ImageArray;
491+
Image.Arrays[DeviceIdx] = ImageArray;
492492
// HIP_RESOURCE_DESC is a union of different structs, shown here
493493
// We need to fill it as described here to use it for a surface or texture
494494
// HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and
@@ -500,7 +500,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
500500
ImageResDesc.resType = hipResourceTypeArray;
501501

502502
UR_CHECK_ERROR(hipCreateSurfaceObject(&Surface, &ImageResDesc));
503-
Image.SurfObjs[hDevice->getIndex() % Image.SurfObjs.size()] = Surface;
503+
Image.SurfObjs[DeviceIdx] = Surface;
504504
} catch (ur_result_t Err) {
505505
if (ImageArray) {
506506
UR_CHECK_ERROR(hipFreeArray(ImageArray));
@@ -609,11 +609,10 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
609609
ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
610610
const ur_device_handle_t hDevice) {
611611
UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
612+
auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice);
612613
// Device allocation has already been initialized with most up to date
613614
// data in buffer
614-
if (Mem->HaveMigratedToDeviceSinceLastWrite
615-
[hDevice->getIndex() %
616-
Mem->HaveMigratedToDeviceSinceLastWrite.size()])
615+
if (Mem->HaveMigratedToDeviceSinceLastWrite[DeviceIdx])
617616
return UR_RESULT_SUCCESS;
618617

619618
ScopedContext Active(hDevice);
@@ -623,8 +622,34 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
623622
UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
624623
}
625624

626-
Mem->HaveMigratedToDeviceSinceLastWrite
627-
[hDevice->getIndex() % Mem->HaveMigratedToDeviceSinceLastWrite.size()] =
628-
true;
625+
Mem->HaveMigratedToDeviceSinceLastWrite[DeviceIdx] = true;
629626
return UR_RESULT_SUCCESS;
630627
}
628+
629+
BufferMem::native_type
BufferMem::getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) {
  // Make sure a device allocation exists before computing an address into it.
  if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
      Err != UR_RESULT_SUCCESS) {
    throw Err;
  }
  // Index the per-device pointer table by the device's position in the
  // context, then apply the byte offset.
  const auto Idx = OuterMemStruct->getContext()->getDeviceIndex(Device);
  auto *Base = reinterpret_cast<uint8_t *>(Ptrs[Idx]);
  return reinterpret_cast<native_type>(Base + Offset);
}
640+
641+
hipArray *SurfaceMem::getArray(const ur_device_handle_t Device) {
  // Lazily create the device-side image array if it does not exist yet.
  if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
      Err != UR_RESULT_SUCCESS) {
    throw Err;
  }
  const auto Idx = OuterMemStruct->getContext()->getDeviceIndex(Device);
  return Arrays[Idx];
}
648+
649+
hipSurfaceObject_t SurfaceMem::getSurface(const ur_device_handle_t Device) {
  // Lazily create the device-side surface object if it does not exist yet.
  if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
      Err != UR_RESULT_SUCCESS) {
    throw Err;
  }
  const auto Idx = OuterMemStruct->getContext()->getDeviceIndex(Device);
  return SurfObjs[Idx];
}

source/adapters/hip/memory.hpp

Lines changed: 9 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -9,20 +9,14 @@
99
//===----------------------------------------------------------------------===//
1010
#pragma once
1111

12+
#include "common.hpp"
1213
#include "context.hpp"
1314
#include "event.hpp"
1415
#include <cassert>
1516
#include <memory>
1617
#include <unordered_map>
1718
#include <variant>
1819

19-
#include "common.hpp"
20-
21-
ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
22-
const ur_device_handle_t);
23-
ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
24-
const ur_device_handle_t);
25-
2620
// Handler for plain, pointer-based HIP allocations
2721
struct BufferMem {
2822
struct BufferMap {
@@ -95,16 +89,7 @@ struct BufferMem {
9589

9690
// This will allocate memory on device with index Index if there isn't already
9791
// an active allocation on the device
98-
native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) {
99-
if (ur_result_t Err =
100-
allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
101-
Err != UR_RESULT_SUCCESS) {
102-
throw Err;
103-
}
104-
return reinterpret_cast<native_type>(
105-
reinterpret_cast<uint8_t *>(Ptrs[Device->getIndex() % Ptrs.size()]) +
106-
Offset);
107-
}
92+
native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset);
10893

10994
// This will allocate memory on device if there isn't already an active
11095
// allocation on the device
@@ -261,24 +246,10 @@ struct SurfaceMem {
261246
}
262247

263248
// Will allocate a new array on device if not already allocated
264-
hipArray *getArray(const ur_device_handle_t Device) {
265-
if (ur_result_t Err =
266-
allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
267-
Err != UR_RESULT_SUCCESS) {
268-
throw Err;
269-
}
270-
return Arrays[Device->getIndex() % Arrays.size()];
271-
}
249+
hipArray *getArray(const ur_device_handle_t Device);
272250

273251
// Will allocate a new surface on device if not already allocated
274-
hipSurfaceObject_t getSurface(const ur_device_handle_t Device) {
275-
if (ur_result_t Err =
276-
allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
277-
Err != UR_RESULT_SUCCESS) {
278-
throw Err;
279-
}
280-
return SurfObjs[Device->getIndex() % SurfObjs.size()];
281-
}
252+
hipSurfaceObject_t getSurface(const ur_device_handle_t Device);
282253

283254
ur_mem_type_t getImageType() const noexcept { return ImageDesc.type; }
284255

@@ -511,9 +482,11 @@ struct ur_mem_handle_t_ {
511482
urEventRetain(NewEvent);
512483
LastEventWritingToMemObj = NewEvent;
513484
for (const auto &Device : Context->getDevices()) {
514-
HaveMigratedToDeviceSinceLastWrite
515-
[Device->getIndex() % HaveMigratedToDeviceSinceLastWrite.size()] =
516-
Device == NewEvent->getQueue()->getDevice();
485+
HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] =
486+
Device == NewEvent->getQueue()->getDevice();
517487
}
518488
}
519489
};
490+
491+
ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
492+
const ur_device_handle_t);

0 commit comments

Comments
 (0)