Skip to content

Commit f78d369

Browse files
committed
[EXP][CMDBUF] HIP adapter support for command buffers
Command buffer functions for HIP. Some additional methods and refactoring to reuse existing adapter code. This code was mainly written by Andrey Alekseenko @al42and Co-authored-by: Andrey Alekseenko andrey.alekseenko@scilifelab.se Co-authored-by: Ewan Crawford <ewan@codeplay.com>
1 parent 91c6068 commit f78d369

File tree

8 files changed

+1603
-241
lines changed

8 files changed

+1603
-241
lines changed

source/adapters/hip/command_buffer.cpp

Lines changed: 1020 additions & 112 deletions
Large diffs are not rendered by default.

source/adapters/hip/command_buffer.hpp

Lines changed: 307 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,312 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include <ur/ur.hpp>
12+
#include <ur_api.h>
1213

13-
/// Stub implementation of command-buffers for HIP
14+
#include "context.hpp"
15+
#include <hip/hip_runtime.h>
16+
#include <memory>
1417

15-
struct ur_exp_command_buffer_handle_t_ {};
18+
// Local helper: expands to a `case` label that returns the enumerator's own
// spelling, so the returned text cannot drift from the enumerator name.
// Undefined again right after getUrResultString().
#define UR_RESULT_NAME_CASE(Enumerator)                                        \
  case Enumerator:                                                             \
    return #Enumerator

// Returns the spelling of the enumerator that matches `Result`, for use in
// trace output. Any value not listed below yields "UR_RESULT_ERROR_UNKNOWN".
// The returned pointer refers to a string literal and must not be freed.
static inline const char *getUrResultString(ur_result_t Result) {
  switch (Result) {
    UR_RESULT_NAME_CASE(UR_RESULT_SUCCESS);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_OPERATION);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_QUEUE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_VALUE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_CONTEXT);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_PLATFORM);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_BINARY);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_PROGRAM);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_SAMPLER);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_BUFFER_SIZE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_MEM_OBJECT);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_EVENT);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_DEVICE_NOT_FOUND);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_DEVICE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_DEVICE_LOST);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_DEVICE_REQUIRES_RESET);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_DEVICE_PARTITION_FAILED);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_KERNEL_ARGS);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_KERNEL);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_KERNEL_NAME);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_IMAGE_SIZE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_UNINITIALIZED);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_OUT_OF_HOST_MEMORY);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_OUT_OF_RESOURCES);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_PROGRAM_LINK_FAILURE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_UNSUPPORTED_VERSION);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_ARGUMENT);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_NULL_HANDLE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_NULL_POINTER);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_SIZE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_UNSUPPORTED_SIZE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_ENUMERATION);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_NATIVE_BINARY);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_GLOBAL_NAME);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_FUNCTION_NAME);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_PROGRAM_UNLINKED);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_OVERLAPPING_REGIONS);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_HOST_PTR);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_INVALID_USM_SIZE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE);
    UR_RESULT_NAME_CASE(UR_RESULT_ERROR_ADAPTER_SPECIFIC);
  default:
    return "UR_RESULT_ERROR_UNKNOWN";
  }
}

#undef UR_RESULT_NAME_CASE
160+
161+
// Trace an internal UR call whose result the caller does not need.
//
// Forwards `Call` to UR_CALL and discards the result. The temporary that
// receives the result has a macro-private name: if it were called `Result`, a
// `Call` expression mentioning the caller's own `Result` variable would bind
// to this uninitialized temporary instead.
//
// The expansion is a brace-enclosed block, so it is written as a statement
// (`UR_TRACE(foo());`) and must not be the unbraced body of an `if` that is
// followed by `else`.
#define UR_TRACE(Call)                                                         \
  {                                                                            \
    ur_result_t UrTraceResult_;                                                \
    UR_CALL(Call, UrTraceResult_);                                             \
    (void)UrTraceResult_;                                                      \
  }
167+
168+
// Trace an internal UR call and hand its result back to the caller.
//
// `Call` is evaluated once and its value is assigned to `Result`, which must
// be an assignable expression supplied by the caller. When `PrintTrace` is
// true, the stringified call is written to stderr before the call, and the
// stringified call together with the result name is written after it.
// `PrintTrace` is read separately for each of the two messages.
#define UR_CALL(Call, Result)                                                  \
  {                                                                            \
    if (PrintTrace) {                                                          \
      fprintf(stderr, "UR ---> %s\n", #Call);                                  \
    }                                                                          \
    Result = (Call);                                                           \
    if (PrintTrace) {                                                          \
      fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result));   \
    }                                                                          \
  }
177+
178+
// Handle to a kernel command.
179+
//
180+
// Struct that stores all the information related to a kernel command in a
181+
// command-buffer, such that the command can be recreated. When handles can
182+
// be returned from other command types this struct will need refactored.
183+
struct ur_exp_command_buffer_command_handle_t_ {
184+
ur_exp_command_buffer_command_handle_t_(
185+
ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel,
186+
std::shared_ptr<hipGraphNode_t> &&Node, hipKernelNodeParams Params,
187+
uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr,
188+
const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr);
189+
190+
void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) {
191+
const size_t CopySize = sizeof(size_t) * WorkDim;
192+
std::memcpy(GlobalWorkOffset, GlobalWorkOffsetPtr, CopySize);
193+
if (WorkDim < 3) {
194+
const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim);
195+
std::memset(GlobalWorkOffset + WorkDim, 0, ZeroSize);
196+
}
197+
}
198+
199+
void setGlobalSize(const size_t *GlobalWorkSizePtr) {
200+
const size_t CopySize = sizeof(size_t) * WorkDim;
201+
std::memcpy(GlobalWorkSize, GlobalWorkSizePtr, CopySize);
202+
if (WorkDim < 3) {
203+
const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim);
204+
std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize);
205+
}
206+
}
207+
208+
void setLocalSize(const size_t *LocalWorkSizePtr) {
209+
const size_t CopySize = sizeof(size_t) * WorkDim;
210+
std::memcpy(LocalWorkSize, LocalWorkSizePtr, CopySize);
211+
if (WorkDim < 3) {
212+
const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim);
213+
std::memset(LocalWorkSize + WorkDim, 0, ZeroSize);
214+
}
215+
}
216+
217+
uint32_t incrementInternalReferenceCount() noexcept {
218+
return ++RefCountInternal;
219+
}
220+
uint32_t decrementInternalReferenceCount() noexcept {
221+
return --RefCountInternal;
222+
}
223+
224+
uint32_t incrementExternalReferenceCount() noexcept {
225+
return ++RefCountExternal;
226+
}
227+
uint32_t decrementExternalReferenceCount() noexcept {
228+
return --RefCountExternal;
229+
}
230+
uint32_t getExternalReferenceCount() const noexcept {
231+
return RefCountExternal;
232+
}
233+
234+
ur_exp_command_buffer_handle_t CommandBuffer;
235+
ur_kernel_handle_t Kernel;
236+
std::shared_ptr<hipGraphNode_t> Node;
237+
hipKernelNodeParams Params;
238+
239+
uint32_t WorkDim;
240+
size_t GlobalWorkOffset[3];
241+
size_t GlobalWorkSize[3];
242+
size_t LocalWorkSize[3];
243+
244+
private:
245+
std::atomic_uint32_t RefCountInternal;
246+
std::atomic_uint32_t RefCountExternal;
247+
};
248+
249+
struct ur_exp_command_buffer_handle_t_ {
250+
251+
ur_exp_command_buffer_handle_t_(ur_context_handle_t hContext,
252+
ur_device_handle_t hDevice, bool IsUpdatable);
253+
254+
~ur_exp_command_buffer_handle_t_();
255+
256+
void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint,
257+
std::shared_ptr<hipGraphNode_t> HIPNode) {
258+
SyncPoints[SyncPoint] = HIPNode;
259+
NextSyncPoint++;
260+
}
261+
262+
ur_exp_command_buffer_sync_point_t getNextSyncPoint() const {
263+
return NextSyncPoint;
264+
}
265+
266+
// Helper to register next sync point
267+
// @param HIPNode Node to register as next sync point
268+
// @return Pointer to the sync that registers the Node
269+
ur_exp_command_buffer_sync_point_t
270+
addSyncPoint(std::shared_ptr<hipGraphNode_t> HIPNode) {
271+
ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint;
272+
registerSyncPoint(SyncPoint, HIPNode);
273+
return SyncPoint;
274+
}
275+
uint32_t incrementInternalReferenceCount() noexcept {
276+
return ++RefCountInternal;
277+
}
278+
uint32_t decrementInternalReferenceCount() noexcept {
279+
return --RefCountInternal;
280+
}
281+
uint32_t getInternalReferenceCount() const noexcept {
282+
return RefCountInternal;
283+
}
284+
285+
uint32_t incrementExternalReferenceCount() noexcept {
286+
return ++RefCountExternal;
287+
}
288+
uint32_t decrementExternalReferenceCount() noexcept {
289+
return --RefCountExternal;
290+
}
291+
uint32_t getExternalReferenceCount() const noexcept {
292+
return RefCountExternal;
293+
}
294+
295+
// UR context associated with this command-buffer
296+
ur_context_handle_t Context;
297+
// Device associated with this command buffer
298+
ur_device_handle_t Device;
299+
// Whether commands in the command-buffer can be updated
300+
bool IsUpdatable;
301+
// HIP Graph handle
302+
hipGraph_t HIPGraph;
303+
// HIP Graph Exec handle
304+
hipGraphExec_t HIPGraphExec;
305+
// Atomic variable counting the number of reference to this command_buffer
306+
// using std::atomic prevents data race when incrementing/decrementing.
307+
std::atomic_uint32_t RefCountInternal;
308+
std::atomic_uint32_t RefCountExternal;
309+
310+
// Map of sync_points to ur_events
311+
std::unordered_map<ur_exp_command_buffer_sync_point_t,
312+
std::shared_ptr<hipGraphNode_t>>
313+
SyncPoints;
314+
// Next sync_point value (may need to consider ways to reuse values if 32-bits
315+
// is not enough)
316+
ur_exp_command_buffer_sync_point_t NextSyncPoint;
317+
318+
// Handles to individual commands in the command-buffer
319+
std::vector<ur_exp_command_buffer_command_handle_t> CommandHandles;
320+
};

source/adapters/hip/device.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
539539
// native asserts are in progress
540540
std::string SupportedExtensions = "";
541541
SupportedExtensions += "pi_ext_intel_devicelib_assert ";
542+
// Report support for the UR command-buffer experimental feature
543+
SupportedExtensions += "ur_exp_command_buffer ";
542544
SupportedExtensions += " ";
543545

544546
hipDeviceProp_t Props;
@@ -843,7 +845,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
843845

844846
case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP:
845847
case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP:
846-
return ReturnValue(false);
848+
return ReturnValue(true);
847849

848850
default:
849851
break;

0 commit comments

Comments
 (0)