Skip to content

Commit 74ad837

Browse files
committed
Merge branch 'aaron/cl2DUSMOps' into aaron/clCTSFixMegaBranch
2 parents 8125902 + 603dcfb commit 74ad837

File tree

2 files changed

+184
-24
lines changed

2 files changed

+184
-24
lines changed

source/adapters/opencl/enqueue.cpp

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,12 +178,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
178178
size_t patternSize, size_t offset, size_t size,
179179
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
180180
ur_event_handle_t *phEvent) {
181+
// CL FillBuffer only allows pattern sizes up to the largest CL type:
182+
// long16/double16
183+
if (patternSize <= 128) {
184+
CL_RETURN_ON_FAILURE(
185+
clEnqueueFillBuffer(cl_adapter::cast<cl_command_queue>(hQueue),
186+
cl_adapter::cast<cl_mem>(hBuffer), pPattern,
187+
patternSize, offset, size, numEventsInWaitList,
188+
cl_adapter::cast<const cl_event *>(phEventWaitList),
189+
cl_adapter::cast<cl_event *>(phEvent)));
190+
return UR_RESULT_SUCCESS;
191+
}
192+
193+
auto NumValues = size / sizeof(uint64_t);
194+
auto HostBuffer = new uint64_t[NumValues];
195+
auto NumChunks = patternSize / sizeof(uint64_t);
196+
for (size_t i = 0; i < NumValues; i++) {
197+
HostBuffer[i] = static_cast<const uint64_t *>(pPattern)[i % NumChunks];
198+
}
181199

182-
CL_RETURN_ON_FAILURE(clEnqueueFillBuffer(
200+
cl_event WriteEvent = nullptr;
201+
auto ClErr = clEnqueueWriteBuffer(
183202
cl_adapter::cast<cl_command_queue>(hQueue),
184-
cl_adapter::cast<cl_mem>(hBuffer), pPattern, patternSize, offset, size,
203+
cl_adapter::cast<cl_mem>(hBuffer), false, offset, size, HostBuffer,
185204
numEventsInWaitList, cl_adapter::cast<const cl_event *>(phEventWaitList),
186-
cl_adapter::cast<cl_event *>(phEvent)));
205+
&WriteEvent);
206+
if (ClErr != CL_SUCCESS) {
207+
delete[] HostBuffer;
208+
CL_RETURN_ON_FAILURE(ClErr);
209+
}
210+
211+
auto DeleteCallback = [](cl_event, cl_int, void *pUserData) {
212+
delete[] static_cast<uint64_t *>(pUserData);
213+
};
214+
ClErr =
215+
clSetEventCallback(WriteEvent, CL_COMPLETE, DeleteCallback, HostBuffer);
216+
if (ClErr != CL_SUCCESS) {
217+
// We can attempt to recover gracefully by attempting to wait for the write
218+
// to finish and deleting the host buffer.
219+
clWaitForEvents(1, &WriteEvent);
220+
delete[] HostBuffer;
221+
clReleaseEvent(WriteEvent);
222+
CL_RETURN_ON_FAILURE(ClErr);
223+
}
224+
225+
if (phEvent) {
226+
*phEvent = cl_adapter::cast<ur_event_handle_t>(WriteEvent);
227+
} else {
228+
CL_RETURN_ON_FAILURE(clReleaseEvent(WriteEvent));
229+
}
187230

188231
return UR_RESULT_SUCCESS;
189232
}

source/adapters/opencl/usm.cpp

Lines changed: 138 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
197197
ur_queue_handle_t hQueue, void *ptr, size_t patternSize,
198198
const void *pPattern, size_t size, uint32_t numEventsInWaitList,
199199
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
200-
201200
// Have to look up the context from the kernel
202201
cl_context CLContext;
203202
cl_int CLErr = clGetCommandQueueInfo(
@@ -207,20 +206,97 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
207206
return mapCLErrorToUR(CLErr);
208207
}
209208

210-
clEnqueueMemFillINTEL_fn FuncPtr = nullptr;
211-
ur_result_t RetVal = cl_ext::getExtFuncFromContext<clEnqueueMemFillINTEL_fn>(
212-
CLContext, cl_ext::ExtFuncPtrCache->clEnqueueMemFillINTELCache,
213-
cl_ext::EnqueueMemFillName, &FuncPtr);
209+
if (patternSize <= 128) {
210+
clEnqueueMemFillINTEL_fn EnqueueMemFill = nullptr;
211+
UR_RETURN_ON_FAILURE(
212+
cl_ext::getExtFuncFromContext<clEnqueueMemFillINTEL_fn>(
213+
CLContext, cl_ext::ExtFuncPtrCache->clEnqueueMemFillINTELCache,
214+
cl_ext::EnqueueMemFillName, &EnqueueMemFill));
215+
216+
CL_RETURN_ON_FAILURE(
217+
EnqueueMemFill(cl_adapter::cast<cl_command_queue>(hQueue), ptr,
218+
pPattern, patternSize, size, numEventsInWaitList,
219+
cl_adapter::cast<const cl_event *>(phEventWaitList),
220+
cl_adapter::cast<cl_event *>(phEvent)));
221+
return UR_RESULT_SUCCESS;
222+
}
214223

215-
if (FuncPtr) {
216-
RetVal = mapCLErrorToUR(
217-
FuncPtr(cl_adapter::cast<cl_command_queue>(hQueue), ptr, pPattern,
218-
patternSize, size, numEventsInWaitList,
219-
cl_adapter::cast<const cl_event *>(phEventWaitList),
220-
cl_adapter::cast<cl_event *>(phEvent)));
224+
// OpenCL only supports pattern sizes as large as the largest CL type
225+
// (double16/long16 - 128 bytes), anything larger we need to do on the host
226+
// side and copy it into the target allocation.
227+
clHostMemAllocINTEL_fn HostMemAlloc = nullptr;
228+
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clHostMemAllocINTEL_fn>(
229+
CLContext, cl_ext::ExtFuncPtrCache->clHostMemAllocINTELCache,
230+
cl_ext::HostMemAllocName, &HostMemAlloc));
231+
232+
clEnqueueMemcpyINTEL_fn USMMemcpy = nullptr;
233+
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clEnqueueMemcpyINTEL_fn>(
234+
CLContext, cl_ext::ExtFuncPtrCache->clEnqueueMemcpyINTELCache,
235+
cl_ext::EnqueueMemcpyName, &USMMemcpy));
236+
237+
clMemBlockingFreeINTEL_fn USMFree = nullptr;
238+
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clMemBlockingFreeINTEL_fn>(
239+
CLContext, cl_ext::ExtFuncPtrCache->clMemBlockingFreeINTELCache,
240+
cl_ext::MemBlockingFreeName, &USMFree));
241+
242+
cl_int ClErr = CL_SUCCESS;
243+
auto HostBuffer = static_cast<uint64_t *>(
244+
HostMemAlloc(CLContext, nullptr, size, 0, &ClErr));
245+
CL_RETURN_ON_FAILURE(ClErr);
246+
247+
auto NumValues = size / sizeof(uint64_t);
248+
auto NumChunks = patternSize / sizeof(uint64_t);
249+
for (size_t i = 0; i < NumValues; i++) {
250+
HostBuffer[i] = static_cast<const uint64_t *>(pPattern)[i % NumChunks];
221251
}
222252

223-
return RetVal;
253+
cl_event CopyEvent = nullptr;
254+
CL_RETURN_ON_FAILURE(USMMemcpy(
255+
cl_adapter::cast<cl_command_queue>(hQueue), false, ptr, HostBuffer, size,
256+
numEventsInWaitList, cl_adapter::cast<const cl_event *>(phEventWaitList),
257+
&CopyEvent));
258+
259+
struct DeleteCallbackInfo {
260+
DeleteCallbackInfo(clMemBlockingFreeINTEL_fn USMFree, cl_context CLContext,
261+
void *HostBuffer)
262+
: USMFree(USMFree), CLContext(CLContext), HostBuffer(HostBuffer) {
263+
clRetainContext(CLContext);
264+
}
265+
~DeleteCallbackInfo() {
266+
USMFree(CLContext, HostBuffer);
267+
clReleaseContext(CLContext);
268+
}
269+
DeleteCallbackInfo(const DeleteCallbackInfo &) = delete;
270+
DeleteCallbackInfo &operator=(const DeleteCallbackInfo &) = delete;
271+
272+
clMemBlockingFreeINTEL_fn USMFree;
273+
cl_context CLContext;
274+
void *HostBuffer;
275+
};
276+
277+
auto Info = new DeleteCallbackInfo(USMFree, CLContext, HostBuffer);
278+
279+
auto DeleteCallback = [](cl_event, cl_int, void *pUserData) {
280+
auto Info = static_cast<DeleteCallbackInfo *>(pUserData);
281+
delete Info;
282+
};
283+
284+
ClErr = clSetEventCallback(CopyEvent, CL_COMPLETE, DeleteCallback, Info);
285+
if (ClErr != CL_SUCCESS) {
286+
// We can attempt to recover gracefully by attempting to wait for the copy
287+
// to finish and deleting the info struct here.
288+
clWaitForEvents(1, &CopyEvent);
289+
delete Info;
290+
clReleaseEvent(CopyEvent);
291+
CL_RETURN_ON_FAILURE(ClErr);
292+
}
293+
if (phEvent) {
294+
*phEvent = cl_adapter::cast<ur_event_handle_t>(CopyEvent);
295+
} else {
296+
CL_RETURN_ON_FAILURE(clReleaseEvent(CopyEvent));
297+
}
298+
299+
return UR_RESULT_SUCCESS;
224300
}
225301

226302
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
@@ -343,18 +419,59 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D(
343419
[[maybe_unused]] uint32_t numEventsInWaitList,
344420
[[maybe_unused]] const ur_event_handle_t *phEventWaitList,
345421
[[maybe_unused]] ur_event_handle_t *phEvent) {
346-
return UR_RESULT_ERROR_INVALID_OPERATION;
422+
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
347423
}
348424

349425
// Copies a 2D region of `height` rows, each `width` bytes wide, from `pSrc` to
// `pDst`. Consecutive rows start `srcPitch` / `dstPitch` bytes apart.
//
// The copy is emulated with one non-blocking clEnqueueMemcpyINTEL command per
// row; every row command waits on the caller's wait list.
//
// hQueue               - queue the row copies are enqueued on.
// blocking             - when true, wait for all row copies before returning.
// pDst / dstPitch      - destination base pointer and row stride in bytes.
// pSrc / srcPitch      - source base pointer and row stride in bytes.
// width / height       - bytes per row and number of rows.
// numEventsInWaitList,
// phEventWaitList      - events each row copy waits on.
// phEvent              - optional; receives the event of a barrier that waits
//                        on all row copies.
//
// Returns UR_RESULT_SUCCESS on success, UR_RESULT_ERROR_INVALID_SIZE when
// `height` is zero, the result of the extension-function lookup when the
// memcpy entry point is unavailable, or the mapped OpenCL error otherwise.
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
    ur_queue_handle_t hQueue, bool blocking, void *pDst, size_t dstPitch,
    const void *pSrc, size_t srcPitch, size_t width, size_t height,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_event_handle_t *phEvent) {
  // With no rows nothing is enqueued, so there is no event to wait on or to
  // hand back through phEvent (clWaitForEvents rejects a zero event count).
  // Report the invalid size up front instead of a confusing CL error.
  if (height == 0) {
    return UR_RESULT_ERROR_INVALID_SIZE;
  }

  const auto CLQueue = cl_adapter::cast<cl_command_queue>(hQueue);

  cl_context CLContext;
  CL_RETURN_ON_FAILURE(clGetCommandQueueInfo(CLQueue, CL_QUEUE_CONTEXT,
                                             sizeof(cl_context), &CLContext,
                                             nullptr));

  clEnqueueMemcpyINTEL_fn FuncPtr = nullptr;
  ur_result_t RetVal = cl_ext::getExtFuncFromContext<clEnqueueMemcpyINTEL_fn>(
      CLContext, cl_ext::ExtFuncPtrCache->clEnqueueMemcpyINTELCache,
      cl_ext::EnqueueMemcpyName, &FuncPtr);

  if (!FuncPtr) {
    return RetVal;
  }

  // Only events of successfully enqueued row copies are stored, so every
  // entry is a valid handle that has to be released exactly once.
  std::vector<cl_event> Events;
  Events.reserve(height);

  // Releases every collected event even if one release fails, and reports the
  // first failure (CL_SUCCESS when all releases succeeded).
  auto ReleaseEvents = [&Events]() {
    cl_int FirstError = CL_SUCCESS;
    for (cl_event E : Events) {
      const cl_int ReleaseErr = clReleaseEvent(E);
      if (FirstError == CL_SUCCESS) {
        FirstError = ReleaseErr;
      }
    }
    Events.clear();
    return FirstError;
  };

  auto *DstBytes = static_cast<uint8_t *>(pDst);
  const auto *SrcBytes = static_cast<const uint8_t *>(pSrc);
  for (size_t HeightIndex = 0; HeightIndex < height; HeightIndex++) {
    cl_event Event = nullptr;
    const cl_int EnqueueErr =
        FuncPtr(CLQueue, false, DstBytes + dstPitch * HeightIndex,
                SrcBytes + srcPitch * HeightIndex, width, numEventsInWaitList,
                cl_adapter::cast<const cl_event *>(phEventWaitList), &Event);
    if (EnqueueErr != CL_SUCCESS) {
      // Rows enqueued so far still execute; we only drop our references.
      ReleaseEvents();
      return mapCLErrorToUR(EnqueueErr);
    }
    Events.push_back(Event);
  }

  const auto NumEvents = static_cast<cl_uint>(Events.size());
  cl_int ClResult = CL_SUCCESS;
  if (blocking) {
    ClResult = clWaitForEvents(NumEvents, Events.data());
  }
  if (phEvent && ClResult == CL_SUCCESS) {
    // A single barrier event stands in for the whole set of row copies.
    ClResult = clEnqueueBarrierWithWaitList(
        CLQueue, NumEvents, Events.data(),
        cl_adapter::cast<cl_event *>(phEvent));
  }

  // Always release every row event; a wait/barrier error takes precedence
  // over a release error when reporting.
  const cl_int ReleaseResult = ReleaseEvents();
  if (ClResult == CL_SUCCESS) {
    ClResult = ReleaseResult;
  }
  if (ClResult != CL_SUCCESS) {
    return mapCLErrorToUR(ClResult);
  }
  return UR_RESULT_SUCCESS;
}
359476

360477
UR_APIEXPORT ur_result_t UR_APICALL

0 commit comments

Comments
 (0)