Mirror intel/llvm commits #2816

Merged · 7 commits · Jul 24, 2025
2 changes: 1 addition & 1 deletion .github/intel-llvm-mirror-base-commit
@@ -1 +1 @@
3e95c0c70850b8b668116d9a491d25dd969c6329
63c70a1425d2c91fa54ec6495aae8ecfa7a5a10c
26 changes: 24 additions & 2 deletions source/adapters/level_zero/device.cpp
@@ -1139,9 +1139,31 @@ ur_result_t urDeviceGetInfo(
return ReturnValue(Device->Platform->ZeBindlessImagesExtensionSupported &&
Device->ZeDeviceImageProperties->maxImageDims2D > 0);
}
case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP: {
ze_device_image_properties_t imageProps = {};
imageProps.stype = ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES;
ze_device_pitched_alloc_exp_properties_t imageAllocProps = {};
imageAllocProps.stype =
ZE_STRUCTURE_TYPE_PITCHED_ALLOC_DEVICE_EXP_PROPERTIES;
imageProps.pNext = (void *)&imageAllocProps;

ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &imageProps));

return ReturnValue(imageAllocProps.maxImageLinearWidth);
}
case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP: {
ze_device_image_properties_t imageProps = {};
imageProps.stype = ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES;
ze_device_pitched_alloc_exp_properties_t imageAllocProps = {};
imageAllocProps.stype =
ZE_STRUCTURE_TYPE_PITCHED_ALLOC_DEVICE_EXP_PROPERTIES;
imageProps.pNext = (void *)&imageAllocProps;

ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &imageProps));

return ReturnValue(imageAllocProps.maxImageLinearHeight);
}
case UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP:
case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP:
case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP:
case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP:
UR_LOG(ERR, "Unsupported ParamName in urGetDeviceInfo");
UR_LOG(ERR, "ParamName=%{}(0x{})", ParamName, logger::toHex(ParamName));
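For context, a minimal caller-side sketch of how these new queries could be used through the UR API. The device handle and the size_t result type are assumptions for illustration, not taken from this patch:

// Hypothetical usage: query the maximum linear (pitched) image width.
// Assumes hDevice is a valid ur_device_handle_t and the query returns a size_t.
size_t maxLinearWidth = 0;
ur_result_t Res =
    urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP,
                    sizeof(maxLinearWidth), &maxLinearWidth, nullptr);
if (Res == UR_RESULT_SUCCESS) {
  // maxLinearWidth mirrors ze_device_pitched_alloc_exp_properties_t::maxImageLinearWidth.
}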
185 changes: 57 additions & 128 deletions source/adapters/native_cpu/enqueue.cpp
@@ -52,17 +52,6 @@ struct NDRDescT {
};
} // namespace native_cpu

#ifdef NATIVECPU_USE_OCK
static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr,
size_t itemsPerThread) {
native_cpu::state resized_state(
ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], itemsPerThread,
ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
return resized_state;
}
#endif

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -112,6 +101,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
// TODO: add proper error checking
native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
pLocalWorkSize);
unsigned long long numWI;
auto umulll_overflow = [](unsigned long long a, unsigned long long b,
unsigned long long *c) -> bool {
#ifdef __GNUC__
return __builtin_umulll_overflow(a, b, c);
#else
*c = a * b;
return a != 0 && b != *c / a;
#endif
};
if (umulll_overflow(ndr.GlobalSize[0], ndr.GlobalSize[1], &numWI) ||
umulll_overflow(numWI, ndr.GlobalSize[2], &numWI) || numWI > SIZE_MAX) {
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
}

auto &tp = hQueue->getDevice()->tp;
const size_t numParallelThreads = tp.num_threads();
std::vector<std::future<void>> futures;
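As an aside, the portable branch of the overflow check introduced above can be illustrated in isolation. This is a standalone sketch with made-up names, not adapter code:

// Standalone illustration of the non-GNU fallback: the product wraps modulo
// 2^64 on overflow, so dividing it back by `a` only recovers `b` when no
// wrap occurred.
static bool mulOverflows(unsigned long long a, unsigned long long b,
                         unsigned long long *out) {
  *out = a * b;
  return a != 0 && b != *out / a;
}
// Example: mulOverflows(1ull << 33, 1ull << 33, &r) sets r to 0 and returns true.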
@@ -130,131 +134,56 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel);
kernel->updateMemPool(numParallelThreads);

const size_t numWG = numWG0 * numWG1 * numWG2;
const size_t numWGPerThread = numWG / numParallelThreads;
const size_t remainderWG = numWG - numWGPerThread * numParallelThreads;
// The fourth value is the linearized value.
std::array<size_t, 4> rangeStart = {0, 0, 0, 0};
for (unsigned t = 0; t < numParallelThreads; ++t) {
auto rangeEnd = rangeStart;
rangeEnd[3] += numWGPerThread + (t < remainderWG);
if (rangeEnd[3] == rangeStart[3])
break;
rangeEnd[0] = rangeEnd[3] % numWG0;
rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
futures.emplace_back(
tp.schedule_task([state, &kernel = *kernel, rangeStart,
rangeEnd = rangeEnd[3], numWG0, numWG1,
#ifndef NATIVECPU_USE_OCK
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
state.update(g0, g1, g2, local0, local1, local2);
kernel->_subhandler(kernel->getArgs(1, 0).data(), &state);
}
}
}
}
}
}
localSize = ndr.LocalSize,
#endif
numParallelThreads](size_t threadId) mutable {
for (size_t g0 = rangeStart[0], g1 = rangeStart[1],
g2 = rangeStart[2], g3 = rangeStart[3];
g3 < rangeEnd; ++g3) {
#ifdef NATIVECPU_USE_OCK
state.update(g0, g1, g2);
kernel._subhandler(
kernel.getArgs(numParallelThreads, threadId).data(), &state);
#else
bool isLocalSizeOne =
ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads &&
!kernel->hasLocalArgs()) {
// If the local size is one, we make the assumption that we are running a
// parallel_for over a sycl::range.
// Todo: we could add more compiler checks and
// kernel properties for this (e.g. check that no barriers are called).

// Todo: this assumes that dim 0 is the best dimension over which we want to
// parallelize

// Since we also vectorize the kernel, and vectorization happens within the
// work group loop, it's better to have a large-ish local size. We can
// divide the global range by the number of threads, set that as the local
// size and peel everything else.

size_t new_num_work_groups_0 = numParallelThreads;
size_t itemsPerThread = ndr.GlobalSize[0] / numParallelThreads;

for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) {
futures.emplace_back(tp.schedule_task(
[ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) {
native_cpu::state resized_state =
getResizedState(ndr, itemsPerThread);
resized_state.update(g0, g1, g2);
kernel._subhandler(kernel.getArgs().data(), &resized_state);
}));
}
// Peel the remaining work items. Since the local size is 1, we iterate
// over the work groups.
for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0;
g0++) {
state.update(g0, g1, g2);
kernel->_subhandler(kernel->getArgs().data(), &state);
}
}
}

} else {
// We are running a parallel_for over an nd_range

if (numWG1 * numWG2 >= numParallelThreads) {
// Dimensions 1 and 2 have enough work, split them across the threadpool
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
futures.emplace_back(
tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2,
numParallelThreads](size_t threadId) mutable {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
state.update(g0, g1, g2);
for (size_t local2 = 0; local2 < localSize[2]; ++local2) {
for (size_t local1 = 0; local1 < localSize[1]; ++local1) {
for (size_t local0 = 0; local0 < localSize[0]; ++local0) {
state.update(g0, g1, g2, local0, local1, local2);
kernel._subhandler(
kernel.getArgs(numParallelThreads, threadId).data(),
&state);
}
}));
}
}
} else {
// Split dimension 0 across the threadpool
// Here we try to create groups of workgroups in order to reduce
// synchronization overhead
for (unsigned g2 = 0; g2 < numWG2; g2++) {
for (unsigned g1 = 0; g1 < numWG1; g1++) {
for (unsigned g0 = 0; g0 < numWG0; g0++) {
groups.push_back([state, g0, g1, g2, numParallelThreads](
size_t threadId,
ur_kernel_handle_t_ &kernel) mutable {
state.update(g0, g1, g2);
kernel._subhandler(
kernel.getArgs(numParallelThreads, threadId).data(), &state);
});
}
}
}
auto numGroups = groups.size();
auto groupsPerThread = numGroups / numParallelThreads;
if (groupsPerThread) {
for (unsigned thread = 0; thread < numParallelThreads; thread++) {
futures.emplace_back(
tp.schedule_task([groups, thread, groupsPerThread,
&kernel = *kernel](size_t threadId) {
for (unsigned i = 0; i < groupsPerThread; i++) {
auto index = thread * groupsPerThread + i;
groups[index](threadId, kernel);
}
}));
}
}

// schedule the remaining tasks
auto remainder = numGroups % numParallelThreads;
if (remainder) {
futures.emplace_back(
tp.schedule_task([groups, remainder,
scheduled = numParallelThreads * groupsPerThread,
&kernel = *kernel](size_t threadId) {
for (unsigned i = 0; i < remainder; i++) {
auto index = scheduled + i;
groups[index](threadId, kernel);
}
}));
}
}
}
#endif
if (++g0 == numWG0) {
g0 = 0;
if (++g1 == numWG1) {
g1 = 0;
++g2;
}
}
}
}));
rangeStart = rangeEnd;
}

#endif // NATIVECPU_USE_OCK
event->set_futures(futures);

if (phEvent) {
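To summarize the new scheduling path in one place: work-groups are linearized, split into contiguous ranges with the remainder spread over the first threads, and the range bounds are decoded back into 3D indices. A simplified sketch, illustrative only; the names follow the patch but this is not the adapter code:

#include <array>
#include <cstddef>

// Compute the contiguous linear work-group range [start, end) assigned to
// thread t, spreading the remainder over the first threads, as the launch
// loop above does.
std::array<size_t, 2> workGroupRangeForThread(size_t numWG0, size_t numWG1,
                                              size_t numWG2, size_t numThreads,
                                              size_t t) {
  const size_t numWG = numWG0 * numWG1 * numWG2;
  const size_t perThread = numWG / numThreads;
  const size_t remainder = numWG - perThread * numThreads;
  const size_t start = t * perThread + (t < remainder ? t : remainder);
  const size_t end = start + perThread + (t < remainder ? 1 : 0);
  // A linear index decodes into 3D coordinates with dimension 0 fastest:
  //   g0 = i % numWG0; g1 = (i / numWG0) % numWG1; g2 = i / (numWG0 * numWG1);
  return {start, end};
}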
48 changes: 25 additions & 23 deletions source/adapters/offload/enqueue.cpp
@@ -74,7 +74,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
hKernel->Args.getStorageSize(), &LaunchArgs, &EventOut));

if (phEvent) {
auto *Event = new ur_event_handle_t_();
auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
Event->OffloadEvent = EventOut;
*phEvent = Event;
}
@@ -94,10 +94,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
}

namespace {
ur_result_t doMemcpy(ur_queue_handle_t hQueue, void *DestPtr,
ol_device_handle_t DestDevice, const void *SrcPtr,
ol_device_handle_t SrcDevice, size_t size, bool blocking,
uint32_t numEventsInWaitList,
ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
void *DestPtr, ol_device_handle_t DestDevice,
const void *SrcPtr, ol_device_handle_t SrcDevice,
size_t size, bool blocking, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
// Ignore wait list for now
@@ -111,11 +111,11 @@ ur_result_t doMemcpy(ur_queue_handle_t hQueue, void *DestPtr,
SrcDevice, size, phEvent ? &EventOut : nullptr));

if (blocking) {
OL_RETURN_ON_ERR(olWaitQueue(hQueue->OffloadQueue));
OL_RETURN_ON_ERR(olSyncQueue(hQueue->OffloadQueue));
}

if (phEvent) {
auto *Event = new ur_event_handle_t_();
auto *Event = new ur_event_handle_t_(Command, hQueue);
Event->OffloadEvent = EventOut;
*phEvent = Event;
}
@@ -131,8 +131,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
char *DevPtr =
reinterpret_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);

return doMemcpy(hQueue, pDst, Adapter->HostDevice, DevPtr + offset,
hQueue->OffloadDevice, size, blockingRead,
return doMemcpy(UR_COMMAND_MEM_BUFFER_READ, hQueue, pDst, Adapter->HostDevice,
DevPtr + offset, hQueue->OffloadDevice, size, blockingRead,
numEventsInWaitList, phEventWaitList, phEvent);
}

@@ -143,9 +143,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
char *DevPtr =
reinterpret_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);

return doMemcpy(hQueue, DevPtr + offset, hQueue->OffloadDevice, pSrc,
Adapter->HostDevice, size, blockingWrite, numEventsInWaitList,
phEventWaitList, phEvent);
return doMemcpy(UR_COMMAND_MEM_BUFFER_WRITE, hQueue, DevPtr + offset,
hQueue->OffloadDevice, pSrc, Adapter->HostDevice, size,
blockingWrite, numEventsInWaitList, phEventWaitList, phEvent);
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead(
@@ -159,10 +159,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead(
return Err;
}

return doMemcpy(hQueue, pDst, Adapter->HostDevice,
reinterpret_cast<const char *>(Ptr) + offset,
hQueue->OffloadDevice, count, blockingRead,
numEventsInWaitList, phEventWaitList, phEvent);
return doMemcpy(
UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ, hQueue, pDst, Adapter->HostDevice,
reinterpret_cast<const char *>(Ptr) + offset, hQueue->OffloadDevice,
count, blockingRead, numEventsInWaitList, phEventWaitList, phEvent);
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
@@ -176,18 +176,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
return Err;
}

return doMemcpy(hQueue, reinterpret_cast<char *>(Ptr) + offset,
hQueue->OffloadDevice, pSrc, Adapter->HostDevice, count,
blockingWrite, numEventsInWaitList, phEventWaitList, phEvent);
return doMemcpy(UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE, hQueue,
reinterpret_cast<char *>(Ptr) + offset, hQueue->OffloadDevice,
pSrc, Adapter->HostDevice, count, blockingWrite,
numEventsInWaitList, phEventWaitList, phEvent);
}

ur_result_t enqueueNoOp(ur_queue_handle_t hQueue, ur_event_handle_t *phEvent) {
ur_result_t enqueueNoOp(ur_command_t Type, ur_queue_handle_t hQueue,
ur_event_handle_t *phEvent) {
// This path is a no-op, but we can't output a real event because
// Offload doesn't currently support creating arbitrary events, and we
// don't know the last real event in the queue. Instead we just have to
// wait on the whole queue and then return an empty (implicitly
// finished) event.
*phEvent = ur_event_handle_t_::createEmptyEvent();
*phEvent = ur_event_handle_t_::createEmptyEvent(Type, hQueue);
return urQueueFinish(hQueue);
}

@@ -221,7 +223,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
}

if (phEvent) {
enqueueNoOp(hQueue, phEvent);
enqueueNoOp(UR_COMMAND_MEM_BUFFER_MAP, hQueue, phEvent);
}
}
*ppRetMap = MapPtr;
@@ -255,7 +257,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
}

if (phEvent) {
enqueueNoOp(hQueue, phEvent);
enqueueNoOp(UR_COMMAND_MEM_UNMAP, hQueue, phEvent);
}
}
BufferImpl.unmap(pMappedPtr);
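The offload adapter changes above record a ur_command_t and the owning queue on every event. A minimal caller-side sketch of what that enables; hEvent is assumed to come from one of the enqueue entry points in this diff:

// Hypothetical usage: ask which command produced an event, which the offload
// adapter can now report because events carry their command type.
ur_command_t Cmd;
ur_result_t Res = urEventGetInfo(hEvent, UR_EVENT_INFO_COMMAND_TYPE,
                                 sizeof(Cmd), &Cmd, nullptr);
if (Res == UR_RESULT_SUCCESS && Cmd == UR_COMMAND_MEM_BUFFER_READ) {
  // The event was produced by urEnqueueMemBufferRead.
}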