Skip to content

Commit fcdc6a7

Browse files
authored
Merge branch 'main' into yc-0903-handle-option-exception
2 parents f54a221 + 35b83de commit fcdc6a7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+1381
-1087
lines changed

.github/workflows/cmake.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
compiler: [{c: gcc, cxx: g++}]
2121
libbacktrace: ['-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF']
2222
pool_tracking: ['-DUMF_ENABLE_POOL_TRACKING=ON', '-DUMF_ENABLE_POOL_TRACKING=OFF']
23-
latency_tracking: ['-DUMF_ENABLE_LATENCY_TRACKING=OFF']
23+
latency_tracking: ['-DUR_ENABLE_LATENCY_HISTOGRAM=OFF']
2424
include:
2525
- os: 'ubuntu-22.04'
2626
build_type: Release
@@ -40,7 +40,7 @@ jobs:
4040
- os: 'ubuntu-22.04'
4141
build_type: Release
4242
compiler: {c: clang, cxx: clang++}
43-
latency_tracking: '-DUMF_ENABLE_LATENCY_TRACKING=ON'
43+
latency_tracking: '-DUR_ENABLE_LATENCY_HISTOGRAM=ON'
4444
runs-on: ${{ (matrix.os == 'ubuntu-22.04' && github.repository_owner == 'oneapi-src') && 'intel-ubuntu-22.04' || matrix.os }}
4545

4646
steps:

include/ur_print.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17403,6 +17403,11 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
1740317403
return os;
1740417404
}
1740517405

17406+
inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const ur_bool_t value) {
17407+
os << (value ? "true" : "false");
17408+
return os;
17409+
}
17410+
1740617411
namespace ur::details {
1740717412
///////////////////////////////////////////////////////////////////////////////
1740817413
// @brief Print pointer value

scripts/templates/print.hpp.mako

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,11 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
411411
%endfor
412412
%endfor
413413

414+
inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const ur_bool_t value) {
415+
os << (value ? "true" : "false");
416+
return os;
417+
}
418+
414419
namespace ${x}::details {
415420
///////////////////////////////////////////////////////////////////////////////
416421
// @brief Print pointer value

source/adapters/cuda/device.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
5757
return ReturnValue(4318u);
5858
}
5959
case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: {
60-
int ComputeUnits = 0;
61-
UR_CHECK_ERROR(cuDeviceGetAttribute(
62-
&ComputeUnits, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
63-
hDevice->get()));
64-
detail::ur::assertion(ComputeUnits >= 0);
65-
return ReturnValue(static_cast<uint32_t>(ComputeUnits));
60+
return ReturnValue(hDevice->getNumComputeUnits());
6661
}
6762
case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: {
6863
return ReturnValue(MaxWorkItemDimensions);

source/adapters/cuda/device.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ struct ur_device_handle_t_ {
3232
int MaxCapacityLocalMem{0};
3333
int MaxChosenLocalMem{0};
3434
bool MaxLocalMemSizeChosen{false};
35+
uint32_t NumComputeUnits{0};
3536

3637
public:
3738
ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
@@ -54,6 +55,10 @@ struct ur_device_handle_t_ {
5455
sizeof(MaxWorkGroupSize), &MaxWorkGroupSize,
5556
nullptr));
5657

58+
UR_CHECK_ERROR(cuDeviceGetAttribute(
59+
reinterpret_cast<int *>(&NumComputeUnits),
60+
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice));
61+
5762
// Set local mem max size if env var is present
5863
static const char *LocalMemSizePtrUR =
5964
std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE");
@@ -107,6 +112,8 @@ struct ur_device_handle_t_ {
107112
int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; };
108113

109114
bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; };
115+
116+
uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; };
110117
};
111118

112119
int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute);

source/adapters/cuda/image.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -759,13 +759,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
759759
cpy_desc.dstZ = pCopyRegion->dstOffset.z;
760760
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
761761
cpy_desc.srcHost = pSrc;
762-
cpy_desc.srcPitch = pCopyRegion->copyExtent.width * PixelSizeBytes;
763-
cpy_desc.srcHeight = pCopyRegion->copyExtent.height;
762+
cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes;
763+
cpy_desc.srcHeight = std::max(uint64_t{1}, pSrcImageDesc->height);
764764
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
765765
cpy_desc.dstArray = (CUarray)pDst;
766766
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
767767
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
768-
cpy_desc.Depth = pDstImageDesc->arraySize;
768+
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
769769
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
770770
}
771771
} else if (imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) {
@@ -855,10 +855,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
855855
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
856856
cpy_desc.dstHost = pDst;
857857
cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes;
858-
cpy_desc.dstHeight = pDstImageDesc->height;
858+
cpy_desc.dstHeight = std::max(uint64_t{1}, pDstImageDesc->height);
859859
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
860860
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
861-
cpy_desc.Depth = pSrcImageDesc->arraySize;
861+
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
862862
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
863863
}
864864
} else {
@@ -932,7 +932,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
932932
cpy_desc.dstArray = (CUarray)pDst;
933933
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
934934
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
935-
cpy_desc.Depth = pSrcImageDesc->arraySize;
935+
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
936936
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
937937
}
938938
// Synchronization is required here to handle the case of copying data

source/adapters/cuda/kernel.cpp

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -167,10 +167,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle(
167167
UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
168168
ur_kernel_handle_t hKernel, size_t localWorkSize,
169169
size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
170-
(void)hKernel;
171-
(void)localWorkSize;
172-
(void)dynamicSharedMemorySize;
173-
*pGroupCountRet = 1;
170+
UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL);
171+
172+
// We need to set the active current device for this kernel explicitly here,
173+
// because the occupancy querying API does not take device parameter.
174+
ur_device_handle_t Device = hKernel->getProgram()->getDevice();
175+
ScopedContext Active(Device);
176+
try {
177+
// We need to calculate max num of work-groups using per-device semantics.
178+
179+
int MaxNumActiveGroupsPerCU{0};
180+
UR_CHECK_ERROR(cuOccupancyMaxActiveBlocksPerMultiprocessor(
181+
&MaxNumActiveGroupsPerCU, hKernel->get(), localWorkSize,
182+
dynamicSharedMemorySize));
183+
detail::ur::assertion(MaxNumActiveGroupsPerCU >= 0);
184+
// Handle the case where we can't have all SMs active with at least 1 group
185+
// per SM. In that case, the device is still able to run 1 work-group, hence
186+
// we will manually check if it is possible with the available HW resources.
187+
if (MaxNumActiveGroupsPerCU == 0) {
188+
size_t MaxWorkGroupSize{};
189+
urKernelGetGroupInfo(
190+
hKernel, Device, UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE,
191+
sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr);
192+
size_t MaxLocalSizeBytes{};
193+
urDeviceGetInfo(Device, UR_DEVICE_INFO_LOCAL_MEM_SIZE,
194+
sizeof(MaxLocalSizeBytes), &MaxLocalSizeBytes, nullptr);
195+
if (localWorkSize > MaxWorkGroupSize ||
196+
dynamicSharedMemorySize > MaxLocalSizeBytes ||
197+
hasExceededMaxRegistersPerBlock(Device, hKernel, localWorkSize))
198+
*pGroupCountRet = 0;
199+
else
200+
*pGroupCountRet = 1;
201+
} else {
202+
// Multiply by the number of SMs (CUs = compute units) on the device in
203+
// order to retrieve the total number of groups/blocks that can be
204+
// launched.
205+
*pGroupCountRet = Device->getNumComputeUnits() * MaxNumActiveGroupsPerCU;
206+
}
207+
} catch (ur_result_t Err) {
208+
return Err;
209+
}
174210
return UR_RESULT_SUCCESS;
175211
}
176212

source/adapters/level_zero/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ if(UR_BUILD_ADAPTER_L0)
118118
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
119119
${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp
120120
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp
121+
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.hpp
121122
${CMAKE_CURRENT_SOURCE_DIR}/ur_level_zero.cpp
122123
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
123124
${CMAKE_CURRENT_SOURCE_DIR}/context.cpp
@@ -136,6 +137,7 @@ if(UR_BUILD_ADAPTER_L0)
136137
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
137138
${CMAKE_CURRENT_SOURCE_DIR}/image.cpp
138139
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
140+
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp
139141
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
140142
)
141143

@@ -199,13 +201,15 @@ if(UR_BUILD_ADAPTER_L0_V2)
199201
${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp
200202
${CMAKE_CURRENT_SOURCE_DIR}/program.hpp
201203
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp
204+
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.hpp
202205
${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp
203206
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
204207
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
205208
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
206209
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
207210
${CMAKE_CURRENT_SOURCE_DIR}/program.cpp
208211
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
212+
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp
209213
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
210214
# v2-only sources
211215
${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp
@@ -217,6 +221,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
217221
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider.hpp
218222
${CMAKE_CURRENT_SOURCE_DIR}/v2/event.hpp
219223
${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.hpp
224+
${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.hpp
220225
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.hpp
221226
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp
222227
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp
@@ -229,6 +234,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
229234
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider_normal.cpp
230235
${CMAKE_CURRENT_SOURCE_DIR}/v2/event.cpp
231236
${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.cpp
237+
${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.cpp
232238
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.cpp
233239
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_create.cpp
234240
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp

source/adapters/level_zero/context.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -512,7 +512,7 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
512512
// Create one event ZePool per MaxNumEventsPerPool events
513513
if (*ZePool == nullptr) {
514514
ze_event_pool_counter_based_exp_desc_t counterBasedExt = {
515-
ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC};
515+
ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC, nullptr, 0};
516516
ZeStruct<ze_event_pool_desc_t> ZeEventPoolDesc;
517517
ZeEventPoolDesc.count = MaxNumEventsPerPool;
518518
ZeEventPoolDesc.flags = 0;

source/adapters/level_zero/event.cpp

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,8 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
221221
return UR_RESULT_SUCCESS;
222222
}
223223

224-
ur_event_handle_t InternalEvent;
224+
ur_event_handle_t ResultEvent = nullptr;
225225
bool IsInternal = OutEvent == nullptr;
226-
ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
227226

228227
// For in-order queue and wait-list which is empty or has events from
229228
// the same queue just use the last command event as the barrier event.
@@ -234,7 +233,10 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
234233
EventWaitList) &&
235234
Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) {
236235
UR_CALL(ur::level_zero::urEventRetain(Queue->LastCommandEvent));
237-
*Event = Queue->LastCommandEvent;
236+
ResultEvent = Queue->LastCommandEvent;
237+
if (OutEvent) {
238+
*OutEvent = ResultEvent;
239+
}
238240
return UR_RESULT_SUCCESS;
239241
}
240242

@@ -264,16 +266,21 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
264266
EventWaitList, OkToBatch));
265267

266268
// Insert the barrier into the command-list and execute.
267-
UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal));
269+
UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, ResultEvent,
270+
IsInternal));
268271

269272
UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch));
270273

271274
// Because of the dependency between commands in the in-order queue we don't
272275
// need to keep track of any active barriers if we have in-order queue.
273276
if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) {
274-
auto UREvent = reinterpret_cast<ur_event_handle_t>(*Event);
277+
auto UREvent = reinterpret_cast<ur_event_handle_t>(ResultEvent);
275278
Queue->ActiveBarriers.add(UREvent);
276279
}
280+
281+
if (OutEvent) {
282+
*OutEvent = ResultEvent;
283+
}
277284
return UR_RESULT_SUCCESS;
278285
}
279286

@@ -361,14 +368,14 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
361368
// Insert a barrier with the events from each command-queue into the
362369
// convergence command list. The resulting event signals the convergence of
363370
// all barriers.
364-
UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, *Event,
365-
IsInternal));
371+
UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList,
372+
ResultEvent, IsInternal));
366373
} else {
367374
// If there is only a single queue then insert a barrier and the single
368375
// result event can be used as our active barrier and used as the return
369376
// event. Take into account whether output event is discarded or not.
370-
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{}, *Event,
371-
IsInternal));
377+
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{},
378+
ResultEvent, IsInternal));
372379
}
373380

374381
// Execute each command list so the barriers can be encountered.
@@ -384,8 +391,10 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
384391
}
385392

386393
UR_CALL(Queue->ActiveBarriers.clear());
387-
auto UREvent = reinterpret_cast<ur_event_handle_t>(*Event);
388-
Queue->ActiveBarriers.add(UREvent);
394+
Queue->ActiveBarriers.add(ResultEvent);
395+
if (OutEvent) {
396+
*OutEvent = ResultEvent;
397+
}
389398
return UR_RESULT_SUCCESS;
390399
}
391400

@@ -1508,8 +1517,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
15081517

15091518
std::shared_lock<ur_shared_mutex> Lock(EventList[I]->Mutex);
15101519

1511-
ur_device_handle_t QueueRootDevice;
1512-
ur_device_handle_t CurrentQueueRootDevice;
1520+
ur_device_handle_t QueueRootDevice = nullptr;
1521+
ur_device_handle_t CurrentQueueRootDevice = nullptr;
15131522
if (Queue) {
15141523
QueueRootDevice = Queue->Device;
15151524
CurrentQueueRootDevice = CurQueueDevice;

0 commit comments

Comments
 (0)