Skip to content

Commit cba22d4

Browse files
authored
[UR] Consolidate kernel launch entry points in UR. (#18385)
We have two exp variants of urEnqueueKernelLaunch that don't differ significantly from the core entry point, and both have been stable for over a year. This change merges these entry points into the core one, and also pulls all related exp functionality into the core spec. The exp features in question are exp-cooperative-kernels and exp-launch-properties.
1 parent e2f97a7 commit cba22d4

File tree

88 files changed

+1598
-3630
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

88 files changed

+1598
-3630
lines changed

sycl/include/sycl/info/device_traits.def

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,8 +235,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_work_groups_2d, id<2>, __SYCL_TR
235235
__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_work_groups_3d, id<3>,
236236
UR_DEVICE_INFO_MAX_WORK_GROUPS_3D)
237237
__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_global_work_groups, size_t, __SYCL_TRAIT_HANDLED_IN_RT)
238-
__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_cuda_cluster_group, bool,
239-
UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP)
238+
__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_cuda_cluster_group, bool, __SYCL_TRAIT_HANDLED_IN_RT)
240239

241240
#ifdef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF
242241
#undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC

sycl/source/detail/device_impl.hpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -886,11 +886,10 @@ class device_impl : public std::enable_shared_from_this<device_impl> {
886886
}
887887

888888
CASE(info::device::ext_oneapi_cuda_cluster_group) {
889-
if (getBackend() != backend::ext_oneapi_cuda)
890-
return false;
891-
892-
return get_info_impl_nocheck<UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP>()
893-
.value_or(0) != 0;
889+
auto SupportFlags =
890+
get_info_impl<UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES>();
891+
return static_cast<bool>(
892+
SupportFlags & UR_KERNEL_LAUNCH_PROPERTIES_FLAG_CLUSTER_DIMENSION);
894893
}
895894

896895
// ext_codeplay_device_traits.def

sycl/source/detail/kernel_impl.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ kernel_impl::queryMaxNumWorkGroups(queue Queue,
371371

372372
uint32_t GroupCount{0};
373373
if (auto Result = Adapter->call_nocheck<
374-
UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
374+
UrApiKind::urKernelSuggestMaxCooperativeGroupCount>(
375375
Handle, DeviceHandleRef, Dimensions, WG, DynamicLocalMemorySize,
376376
&GroupCount);
377377
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE &&

sycl/source/detail/scheduler/commands.cpp

Lines changed: 19 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2474,65 +2474,43 @@ static ur_result_t SetKernelParamsAndLaunch(
24742474
if (EnforcedLocalSize)
24752475
LocalSize = RequiredWGSize;
24762476
}
2477-
24782477
const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 ||
24792478
NDRDesc.GlobalOffset[1] != 0 ||
24802479
NDRDesc.GlobalOffset[2] != 0;
24812480

2482-
std::vector<ur_exp_launch_property_t> property_list;
2481+
std::vector<ur_kernel_launch_property_t> property_list;
2482+
24832483
if (KernelUsesClusterLaunch) {
2484-
ur_exp_launch_property_value_t launch_property_value_cluster_range;
2484+
ur_kernel_launch_property_value_t launch_property_value_cluster_range;
24852485
launch_property_value_cluster_range.clusterDim[0] =
24862486
NDRDesc.ClusterDimensions[0];
24872487
launch_property_value_cluster_range.clusterDim[1] =
24882488
NDRDesc.ClusterDimensions[1];
24892489
launch_property_value_cluster_range.clusterDim[2] =
24902490
NDRDesc.ClusterDimensions[2];
24912491

2492-
property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION,
2492+
property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION,
24932493
launch_property_value_cluster_range});
2494-
2495-
if (IsCooperative) {
2496-
ur_exp_launch_property_value_t launch_property_value_cooperative;
2497-
launch_property_value_cooperative.cooperative = 1;
2498-
property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE,
2499-
launch_property_value_cooperative});
2500-
}
2494+
}
2495+
if (IsCooperative) {
2496+
ur_kernel_launch_property_value_t launch_property_value_cooperative;
2497+
launch_property_value_cooperative.cooperative = 1;
2498+
property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE,
2499+
launch_property_value_cooperative});
25012500
}
25022501
// If there is no implicit arg, let the driver handle it via a property
25032502
if (WorkGroupMemorySize && !ImplicitLocalArg.has_value()) {
2504-
property_list.push_back(
2505-
{UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY, {{WorkGroupMemorySize}}});
2506-
}
2507-
if (!property_list.empty()) {
2508-
ur_event_handle_t UREvent = nullptr;
2509-
ur_result_t Error =
2510-
Adapter->call_nocheck<UrApiKind::urEnqueueKernelLaunchCustomExp>(
2511-
Queue.getHandleRef(), Kernel, NDRDesc.Dims,
2512-
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr,
2513-
&NDRDesc.GlobalSize[0], LocalSize, property_list.size(),
2514-
property_list.data(), RawEvents.size(),
2515-
RawEvents.empty() ? nullptr : &RawEvents[0],
2516-
OutEventImpl ? &UREvent : nullptr);
2517-
if ((Error == UR_RESULT_SUCCESS) && OutEventImpl) {
2518-
OutEventImpl->setHandle(UREvent);
2519-
}
2520-
return Error;
2503+
property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY,
2504+
{{WorkGroupMemorySize}}});
25212505
}
25222506
ur_event_handle_t UREvent = nullptr;
2523-
ur_result_t Error =
2524-
[&](auto... Args) {
2525-
if (IsCooperative) {
2526-
return Adapter
2527-
->call_nocheck<UrApiKind::urEnqueueCooperativeKernelLaunchExp>(
2528-
Args...);
2529-
}
2530-
return Adapter->call_nocheck<UrApiKind::urEnqueueKernelLaunch>(Args...);
2531-
}(Queue.getHandleRef(), Kernel, NDRDesc.Dims,
2532-
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr, &NDRDesc.GlobalSize[0],
2533-
LocalSize, RawEvents.size(),
2534-
RawEvents.empty() ? nullptr : &RawEvents[0],
2535-
OutEventImpl ? &UREvent : nullptr);
2507+
ur_result_t Error = Adapter->call_nocheck<UrApiKind::urEnqueueKernelLaunch>(
2508+
Queue.getHandleRef(), Kernel, NDRDesc.Dims,
2509+
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr, &NDRDesc.GlobalSize[0],
2510+
LocalSize, property_list.size(),
2511+
property_list.empty() ? nullptr : property_list.data(), RawEvents.size(),
2512+
RawEvents.empty() ? nullptr : &RawEvents[0],
2513+
OutEventImpl ? &UREvent : nullptr);
25362514
if (Error == UR_RESULT_SUCCESS && OutEventImpl) {
25372515
OutEventImpl->setHandle(UREvent);
25382516
}

sycl/source/detail/ur_device_info_ret_types.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,6 @@ MAP(UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_SUPPORT_EXP, ur_bool_t)
172172
MAP(UR_DEVICE_INFO_BINDLESS_SAMPLE_1D_USM_SUPPORT_EXP, ur_bool_t)
173173
MAP(UR_DEVICE_INFO_BINDLESS_SAMPLE_2D_USM_SUPPORT_EXP, ur_bool_t)
174174
MAP(UR_DEVICE_INFO_BINDLESS_UNIQUE_ADDRESSING_PER_DIM_SUPPORT_EXP, ur_bool_t)
175-
MAP(UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP, ur_bool_t)
176175
MAP(UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP, ur_bool_t)
177176
MAP(UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP, ur_bool_t)
178177
MAP(UR_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP, ur_bool_t)
@@ -188,4 +187,5 @@ MAP(UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP, ur_bool_t)
188187
MAP(UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP, uint32_t)
189188
MAP(UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP, ur_bool_t)
190189
MAP(UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP, ur_bool_t)
190+
MAP(UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES, ur_kernel_launch_properties_flags_t)
191191
// clang-format on

sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// Checks whether or not event Dependencies are honored by
2-
// urEnqueueKernelLaunchCustomExp
2+
// urEnqueueKernelLaunch with cluster dimensions
33
// REQUIRES: target-nvidia, aspect-ext_oneapi_cuda_cluster_group
44
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out
55
// RUN: %{run} %t.out

sycl/unittests/helpers/UrMock.hpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -393,10 +393,9 @@ inline ur_result_t mock_urEventGetInfo(void *pParams) {
393393
}
394394
}
395395

396-
inline ur_result_t
397-
mock_urKernelSuggestMaxCooperativeGroupCountExp(void *pParams) {
396+
inline ur_result_t mock_urKernelSuggestMaxCooperativeGroupCount(void *pParams) {
398397
auto params = reinterpret_cast<
399-
ur_kernel_suggest_max_cooperative_group_count_exp_params_t *>(pParams);
398+
ur_kernel_suggest_max_cooperative_group_count_params_t *>(pParams);
400399
**params->ppGroupCountRet = 1;
401400
return UR_RESULT_SUCCESS;
402401
}
@@ -571,8 +570,8 @@ template <sycl::backend Backend = backend::opencl> class UrMock {
571570
ADD_DEFAULT_OVERRIDE(urProgramGetInfo, mock_urProgramGetInfo)
572571
ADD_DEFAULT_OVERRIDE(urKernelGetGroupInfo, mock_urKernelGetGroupInfo)
573572
ADD_DEFAULT_OVERRIDE(urEventGetInfo, mock_urEventGetInfo)
574-
ADD_DEFAULT_OVERRIDE(urKernelSuggestMaxCooperativeGroupCountExp,
575-
mock_urKernelSuggestMaxCooperativeGroupCountExp)
573+
ADD_DEFAULT_OVERRIDE(urKernelSuggestMaxCooperativeGroupCount,
574+
mock_urKernelSuggestMaxCooperativeGroupCount)
576575
ADD_DEFAULT_OVERRIDE(urDeviceSelectBinary, mock_urDeviceSelectBinary)
577576
ADD_DEFAULT_OVERRIDE(urPlatformGetBackendOption,
578577
mock_urPlatformGetBackendOption)

0 commit comments

Comments
 (0)