Skip to content

Commit 6e8efa3

Browse files
authored
Merge pull request #1958 from igchor/kernel_helpers
[L0] move kernel helper functions to a separate file
2 parents 78c003e + 5001a40 commit 6e8efa3

File tree

6 files changed

+257
-198
lines changed

6 files changed

+257
-198
lines changed

source/adapters/level_zero/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ if(UR_BUILD_ADAPTER_L0)
112112
${CMAKE_CURRENT_SOURCE_DIR}/queue_api.hpp
113113
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
114114
${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp
115+
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp
115116
${CMAKE_CURRENT_SOURCE_DIR}/ur_level_zero.cpp
116117
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
117118
${CMAKE_CURRENT_SOURCE_DIR}/context.cpp
@@ -130,6 +131,7 @@ if(UR_BUILD_ADAPTER_L0)
130131
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
131132
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
132133
${CMAKE_CURRENT_SOURCE_DIR}/image.cpp
134+
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
133135
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
134136
)
135137

source/adapters/level_zero/command_buffer.cpp

Lines changed: 4 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
//
99
//===----------------------------------------------------------------------===//
1010
#include "command_buffer.hpp"
11+
#include "helpers/kernel_helpers.hpp"
1112
#include "logger/ur_logger.hpp"
1213
#include "ur_level_zero.hpp"
1314

@@ -78,130 +79,6 @@ preferCopyEngineForFill(ur_exp_command_buffer_handle_t CommandBuffer,
7879
return UR_RESULT_SUCCESS;
7980
}
8081

81-
/**
82-
* Calculates a work group size for the kernel based on the GlobalWorkSize or
83-
* the LocalWorkSize if provided.
84-
* @param[in][optional] Kernel The Kernel. Used when LocalWorkSize is not
85-
* provided.
86-
* @param[in][optional] Device The device associated with the kernel. Used when
87-
* LocalWorkSize is not provided.
88-
* @param[out] ZeThreadGroupDimensions Number of work groups in each dimension.
89-
* @param[out] WG The work group size for each dimension.
90-
* @param[in] WorkDim The number of dimensions in the kernel.
91-
* @param[in] GlobalWorkSize The global work size.
92-
* @param[in][optional] LocalWorkSize The local work size.
93-
* @return UR_RESULT_SUCCESS or an error code on failure.
94-
*/
95-
ur_result_t calculateKernelWorkDimensions(
96-
ur_kernel_handle_t Kernel, ur_device_handle_t Device,
97-
ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3],
98-
uint32_t WorkDim, const size_t *GlobalWorkSize,
99-
const size_t *LocalWorkSize) {
100-
101-
UR_ASSERT(GlobalWorkSize, UR_RESULT_ERROR_INVALID_VALUE);
102-
// If LocalWorkSize is not provided then Kernel must be provided to query
103-
// suggested group size.
104-
UR_ASSERT(LocalWorkSize || Kernel, UR_RESULT_ERROR_INVALID_VALUE);
105-
106-
// New variable needed because GlobalWorkSize parameter might not be of size
107-
// 3
108-
size_t GlobalWorkSize3D[3]{1, 1, 1};
109-
std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);
110-
111-
if (LocalWorkSize) {
112-
WG[0] = ur_cast<uint32_t>(LocalWorkSize[0]);
113-
WG[1] = WorkDim >= 2 ? ur_cast<uint32_t>(LocalWorkSize[1]) : 1;
114-
WG[2] = WorkDim == 3 ? ur_cast<uint32_t>(LocalWorkSize[2]) : 1;
115-
} else {
116-
// We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize3D
117-
// values do not fit to 32-bit that the API only supports currently.
118-
bool SuggestGroupSize = true;
119-
for (int I : {0, 1, 2}) {
120-
if (GlobalWorkSize3D[I] > UINT32_MAX) {
121-
SuggestGroupSize = false;
122-
}
123-
}
124-
if (SuggestGroupSize) {
125-
ZE2UR_CALL(zeKernelSuggestGroupSize,
126-
(Kernel->ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
127-
GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
128-
} else {
129-
for (int I : {0, 1, 2}) {
130-
// Try to find a I-dimension WG size that the GlobalWorkSize3D[I] is
131-
// fully divisable with. Start with the max possible size in
132-
// each dimension.
133-
uint32_t GroupSize[] = {
134-
Device->ZeDeviceComputeProperties->maxGroupSizeX,
135-
Device->ZeDeviceComputeProperties->maxGroupSizeY,
136-
Device->ZeDeviceComputeProperties->maxGroupSizeZ};
137-
GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]);
138-
while (GlobalWorkSize3D[I] % GroupSize[I]) {
139-
--GroupSize[I];
140-
}
141-
if (GlobalWorkSize[I] / GroupSize[I] > UINT32_MAX) {
142-
logger::debug("calculateKernelWorkDimensions: can't find a WG size "
143-
"suitable for global work size > UINT32_MAX");
144-
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
145-
}
146-
WG[I] = GroupSize[I];
147-
}
148-
logger::debug("calculateKernelWorkDimensions: using computed WG "
149-
"size = {{{}, {}, {}}}",
150-
WG[0], WG[1], WG[2]);
151-
}
152-
}
153-
154-
// TODO: assert if sizes do not fit into 32-bit?
155-
switch (WorkDim) {
156-
case 3:
157-
ZeThreadGroupDimensions.groupCountX =
158-
ur_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
159-
ZeThreadGroupDimensions.groupCountY =
160-
ur_cast<uint32_t>(GlobalWorkSize3D[1] / WG[1]);
161-
ZeThreadGroupDimensions.groupCountZ =
162-
ur_cast<uint32_t>(GlobalWorkSize3D[2] / WG[2]);
163-
break;
164-
case 2:
165-
ZeThreadGroupDimensions.groupCountX =
166-
ur_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
167-
ZeThreadGroupDimensions.groupCountY =
168-
ur_cast<uint32_t>(GlobalWorkSize3D[1] / WG[1]);
169-
WG[2] = 1;
170-
break;
171-
case 1:
172-
ZeThreadGroupDimensions.groupCountX =
173-
ur_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
174-
WG[1] = WG[2] = 1;
175-
break;
176-
177-
default:
178-
logger::error("calculateKernelWorkDimensions: unsupported work_dim");
179-
return UR_RESULT_ERROR_INVALID_VALUE;
180-
}
181-
182-
// Error handling for non-uniform group size case
183-
if (GlobalWorkSize3D[0] !=
184-
size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
185-
logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
186-
"is not a multiple of the group size in the 1st dimension");
187-
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
188-
}
189-
if (GlobalWorkSize3D[1] !=
190-
size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
191-
logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
192-
"is not a multiple of the group size in the 2nd dimension");
193-
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
194-
}
195-
if (GlobalWorkSize3D[2] !=
196-
size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
197-
logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
198-
"is not a multiple of the group size in the 3rd dimension");
199-
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
200-
}
201-
202-
return UR_RESULT_SUCCESS;
203-
}
204-
20582
/**
20683
* Helper function for finding the Level Zero events associated with the
20784
* commands in a command-buffer, each event is pointed to by a sync-point in the
@@ -880,7 +757,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
880757

881758
ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
882759
uint32_t WG[3];
883-
UR_CALL(calculateKernelWorkDimensions(Kernel, CommandBuffer->Device,
760+
UR_CALL(calculateKernelWorkDimensions(Kernel->ZeKernel, CommandBuffer->Device,
884761
ZeThreadGroupDimensions, WG, WorkDim,
885762
GlobalWorkSize, LocalWorkSize));
886763

@@ -1587,8 +1464,8 @@ ur_result_t updateKernelCommand(
15871464

15881465
uint32_t WG[3];
15891466
UR_CALL(calculateKernelWorkDimensions(
1590-
Command->Kernel, CommandBuffer->Device, ZeThreadGroupDimensions, WG,
1591-
Dim, NewGlobalWorkSize, NewLocalWorkSize));
1467+
Command->Kernel->ZeKernel, CommandBuffer->Device,
1468+
ZeThreadGroupDimensions, WG, Dim, NewGlobalWorkSize, NewLocalWorkSize));
15921469

15931470
auto MutableGroupCountDesc =
15941471
std::make_unique<ZeStruct<ze_mutable_group_count_exp_desc_t>>();
source/adapters/level_zero/helpers/kernel_helpers.cpp

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
//===--------- kernel_helpers.cpp - Level Zero Adapter -------------------===//
2+
//
3+
// Copyright (C) 2024 Intel Corporation
4+
//
5+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
6+
// Exceptions. See LICENSE.TXT
7+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include "kernel_helpers.hpp"
12+
#include "logger/ur_logger.hpp"
13+
14+
#include "../common.hpp"
15+
#include "../context.hpp"
16+
#include "../device.hpp"
17+
18+
/**
 * Picks a local work-group size for a kernel.
 *
 * When all three global sizes fit in 32 bits the decision is delegated to
 * zeKernelSuggestGroupSize. Otherwise each dimension independently gets the
 * largest value, not exceeding the device's maximum group size for that
 * dimension, that evenly divides the global size.
 *
 * @param[in] hDevice Device whose ZeDeviceComputeProperties provide the
 * per-dimension maximum group sizes used by the fallback path.
 * @param[in] hZeKernel Kernel handle passed to zeKernelSuggestGroupSize.
 * @param[in] GlobalWorkSize3D Global work size; all three entries are read.
 * @param[out] SuggestedLocalWorkSize3D Receives the selected group size for
 * each of the three dimensions.
 * @return UR_RESULT_SUCCESS, an error produced through ZE2UR_CALL, or
 * UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE when no usable size can be computed.
 */
ur_result_t getSuggestedLocalWorkSize(ur_device_handle_t hDevice,
                                      ze_kernel_handle_t hZeKernel,
                                      size_t GlobalWorkSize3D[3],
                                      uint32_t SuggestedLocalWorkSize3D[3]) {
  uint32_t *WG = SuggestedLocalWorkSize3D;

  // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize
  // values do not fit to 32-bit that the API only supports currently.
  bool SuggestGroupSize = true;
  for (int I : {0, 1, 2}) {
    if (GlobalWorkSize3D[I] > UINT32_MAX) {
      SuggestGroupSize = false;
    }
  }

  if (SuggestGroupSize) {
    ZE2UR_CALL(zeKernelSuggestGroupSize,
               (hZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
                GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
    return UR_RESULT_SUCCESS;
  }

  // The device limits do not change between dimensions, so read them once.
  const uint32_t MaxGroupSize[3] = {
      hDevice->ZeDeviceComputeProperties->maxGroupSizeX,
      hDevice->ZeDeviceComputeProperties->maxGroupSizeY,
      hDevice->ZeDeviceComputeProperties->maxGroupSizeZ};

  for (int I : {0, 1, 2}) {
    // Try to find a I-dimension WG size that the GlobalWorkSize[I] is
    // fully divisible with. Start with the max possible size in
    // each dimension.
    uint32_t GroupSize = static_cast<uint32_t>(
        (std::min)(size_t(MaxGroupSize[I]), GlobalWorkSize3D[I]));

    // A zero start value (zero global size or zero device limit) would make
    // the modulo and division below divide by zero.
    if (GroupSize == 0) {
      logger::error("getSuggestedLocalWorkSize: can't find a WG size for a "
                    "zero global work size or zero max group size");
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }

    // Terminates at the latest when GroupSize reaches 1.
    while (GlobalWorkSize3D[I] % GroupSize) {
      --GroupSize;
    }

    // The resulting group count must itself fit in 32 bits.
    if (GlobalWorkSize3D[I] / GroupSize > UINT32_MAX) {
      logger::error("getSuggestedLocalWorkSize: can't find a WG size "
                    "suitable for global work size > UINT32_MAX");
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }
    WG[I] = GroupSize;
  }

  logger::debug(
      "getSuggestedLocalWorkSize: using computed WG size = {{{}, {}, {}}}",
      WG[0], WG[1], WG[2]);

  return UR_RESULT_SUCCESS;
}
63+
64+
/**
 * Sets the global work offset of a Level Zero kernel through
 * zeKernelSetGlobalOffsetExp.
 *
 * @param[in] Context Context whose platform is checked for the driver's
 * global offset extension.
 * @param[in] Kernel Level Zero kernel handle to set the offset on.
 * @param[in] GlobalWorkOffset Pointer to the offsets; exactly three elements
 * (X, Y, Z) are read.
 * @return UR_RESULT_SUCCESS on success; UR_RESULT_ERROR_INVALID_VALUE when the
 * extension is unavailable or an offset does not fit in 32 bits;
 * UR_RESULT_ERROR_INVALID_NULL_POINTER when GlobalWorkOffset is null; or an
 * error produced through ZE2UR_CALL.
 */
ur_result_t setKernelGlobalOffset(ur_context_handle_t Context,
                                  ze_kernel_handle_t Kernel,
                                  const size_t *GlobalWorkOffset) {
  if (!Context->getPlatform()->ZeDriverGlobalOffsetExtensionFound) {
    logger::debug("No global offset extension found on this driver");
    return UR_RESULT_ERROR_INVALID_VALUE;
  }

  UR_ASSERT(GlobalWorkOffset, UR_RESULT_ERROR_INVALID_NULL_POINTER);

  // zeKernelSetGlobalOffsetExp takes 32-bit offsets; reject values that would
  // otherwise be silently truncated.
  for (int I : {0, 1, 2}) {
    if (GlobalWorkOffset[I] > UINT32_MAX) {
      logger::error("setKernelGlobalOffset: global work offset does not fit "
                    "into 32 bits");
      return UR_RESULT_ERROR_INVALID_VALUE;
    }
  }

  ZE2UR_CALL(zeKernelSetGlobalOffsetExp,
             (Kernel, static_cast<uint32_t>(GlobalWorkOffset[0]),
              static_cast<uint32_t>(GlobalWorkOffset[1]),
              static_cast<uint32_t>(GlobalWorkOffset[2])));

  return UR_RESULT_SUCCESS;
}
78+
79+
/**
 * Computes the work-group size and the number of work groups per dimension
 * for a kernel launch.
 *
 * The group size is taken from LocalWorkSize when it is provided and from
 * getSuggestedLocalWorkSize otherwise. Dimensions at or beyond WorkDim get a
 * group size and a group count of 1.
 *
 * @param[in][optional] Kernel Level Zero kernel handle. Required when
 * LocalWorkSize is not provided.
 * @param[in][optional] Device Device forwarded to getSuggestedLocalWorkSize
 * when LocalWorkSize is not provided.
 * @param[out] ZeThreadGroupDimensions Number of work groups in each dimension.
 * @param[out] WG The work group size for each dimension.
 * @param[in] WorkDim Number of dimensions; must be 1, 2 or 3.
 * @param[in] GlobalWorkSize Global work size; WorkDim elements are read.
 * @param[in][optional] LocalWorkSize Local work size; WorkDim elements are
 * read when provided.
 * @return UR_RESULT_SUCCESS; UR_RESULT_ERROR_INVALID_VALUE for missing
 * arguments or an unsupported WorkDim;
 * UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE when a group size is zero or does
 * not evenly divide the global size; or an error from
 * getSuggestedLocalWorkSize.
 */
ur_result_t calculateKernelWorkDimensions(
    ze_kernel_handle_t Kernel, ur_device_handle_t Device,
    ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3],
    uint32_t WorkDim, const size_t *GlobalWorkSize,
    const size_t *LocalWorkSize) {

  UR_ASSERT(GlobalWorkSize, UR_RESULT_ERROR_INVALID_VALUE);
  // If LocalWorkSize is not provided then Kernel must be provided to query
  // suggested group size.
  UR_ASSERT(LocalWorkSize || Kernel, UR_RESULT_ERROR_INVALID_VALUE);

  // Validate WorkDim before it is used as a copy length below; a value above
  // 3 would write past the end of GlobalWorkSize3D.
  if (WorkDim < 1 || WorkDim > 3) {
    logger::error("calculateKernelWorkDimensions: unsupported work_dim");
    return UR_RESULT_ERROR_INVALID_VALUE;
  }

  // New variable needed because GlobalWorkSize parameter might not be of size
  // 3
  size_t GlobalWorkSize3D[3]{1, 1, 1};
  std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);

  if (LocalWorkSize) {
    WG[0] = ur_cast<uint32_t>(LocalWorkSize[0]);
    WG[1] = WorkDim >= 2 ? ur_cast<uint32_t>(LocalWorkSize[1]) : 1;
    WG[2] = WorkDim == 3 ? ur_cast<uint32_t>(LocalWorkSize[2]) : 1;
  } else {
    UR_CALL(getSuggestedLocalWorkSize(Device, Kernel, GlobalWorkSize3D, WG));
  }

  // Dimensions the launch does not use always run with a group size of 1,
  // regardless of what was suggested for them.
  for (uint32_t I = WorkDim; I < 3; ++I) {
    WG[I] = 1;
  }

  // A zero group size would divide by zero when computing the group counts.
  for (uint32_t I = 0; I < 3; ++I) {
    if (WG[I] == 0) {
      logger::error("calculateKernelWorkDimensions: work group size must be "
                    "non-zero in every dimension");
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }
  }

  // TODO: assert if sizes do not fit into 32-bit?
  // All three counts are written so the result does not depend on how the
  // caller initialized ZeThreadGroupDimensions; unused dimensions yield 1
  // because both their global size and group size are 1.
  ZeThreadGroupDimensions.groupCountX =
      ur_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
  ZeThreadGroupDimensions.groupCountY =
      ur_cast<uint32_t>(GlobalWorkSize3D[1] / WG[1]);
  ZeThreadGroupDimensions.groupCountZ =
      ur_cast<uint32_t>(GlobalWorkSize3D[2] / WG[2]);

  // Error handling for non-uniform group size case
  if (GlobalWorkSize3D[0] !=
      size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 1st dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[1] !=
      size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 2nd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[2] !=
      size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 3rd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }

  return UR_RESULT_SUCCESS;
}
source/adapters/level_zero/helpers/kernel_helpers.hpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
//===--------- kernel_helpers.hpp - Level Zero Adapter -------------------===//
2+
//
3+
// Copyright (C) 2024 Intel Corporation
4+
//
5+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
6+
// Exceptions. See LICENSE.TXT
7+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#pragma once

#include <ur_api.h>
#include <ze_api.h>

/**
 * Calculates a work group size for the kernel based on the GlobalWorkSize or
 * the LocalWorkSize if provided.
 * @param[in][optional] Kernel The Level Zero kernel handle. Used when
 * LocalWorkSize is not provided.
 * @param[in][optional] Device The device associated with the kernel. Used when
 * LocalWorkSize is not provided.
 * @param[out] ZeThreadGroupDimensions Number of work groups in each dimension.
 * @param[out] WG The work group size for each dimension.
 * @param[in] WorkDim The number of dimensions in the kernel.
 * @param[in] GlobalWorkSize The global work size.
 * @param[in][optional] LocalWorkSize The local work size.
 * @return UR_RESULT_SUCCESS or an error code on failure.
 */
ur_result_t calculateKernelWorkDimensions(
    ze_kernel_handle_t Kernel, ur_device_handle_t Device,
    ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3],
    uint32_t WorkDim, const size_t *GlobalWorkSize,
    const size_t *LocalWorkSize);

/**
 * Sets the global offset for a kernel.
 * @param[in] Context Context whose platform is checked for the driver's
 * global offset extension.
 * @param[in] Kernel The Level Zero kernel handle to set the offset on.
 * @param[in] GlobalWorkOffset The global offset value; three elements
 * (X, Y, Z) are read.
 * @return UR_RESULT_SUCCESS or an error code on failure
 */
ur_result_t setKernelGlobalOffset(ur_context_handle_t Context,
                                  ze_kernel_handle_t Kernel,
                                  const size_t *GlobalWorkOffset);

/**
 * Get the suggested local work size for a kernel.
 * @param[in] hDevice The device associated with the kernel.
 * @param[in] hZeKernel The kernel handle.
 * @param[in] GlobalWorkSize3D The global work size; all three elements are
 * read.
 * @param[out] SuggestedLocalWorkSize3D The suggested local work size.
 * @return UR_RESULT_SUCCESS or an error code on failure.
 */
ur_result_t getSuggestedLocalWorkSize(ur_device_handle_t hDevice,
                                      ze_kernel_handle_t hZeKernel,
                                      size_t GlobalWorkSize3D[3],
                                      uint32_t SuggestedLocalWorkSize3D[3]);

0 commit comments

Comments
 (0)