Skip to content

Commit dc181bb

Browse files
authored
[SYCL][COMPAT] Ensure launched kernels are fully inlined (#15941)
This PR defines & uses a custom `syclcompat::detail::apply_helper` with `[[clang::always_inline]]` to ensure kernels are inlined.
1 parent cda38de commit dc181bb

File tree

3 files changed

+117
-5
lines changed

3 files changed

+117
-5
lines changed

sycl/include/syclcompat/launch_policy.hpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,17 @@ launch_policy(dim3, dim3, Ts...) -> launch_policy<
192192
detail::has_type<local_mem_size, std::tuple<Ts...>>::value>;
193193

194194
namespace detail {
195+
// Custom std::apply helpers to enable inlining
196+
template <class F, class Tuple, size_t... Is>
197+
__syclcompat_inline__ constexpr void apply_expand(F f, Tuple t,
198+
std::index_sequence<Is...>) {
199+
[[clang::always_inline]] f(get<Is>(t)...);
200+
}
201+
202+
template <class F, class Tuple>
203+
__syclcompat_inline__ constexpr void apply_helper(F f, Tuple t) {
204+
apply_expand(f, t, std::make_index_sequence<std::tuple_size<Tuple>{}>{});
205+
}
195206

196207
template <auto F, typename Range, typename KProps, bool HasLocalMem,
197208
typename... Args>
@@ -211,12 +222,16 @@ struct KernelFunctor {
211222
operator()(syclcompat::detail::range_to_item_t<Range>) const {
212223
if constexpr (HasLocalMem) {
213224
char *local_mem_ptr = static_cast<char *>(
214-
_local_acc.template get_multi_ptr<sycl::access::decorated::no>().get());
215-
std::apply(
216-
[lmem_ptr = local_mem_ptr](auto &&...args) { F(args..., lmem_ptr); },
225+
_local_acc.template get_multi_ptr<sycl::access::decorated::no>()
226+
.get());
227+
apply_helper(
228+
[lmem_ptr = local_mem_ptr](auto &&...args) {
229+
[[clang::always_inline]] F(args..., lmem_ptr);
230+
},
217231
_argument_tuple);
218232
} else {
219-
std::apply([](auto &&...args) { F(args...); }, _argument_tuple);
233+
apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); },
234+
_argument_tuple);
220235
}
221236
}
222237

sycl/test/syclcompat/launch/kernel_properties.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
// We need hardware which can support at least 2 sub-group sizes, since that
2424
// hardware (presumably) supports the `intel_reqd_sub_group_size` attribute.
2525
// REQUIRES: sg-32 && sg-16
26-
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} -o - | FileCheck %s
26+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s
2727
#include <sycl/ext/oneapi/kernel_properties/properties.hpp>
2828
#include <sycl/detail/core.hpp>
2929
#include <sycl/ext/oneapi/properties/properties.hpp>
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/***************************************************************************
2+
*
3+
* Copyright (C) Codeplay Software Ltd.
4+
*
5+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM
6+
* Exceptions. See https://llvm.org/LICENSE.txt for license information.
7+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*
15+
* SYCLcompat API
16+
*
17+
* launch_inlining.cpp
18+
*
19+
* Description:
20+
* Ensure kernels are inlined
21+
**************************************************************************/
22+
// RUN: %clangxx -fsycl -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s
23+
// We set -fgpu-inline-threshold=0 to disable heuristic inlining for the
24+
// purposes of the test
25+
#include <sycl/detail/core.hpp>
26+
#include <sycl/group_barrier.hpp>
27+
#include <syclcompat/launch.hpp>
28+
#include <syclcompat/memory.hpp>
29+
30+
namespace compat_exp = syclcompat::experimental;
31+
namespace sycl_exp = sycl::ext::oneapi::experimental;
32+
namespace sycl_intel_exp = sycl::ext::intel::experimental;
33+
34+
static constexpr int LOCAL_MEM_SIZE = 1024;
35+
36+
// CHECK: define {{.*}}spir_kernel{{.*}}write_mem_kernel{{.*}} {
37+
// CHECK-NOT: call {{.*}}write_mem_kernel
38+
// CHECK: }
39+
40+
template <typename T> void write_mem_kernel(T *data, int num_elements) {
41+
const int id =
42+
sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0);
43+
if (id < num_elements) {
44+
data[id] = static_cast<T>(id);
45+
}
46+
};
47+
48+
// CHECK: define {{.*}}spir_kernel{{.*}}dynamic_local_mem_typed_kernel{{.*}} {
49+
// CHECK-NOT: call {{.*}}dynamic_local_mem_typed_kernel
50+
// CHECK: }
51+
template <typename T>
52+
void dynamic_local_mem_typed_kernel(T *data, char *local_mem) {
53+
constexpr size_t num_elements = LOCAL_MEM_SIZE / sizeof(T);
54+
T *typed_local_mem = reinterpret_cast<T *>(local_mem);
55+
56+
const int id =
57+
sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0);
58+
if (id < num_elements) {
59+
typed_local_mem[id] = static_cast<T>(id);
60+
}
61+
sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_work_group<1>());
62+
if (id < num_elements) {
63+
data[id] = typed_local_mem[num_elements - id - 1];
64+
}
65+
};
66+
67+
/// Launches `write_mem_kernel<int>` through `compat_exp::launch` using a
/// policy built from two `dim3{32}` values, waits for it to finish, and
/// releases the buffer.
///
/// \return Always 0.
int test_write_mem() {
  compat_exp::launch_policy policy(syclcompat::dim3{32}, syclcompat::dim3{32});

  const int memsize = 1024; // buffer size in bytes
  int *device_buf = static_cast<int *>(syclcompat::malloc(memsize));

  // The kernel receives the element count, not the byte count.
  auto launched = compat_exp::launch<write_mem_kernel<int>>(
      policy, device_buf, memsize / sizeof(int));
  launched.wait();

  syclcompat::free(device_buf);
  return 0;
}
80+
81+
int test_lmem_launch() {
82+
int local_mem_size = LOCAL_MEM_SIZE;
83+
84+
size_t num_elements = local_mem_size / sizeof(int);
85+
int *d_a = (int *)syclcompat::malloc(local_mem_size);
86+
87+
compat_exp::launch_policy my_config(
88+
sycl::nd_range<1>{{256}, {256}},
89+
compat_exp::local_mem_size(local_mem_size));
90+
91+
compat_exp::launch<dynamic_local_mem_typed_kernel<int>>(my_config, d_a)
92+
.wait();
93+
94+
syclcompat::free(d_a);
95+
96+
return 0;
97+
}

0 commit comments

Comments
 (0)