Skip to content

Commit b75c7af

Browse files
[SYCL] Fix use of memcpy in group_load and marray builtins (#16501)
The implementations of group_load and the marray builtins currently use std::memcpy, which may fail to compile on device when the user sets -D_FORTIFY_SOURCE=2. This commit fixes this by using sycl::detail::memcpy_no_adl instead. This solution should be replaced by a devicelib implementation of `__memcpy_chk` when device-side abort/assertions work as intended. --------- Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
1 parent df00dcb commit b75c7af

File tree

4 files changed

+55
-4
lines changed

4 files changed

+55
-4
lines changed

sycl/include/sycl/detail/builtins/builtins.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,8 @@ auto builtin_marray_impl(FuncTy F, const Ts &...x) {
121121
else
122122
return F(to_vec2(x, I * 2)...);
123123
}();
124-
std::memcpy(&Res[I * 2], &PartialRes, sizeof(decltype(PartialRes)));
124+
sycl::detail::memcpy_no_adl(&Res[I * 2], &PartialRes,
125+
sizeof(decltype(PartialRes)));
125126
}
126127
if (N % 2)
127128
Res[N - 1] = F(x[N - 1]...);

sycl/include/sycl/detail/memcpy.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ namespace detail {
1919
// sycl::detail namespace, like in the following code:
2020
// sycl::vec<int , 1> a, b;
2121
// memcpy(&a, &b, sizeof(sycl::vec<int , 1>));
22-
inline void memcpy_no_adl(void *Dst, const void *Src, size_t Size) {
22+
template <typename T1, typename T2>
23+
inline void memcpy_no_adl(T1 *Dst, const T2 *Src, size_t Size) {
2324
#ifdef __SYCL_DEVICE_ONLY__
2425
__builtin_memcpy(Dst, Src, Size);
2526
#else

sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,11 +284,11 @@ group_load(Group g, InputIteratorT in_ptr,
284284

285285
if constexpr (std::is_same_v<std::remove_const_t<value_type>, OutputT>) {
286286
static_assert(sizeof(load) == out.size_bytes());
287-
std::memcpy(out.begin(), &load, out.size_bytes());
287+
sycl::detail::memcpy_no_adl(out.begin(), &load, out.size_bytes());
288288
} else {
289289
std::remove_const_t<value_type> values[ElementsPerWorkItem];
290290
static_assert(sizeof(load) == sizeof(values));
291-
std::memcpy(values, &load, sizeof(values));
291+
sycl::detail::memcpy_no_adl(values, &load, sizeof(values));
292292

293293
// Note: can't `memcpy` directly into `out` because that might bypass
294294
// an implicit conversion required by the specification.
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// REQUIRES: aspect-usm_device_allocations
2+
// RUN: %{build} -D_FORTIFY_SOURCE=2 -o %t.out
3+
// RUN: %{run} %t.out
4+
5+
// Checks that group_load runs even when the source code is fortified. This
6+
// failed at one point due to the use of std::memcpy in the implementation,
7+
// which would introduce an assert in device code when fortified, causing it to fail
8+
// to JIT compile.
9+
10+
#include <sycl/detail/core.hpp>
11+
#include <sycl/ext/oneapi/experimental/group_load_store.hpp>
12+
#include <sycl/sub_group.hpp>
13+
#include <sycl/usm.hpp>
14+
15+
namespace syclexp = sycl::ext::oneapi::experimental;
16+
17+
int main(void) {
18+
sycl::queue Q;
19+
20+
constexpr std::size_t N = 256;
21+
constexpr std::uint32_t LWS = 64;
22+
constexpr std::uint32_t VecSize = 4;
23+
constexpr std::size_t NGroups = (N + VecSize * LWS - 1) / (VecSize * LWS);
24+
25+
int *Ptr = sycl::malloc_device<int>(N, Q);
26+
27+
Q.submit([&](sycl::handler &CGH) {
28+
CGH.parallel_for(
29+
sycl::nd_range<1>{sycl::range<1>{NGroups * LWS}, sycl::range<1>{LWS}},
30+
[=](sycl::nd_item<1> It) {
31+
const std::size_t GID = It.get_global_id();
32+
const sycl::sub_group &SG = It.get_sub_group();
33+
34+
constexpr auto Striped = syclexp::properties{
35+
syclexp::data_placement_striped, syclexp::full_group};
36+
37+
auto MPtr = sycl::address_space_cast<
38+
sycl::access::address_space::global_space,
39+
sycl::access::decorated::yes>(Ptr);
40+
41+
sycl::vec<int, VecSize> X{};
42+
syclexp::group_load(SG, MPtr, X, Striped);
43+
});
44+
}).wait();
45+
46+
sycl::free(Ptr, Q);
47+
48+
return 0;
49+
}

0 commit comments

Comments
 (0)