Skip to content

Commit 5edcf74

Browse files
authored
[SYCL] Enable mapping of group load/store functions to SPIRV built-ins for local address space (#16653)
Extension: https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_local_block_io.html Currently these built-ins for local address space are not supported by cpu/fpga backends, so introduce undocumented `native_local_block_io` property which allows to enable mapping to those built-ins. If this property is not provided then implementation falls back to naive approach.
1 parent 4d3d4e6 commit 5edcf74

File tree

12 files changed

+2996
-1360
lines changed

12 files changed

+2996
-1360
lines changed

sycl/include/sycl/__spirv/spirv_ops.hpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,47 @@ template <typename dataT>
445445
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void
446446
__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_global)) uint64_t *Ptr,
447447
dataT Data) noexcept;
448+
449+
template <typename dataT>
450+
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT
451+
__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local))
452+
uint8_t *Ptr) noexcept;
453+
454+
template <typename dataT>
455+
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void
456+
__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint8_t *Ptr,
457+
dataT Data) noexcept;
458+
459+
template <typename dataT>
460+
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT
461+
__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local))
462+
uint16_t *Ptr) noexcept;
463+
464+
template <typename dataT>
465+
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void
466+
__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint16_t *Ptr,
467+
dataT Data) noexcept;
468+
469+
template <typename dataT>
470+
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT
471+
__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local))
472+
uint32_t *Ptr) noexcept;
473+
474+
template <typename dataT>
475+
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void
476+
__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint32_t *Ptr,
477+
dataT Data) noexcept;
478+
479+
template <typename dataT>
480+
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT
481+
__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local))
482+
uint64_t *Ptr) noexcept;
483+
484+
template <typename dataT>
485+
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void
486+
__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint64_t *Ptr,
487+
dataT Data) noexcept;
488+
448489
template <int W, int rW>
449490
extern __DPCPP_SYCL_EXTERNAL sycl::detail::ap_int<rW>
450491
__spirv_FixedSqrtINTEL(sycl::detail::ap_int<W> a, bool S, int32_t I, int32_t rI,

sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp

Lines changed: 90 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@ struct naive_key : detail::compile_time_property_key<detail::PropKind::Naive> {
5858
using value_t = property_value<naive_key>;
5959
};
6060
inline constexpr naive_key::value_t naive;
61+
62+
struct native_local_block_io_key
63+
: detail::compile_time_property_key<detail::PropKind::NativeLocalBlockIO> {
64+
using value_t = property_value<native_local_block_io_key>;
65+
};
66+
inline constexpr native_local_block_io_key::value_t native_local_block_io;
67+
6168
using namespace sycl::detail;
6269
} // namespace detail
6370

@@ -154,7 +161,6 @@ template <typename BlockInfoTy> struct BlockTypeInfo;
154161
template <typename IteratorT, std::size_t ElementsPerWorkItem, bool Blocked>
155162
struct BlockTypeInfo<BlockInfo<IteratorT, ElementsPerWorkItem, Blocked>> {
156163
using BlockInfoTy = BlockInfo<IteratorT, ElementsPerWorkItem, Blocked>;
157-
static_assert(BlockInfoTy::has_builtin);
158164

159165
using block_type = detail::fixed_width_unsigned<BlockInfoTy::block_size>;
160166

@@ -163,15 +169,23 @@ struct BlockTypeInfo<BlockInfo<IteratorT, ElementsPerWorkItem, Blocked>> {
163169
typename std::iterator_traits<IteratorT>::reference>>,
164170
std::add_const_t<block_type>, block_type>;
165171

166-
using block_pointer_type = typename detail::DecoratedType<
167-
block_pointer_elem_type, access::address_space::global_space>::type *;
172+
static constexpr auto deduced_address_space =
173+
detail::deduce_AS<std::remove_cv_t<IteratorT>>::value;
174+
175+
using block_pointer_type =
176+
typename detail::DecoratedType<block_pointer_elem_type,
177+
deduced_address_space>::type *;
178+
168179
using block_op_type = std::conditional_t<
169180
BlockInfoTy::num_blocks == 1, block_type,
170181
detail::ConvertToOpenCLType_t<vec<block_type, BlockInfoTy::num_blocks>>>;
171182
};
172183

173-
// Returns either a pointer suitable to use in a block read/write builtin or
174-
// nullptr if some legality conditions aren't satisfied.
184+
// Returns either a pointer decorated with the deduced address space, suitable
185+
// to use in a block read/write builtin, or nullptr if some legality conditions
186+
// aren't satisfied. If deduced address space is generic then returned pointer
187+
// will have generic address space and has to be dynamically casted to global or
188+
// local space before using in a builtin.
175189
template <int RequiredAlign, std::size_t ElementsPerWorkItem,
176190
typename IteratorT, typename Properties>
177191
auto get_block_op_ptr(IteratorT iter, [[maybe_unused]] Properties props) {
@@ -211,16 +225,17 @@ auto get_block_op_ptr(IteratorT iter, [[maybe_unused]] Properties props) {
211225
bool is_aligned = alignof(value_type) >= RequiredAlign ||
212226
reinterpret_cast<uintptr_t>(iter) % RequiredAlign == 0;
213227

214-
constexpr auto AS = detail::deduce_AS<iter_no_cv>::value;
215228
using block_pointer_type =
216229
typename BlockTypeInfo<BlkInfo>::block_pointer_type;
217-
if constexpr (AS == access::address_space::global_space) {
230+
231+
static constexpr auto deduced_address_space =
232+
BlockTypeInfo<BlkInfo>::deduced_address_space;
233+
if constexpr (deduced_address_space ==
234+
access::address_space::generic_space ||
235+
deduced_address_space ==
236+
access::address_space::global_space ||
237+
deduced_address_space == access::address_space::local_space) {
218238
return is_aligned ? reinterpret_cast<block_pointer_type>(iter) : nullptr;
219-
} else if constexpr (AS == access::address_space::generic_space) {
220-
return is_aligned ? reinterpret_cast<block_pointer_type>(
221-
detail::dynamic_address_cast<
222-
access::address_space::global_space>(iter))
223-
: nullptr;
224239
} else {
225240
return nullptr;
226241
}
@@ -261,11 +276,37 @@ group_load(Group g, InputIteratorT in_ptr,
261276
// Do optimized load.
262277
using value_type = remove_decoration_t<
263278
typename std::iterator_traits<InputIteratorT>::value_type>;
264-
265-
auto load = __spirv_SubgroupBlockReadINTEL<
266-
typename detail::BlockTypeInfo<detail::BlockInfo<
267-
InputIteratorT, ElementsPerWorkItem, blocked>>::block_op_type>(
268-
ptr);
279+
using block_info = typename detail::BlockTypeInfo<
280+
detail::BlockInfo<InputIteratorT, ElementsPerWorkItem, blocked>>;
281+
static constexpr auto deduced_address_space =
282+
block_info::deduced_address_space;
283+
using block_op_type = typename block_info::block_op_type;
284+
285+
if constexpr (deduced_address_space ==
286+
access::address_space::local_space &&
287+
!props.template has_property<
288+
detail::native_local_block_io_key>())
289+
return group_load(g, in_ptr, out, use_naive{});
290+
291+
block_op_type load;
292+
if constexpr (deduced_address_space ==
293+
access::address_space::generic_space) {
294+
if (auto local_ptr = detail::dynamic_address_cast<
295+
access::address_space::local_space>(ptr)) {
296+
if constexpr (props.template has_property<
297+
detail::native_local_block_io_key>())
298+
load = __spirv_SubgroupBlockReadINTEL<block_op_type>(local_ptr);
299+
else
300+
return group_load(g, in_ptr, out, use_naive{});
301+
} else if (auto global_ptr = detail::dynamic_address_cast<
302+
access::address_space::global_space>(ptr)) {
303+
load = __spirv_SubgroupBlockReadINTEL<block_op_type>(global_ptr);
304+
} else {
305+
return group_load(g, in_ptr, out, use_naive{});
306+
}
307+
} else {
308+
load = __spirv_SubgroupBlockReadINTEL<block_op_type>(ptr);
309+
}
269310

270311
// TODO: accessor_iterator's value_type is weird, so we need
271312
// `std::remove_const_t` below:
@@ -331,6 +372,16 @@ group_store(Group g, const span<InputT, ElementsPerWorkItem> in,
331372
return group_store(g, in, out_ptr, use_naive{});
332373

333374
if constexpr (!std::is_same_v<std::nullptr_t, decltype(ptr)>) {
375+
using block_info = typename detail::BlockTypeInfo<
376+
detail::BlockInfo<OutputIteratorT, ElementsPerWorkItem, blocked>>;
377+
static constexpr auto deduced_address_space =
378+
block_info::deduced_address_space;
379+
if constexpr (deduced_address_space ==
380+
access::address_space::local_space &&
381+
!props.template has_property<
382+
detail::native_local_block_io_key>())
383+
return group_store(g, in, out_ptr, use_naive{});
384+
334385
// Do optimized store.
335386
std::remove_const_t<remove_decoration_t<
336387
typename std::iterator_traits<OutputIteratorT>::value_type>>
@@ -341,11 +392,28 @@ group_store(Group g, const span<InputT, ElementsPerWorkItem> in,
341392
values[i] = in[i];
342393
}
343394

344-
__spirv_SubgroupBlockWriteINTEL(
345-
ptr,
346-
sycl::bit_cast<typename detail::BlockTypeInfo<detail::BlockInfo<
347-
OutputIteratorT, ElementsPerWorkItem, blocked>>::block_op_type>(
348-
values));
395+
using block_op_type = typename block_info::block_op_type;
396+
if constexpr (deduced_address_space ==
397+
access::address_space::generic_space) {
398+
if (auto local_ptr = detail::dynamic_address_cast<
399+
access::address_space::local_space>(ptr)) {
400+
if constexpr (props.template has_property<
401+
detail::native_local_block_io_key>())
402+
__spirv_SubgroupBlockWriteINTEL(
403+
local_ptr, sycl::bit_cast<block_op_type>(values));
404+
else
405+
return group_store(g, in, out_ptr, use_naive{});
406+
} else if (auto global_ptr = detail::dynamic_address_cast<
407+
access::address_space::global_space>(ptr)) {
408+
__spirv_SubgroupBlockWriteINTEL(
409+
global_ptr, sycl::bit_cast<block_op_type>(values));
410+
} else {
411+
return group_store(g, in, out_ptr, use_naive{});
412+
}
413+
} else {
414+
__spirv_SubgroupBlockWriteINTEL(ptr,
415+
sycl::bit_cast<block_op_type>(values));
416+
}
349417
}
350418
}
351419
}

sycl/include/sycl/ext/oneapi/properties/property.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,8 +224,9 @@ enum PropKind : uint32_t {
224224
WorkGroupScratchSize = 79,
225225
Restrict = 80,
226226
EventMode = 81,
227+
NativeLocalBlockIO = 82,
227228
// PropKindSize must always be the last value.
228-
PropKindSize = 82,
229+
PropKindSize = 83,
229230
};
230231

231232
template <typename PropertyT> struct PropertyToKind {

sycl/test-e2e/GroupAlgorithm/load_store/basic.cpp

Lines changed: 66 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66

77
#include <numeric>
88

9-
int main() {
10-
using namespace sycl;
9+
using namespace sycl;
10+
11+
template <access::address_space addr_space> int test(queue &q) {
1112
namespace sycl_exp = sycl::ext::oneapi::experimental;
1213

1314
constexpr std::size_t wg_size = 32;
@@ -16,8 +17,6 @@ int main() {
1617
constexpr std::size_t elems_per_wi = 4;
1718
constexpr std::size_t n = global_size * elems_per_wi;
1819

19-
queue q;
20-
2120
buffer<int, 1> input_buf{n};
2221

2322
{
@@ -42,8 +41,10 @@ int main() {
4241
accessor store_blocked{store_blocked_buf, cgh};
4342
accessor store_striped{store_striped_buf, cgh};
4443

44+
local_accessor<int, 1> local_acc{wg_size * elems_per_wi, cgh};
4545
cgh.parallel_for(nd_range<1>{global_size, wg_size}, [=](nd_item<1> ndi) {
4646
auto gid = ndi.get_global_id(0);
47+
auto lid = ndi.get_local_id(0);
4748
auto g = ndi.get_group();
4849
auto offset = g.get_group_id(0) * g.get_local_range(0) * elems_per_wi;
4950

@@ -52,31 +53,76 @@ int main() {
5253
auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked};
5354
auto striped = sycl_exp::properties{sycl_exp::data_placement_striped};
5455

56+
if constexpr (addr_space == access::address_space::local_space) {
57+
// Copy input to local memory.
58+
for (int i = lid * elems_per_wi; i < lid * elems_per_wi + elems_per_wi;
59+
i++) {
60+
local_acc[i] = input[offset + i];
61+
}
62+
ndi.barrier(access::fence_space::local_space);
63+
}
64+
5565
// default
56-
sycl_exp::group_load(g, input.begin() + offset, span{data});
66+
if constexpr (addr_space == access::address_space::local_space) {
67+
sycl_exp::group_load(g, local_acc.begin(), span{data});
68+
} else {
69+
sycl_exp::group_load(g, input.begin() + offset, span{data});
70+
}
5771
for (int i = 0; i < elems_per_wi; ++i)
5872
load_blocked_default[gid * elems_per_wi + i] = data[i];
5973

6074
// blocked
61-
sycl_exp::group_load(g, input.begin() + offset, span{data}, blocked);
75+
if constexpr (addr_space == access::address_space::local_space) {
76+
sycl_exp::group_load(g, local_acc.begin(), span{data}, blocked);
77+
} else {
78+
sycl_exp::group_load(g, input.begin() + offset, span{data}, blocked);
79+
}
6280
for (int i = 0; i < elems_per_wi; ++i)
6381
load_blocked[gid * elems_per_wi + i] = data[i];
6482

6583
// striped
66-
sycl_exp::group_load(g, input.begin() + offset, span{data}, striped);
84+
if constexpr (addr_space == access::address_space::local_space) {
85+
sycl_exp::group_load(g, local_acc.begin(), span{data}, striped);
86+
} else {
87+
sycl_exp::group_load(g, input.begin() + offset, span{data}, striped);
88+
}
6789
for (int i = 0; i < elems_per_wi; ++i)
6890
load_striped[gid * elems_per_wi + i] = data[i];
6991

7092
// Stores:
7193

7294
std::iota(std::begin(data), std::end(data), gid * elems_per_wi);
7395

74-
sycl_exp::group_store(g, span{data},
75-
store_blocked_default.begin() + offset);
76-
sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset,
77-
blocked);
78-
sycl_exp::group_store(g, span{data}, store_striped.begin() + offset,
79-
striped);
96+
auto copy_local_acc_to_global_output = [&](accessor<int, 1> output) {
97+
for (int i = lid * elems_per_wi; i < lid * elems_per_wi + elems_per_wi;
98+
i++) {
99+
output[offset + i] = local_acc[i];
100+
}
101+
};
102+
103+
if constexpr (addr_space == access::address_space::local_space) {
104+
sycl_exp::group_store(g, span{data}, local_acc.begin());
105+
copy_local_acc_to_global_output(store_blocked_default);
106+
} else {
107+
sycl_exp::group_store(g, span{data},
108+
store_blocked_default.begin() + offset);
109+
}
110+
111+
if constexpr (addr_space == access::address_space::local_space) {
112+
sycl_exp::group_store(g, span{data}, local_acc.begin(), blocked);
113+
copy_local_acc_to_global_output(store_blocked);
114+
} else {
115+
sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset,
116+
blocked);
117+
}
118+
119+
if constexpr (addr_space == access::address_space::local_space) {
120+
sycl_exp::group_store(g, span{data}, local_acc.begin(), striped);
121+
copy_local_acc_to_global_output(store_striped);
122+
} else {
123+
sycl_exp::group_store(g, span{data}, store_striped.begin() + offset,
124+
striped);
125+
}
80126
});
81127
});
82128

@@ -111,3 +157,10 @@ int main() {
111157

112158
return 0;
113159
}
160+
161+
int main() {
162+
queue q;
163+
test<access::address_space::global_space>(q);
164+
test<access::address_space::local_space>(q);
165+
return 0;
166+
}

0 commit comments

Comments
 (0)