Skip to content

Commit 261a400

Browse files
authored
[SYCL][ESIMD] Update 2d API to use vc intrinsics instead of raw_send (#14690)
1 parent f5cb3bc commit 261a400

File tree

6 files changed

+188
-56
lines changed

6 files changed

+188
-56
lines changed

llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,21 @@ class ESIMDIntrinDescTable {
592592
{"lsc.xatomic.stateless",
593593
{ai1(0), t8(1), t8(2), t8(3), t16(4), t32(5), t8(6), t8(7), t8(8),
594594
c8(0), a(1), a(2), a(3), c32(0), u(-1)}}},
595+
{"lsc_load2d_descriptor",
596+
{"lsc.load.2d.ugm.desc",
597+
{ai1(0), a(3), t8(1), t16(2), t16(3), a(1), t32(4), t32(5), a(2)}}},
598+
{"lsc_load2d_descriptor_transpose",
599+
{"lsc.load.2d.ugm.desc.transpose",
600+
{ai1(0), a(3), t8(1), t16(2), t16(3), a(1), t32(4), t32(5), a(2)}}},
601+
{"lsc_load2d_descriptor_transform",
602+
{"lsc.load.2d.ugm.desc.vnni",
603+
{ai1(0), a(3), t8(1), t16(2), t16(3), a(1), t32(4), t32(5), a(2)}}},
604+
{"lsc_prefetch_descriptor",
605+
{"lsc.prefetch.2d.ugm.desc",
606+
{ai1(0), a(3), t8(1), t16(2), t16(3), a(1), t32(4), t32(5), a(2)}}},
607+
{"lsc_store_descriptor",
608+
{"lsc.store.2d.ugm.desc",
609+
{ai1(0), a(3), t8(1), t16(2), t16(3), a(1), t32(4), t32(5), a(2)}}},
595610
{"lsc_fence", {"lsc.fence", {ai1(0), t8(0), t8(1), t8(2)}}},
596611
{"sat", {"sat", {a(0)}}},
597612
{"fptoui_sat", {"fptoui.sat", {a(0)}}},

sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ __ESIMD_INTRIN void __esimd_wait(uint16_t value);
3939
/// @tparam FenceOp is the fence operation.
4040
/// @tparam Scope is the operation scope.
4141
/// @tparam N is the SIMD size of operation (the number of addresses to access)
42-
/// @param pred is predicates.
42+
/// @param pred is the predicate.
4343
template <__ESIMD_NS::memory_kind Kind, __ESIMD_NS::fence_flush_op FenceOp,
4444
__ESIMD_NS::fence_scope Scope, int N>
4545
__ESIMD_INTRIN void
@@ -48,4 +48,116 @@ __esimd_lsc_fence(__ESIMD_DNS::simd_mask_storage_t<N> pred) __ESIMD_INTRIN_END;
4848
__ESIMD_INTRIN uint8_t __esimd_named_barrier_allocate(uint8_t NbarCount)
4949
__ESIMD_INTRIN_END;
5050

51+
/// 2D USM pointer block load.
52+
/// Supported platforms: PVC
53+
///
54+
/// Collects elements located as described in the descriptor and returns them
55+
/// as a single \ref simd object.
56+
///
57+
/// @tparam Ty is element type.
58+
/// @tparam NBlocks is the number of blocks.
59+
/// @tparam BlockWidth is the block width in number of elements.
60+
/// @tparam BlockHeight is the block height in number of elements.
61+
/// @tparam BlockXOffset is Memory block X immediate offset (in elements).
62+
/// @tparam BlockYOffset is Memory block Y immediate offset (in elements).
63+
/// @param Pred is the predicate.
64+
/// @param Desc is the descriptor containing parameters for the operation.
65+
/// @param PassThru is value to passthru when predicate is false on load.
66+
/// @param Cache is vector containing cache hint information.
67+
/// @return is a vector of type Ty
68+
template <typename Ty, uint8_t NBlocks, uint8_t BlockWidth, uint8_t BlockHeight,
69+
uint32_t BlockXOffset, uint32_t BlockYOffset, int N>
70+
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N> __esimd_lsc_load2d_descriptor(
71+
uint16_t Pred, __ESIMD_DNS::vector_type_t<uint32_t, 16> Desc,
72+
__ESIMD_DNS::vector_type_t<Ty, N> PassThru,
73+
__ESIMD_DNS::vector_type_t<uint8_t, 2> Cache) __ESIMD_INTRIN_END;
74+
75+
/// Collects elements located as described in the descriptor, performs
76+
/// transposition and returns them as a single \ref simd object.
77+
///
78+
/// @tparam Ty is element type.
79+
/// @tparam NBlocks is the number of blocks.
80+
/// @tparam BlockWidth is the block width in number of elements.
81+
/// @tparam BlockHeight is the block height in number of elements.
82+
/// @tparam BlockXOffset is Memory block X immediate offset (in elements).
83+
/// @tparam BlockYOffset is Memory block Y immediate offset (in elements).
84+
/// @param Pred is the predicate.
85+
/// @param Desc is the descriptor containing parameters for the operation.
86+
/// @param PassThru is value to passthru when predicate is false on load.
87+
/// @param Cache is vector containing cache hint information.
88+
/// @return is a vector of type Ty
89+
template <typename Ty, uint8_t NBlocks, uint8_t BlockWidth, uint8_t BlockHeight,
90+
uint32_t BlockXOffset, uint32_t BlockYOffset, int N>
91+
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
92+
__esimd_lsc_load2d_descriptor_transpose(
93+
uint16_t Pred, __ESIMD_DNS::vector_type_t<uint32_t, 16> Desc,
94+
__ESIMD_DNS::vector_type_t<Ty, N> PassThru,
95+
__ESIMD_DNS::vector_type_t<uint8_t, 2> Cache) __ESIMD_INTRIN_END;
96+
97+
/// Collects elements located as described in the descriptor, performs vnni
98+
/// transform and returns them as a single \ref simd object.
99+
///
100+
/// @tparam Ty is element type.
101+
/// @tparam NBlocks is the number of blocks.
102+
/// @tparam BlockWidth is the block width in number of elements.
103+
/// @tparam BlockHeight is the block height in number of elements.
104+
/// @tparam BlockXOffset is Memory block X immediate offset (in elements).
105+
/// @tparam BlockYOffset is Memory block Y immediate offset (in elements).
106+
/// @param Pred is the predicate.
107+
/// @param Desc is the descriptor containing parameters for the operation.
108+
/// @param PassThru is value to passthru when predicate is false on load.
109+
/// @param Cache is vector containing cache hint information.
110+
/// @return is a vector of type Ty
111+
template <typename Ty, uint8_t NBlocks, uint8_t BlockWidth, uint8_t BlockHeight,
112+
uint32_t BlockXOffset, uint32_t BlockYOffset, int N>
113+
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
114+
__esimd_lsc_load2d_descriptor_transform(
115+
uint16_t Pred, __ESIMD_DNS::vector_type_t<uint32_t, 16> Desc,
116+
__ESIMD_DNS::vector_type_t<Ty, N> PassThru,
117+
__ESIMD_DNS::vector_type_t<uint8_t, 2> Cache) __ESIMD_INTRIN_END;
118+
119+
/// 2D USM pointer block prefetch.
120+
/// Supported platforms: PVC
121+
///
122+
/// Prefetches elements located as described in the descriptor.
123+
///
124+
/// @tparam Ty is element type.
125+
/// @tparam NBlocks is the number of blocks.
126+
/// @tparam BlockWidth is the block width in number of elements.
127+
/// @tparam BlockHeight is the block height in number of elements.
128+
/// @tparam BlockXOffset is Memory block X immediate offset (in elements).
129+
/// @tparam BlockYOffset is Memory block Y immediate offset (in elements).
130+
/// @param Pred is the predicate.
131+
/// @param Desc is the descriptor containing parameters for the operation.
132+
/// @param PassThru is dummy value to obtain type of the elements.
133+
/// @param Cache is vector containing cache hint information.
134+
template <typename Ty, uint8_t NBlocks, uint8_t BlockWidth, uint8_t BlockHeight,
135+
uint32_t BlockXOffset, uint32_t BlockYOffset, int N>
136+
__ESIMD_INTRIN void __esimd_lsc_prefetch_descriptor(
137+
uint16_t Pred, __ESIMD_DNS::vector_type_t<uint32_t, 16> Desc,
138+
__ESIMD_DNS::vector_type_t<Ty, N> PassThru,
139+
__ESIMD_DNS::vector_type_t<uint8_t, 2> Cache) __ESIMD_INTRIN_END;
140+
141+
/// 2D USM pointer block store.
142+
/// Supported platforms: PVC
143+
///
144+
/// Stores elements as described in the descriptor.
145+
///
146+
/// @tparam Ty is element type.
147+
/// @tparam NBlocks is the number of blocks.
148+
/// @tparam BlockWidth is the block width in number of elements.
149+
/// @tparam BlockHeight is the block height in number of elements.
150+
/// @tparam BlockXOffset is Memory block X immediate offset (in elements).
151+
/// @tparam BlockYOffset is Memory block Y immediate offset (in elements).
152+
/// @param Pred is the predicate.
153+
/// @param Desc is the descriptor containing parameters for the operation.
154+
/// @param Values is value to to store.
155+
/// @param Cache is vector containing cache hint information.
156+
template <typename Ty, uint8_t NBlocks, uint8_t BlockWidth, uint8_t BlockHeight,
157+
uint32_t BlockXOffset, uint32_t BlockYOffset, int N>
158+
__ESIMD_INTRIN void __esimd_lsc_store_descriptor(
159+
uint16_t Pred, __ESIMD_DNS::vector_type_t<uint32_t, 16> Desc,
160+
__ESIMD_DNS::vector_type_t<Ty, N> Values,
161+
__ESIMD_DNS::vector_type_t<uint8_t, 2> Cache) __ESIMD_INTRIN_END;
162+
51163
/// @endcond ESIMD_DETAIL

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

Lines changed: 42 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1891,10 +1891,12 @@ template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
18911891
T, NBlocks, BlockHeight, BlockWidth, Transposed, Transformed>()>
18921892
ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd<T, N> lsc_load_2d(
18931893
config_2d_mem_access<T, BlockWidth, BlockHeight, NBlocks> &payload) {
1894+
using RawT = __ESIMD_DNS::__raw_t<T>;
18941895
__ESIMD_DNS::check_lsc_block_2d_restrictions<
1895-
T, BlockWidth, BlockHeight, NBlocks, Transposed, Transformed,
1896+
RawT, BlockWidth, BlockHeight, NBlocks, Transposed, Transformed,
18961897
__ESIMD_DNS::block_2d_op::load>();
18971898
using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t<L1H, L2H>;
1899+
using CacheVectorT = __ESIMD_DNS::vector_type_t<uint8_t, 2>;
18981900
__ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::load,
18991901
PropertyListT>();
19001902
constexpr int ElemsPerDword = 4 / sizeof(T);
@@ -1920,27 +1922,28 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd<T, N> lsc_load_2d(
19201922
__ESIMD_DNS::roundUpNextMultiple<DstElements * sizeof(T), GrfBytes>();
19211923
constexpr uint32_t DstLength =
19221924
(DstBlockSize / GrfBytes) > 31 ? 31 : (DstBlockSize / GrfBytes);
1923-
constexpr uint32_t DstLengthMask = DstLength << 20;
19241925

19251926
static_assert(N == ActualN || N == DstElements, "Incorrect element count");
19261927

1927-
constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L2H>()
1928-
<< 17;
1929-
constexpr uint32_t base_desc = 0x2000003;
1930-
constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
1931-
constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
1932-
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
1933-
__ESIMD_NS::simd<T, N> oldDst;
1934-
constexpr uint32_t exDesc = 0x0;
1935-
constexpr uint32_t desc = base_desc | cache_mask | transformMask |
1936-
transposeMask | dataSizeMask | DstLengthMask;
1937-
constexpr uint8_t execSize = 1;
1938-
constexpr uint8_t sfid = 0xF;
1939-
constexpr uint8_t numSrc0 = 0x1;
1940-
constexpr uint8_t numDst = (N * sizeof(T)) / 64;
1941-
__ESIMD_NS::simd<T, ActualN> Raw =
1942-
__ESIMD_NS::raw_send<execSize, sfid, numSrc0, numDst>(
1943-
oldDst, payload.get_raw_data(), exDesc, desc);
1928+
__ESIMD_NS::simd<RawT, N> oldDst;
1929+
constexpr uint16_t Mask = 1;
1930+
constexpr CacheVectorT Cache = {static_cast<uint8_t>(L1H),
1931+
static_cast<uint8_t>(L2H)};
1932+
1933+
__ESIMD_NS::simd<T, ActualN> Raw;
1934+
1935+
if constexpr (Transposed)
1936+
Raw = __esimd_lsc_load2d_descriptor_transpose<RawT, NBlocks, BlockWidth,
1937+
BlockHeight, 0, 0, N>(
1938+
Mask, payload.get_raw_data().data(), oldDst.data(), Cache);
1939+
else if constexpr (Transformed)
1940+
Raw = __esimd_lsc_load2d_descriptor_transform<RawT, NBlocks, BlockWidth,
1941+
BlockHeight, 0, 0, N>(
1942+
Mask, payload.get_raw_data().data(), oldDst.data(), Cache);
1943+
else
1944+
Raw = __esimd_lsc_load2d_descriptor<RawT, NBlocks, BlockWidth, BlockHeight,
1945+
0, 0, N>(
1946+
Mask, payload.get_raw_data().data(), oldDst.data(), Cache);
19441947

19451948
if constexpr (ActualN == N) {
19461949
return Raw;
@@ -1988,27 +1991,24 @@ template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
19881991
ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d(
19891992
config_2d_mem_access<T, BlockWidth, BlockHeight, NBlocks> &payload) {
19901993
using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t<L1H, L2H>;
1994+
using CacheVectorT = __ESIMD_DNS::vector_type_t<uint8_t, 2>;
1995+
using RawT = __ESIMD_DNS::__raw_t<T>;
19911996
__ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::load,
19921997
PropertyListT>();
19931998
__ESIMD_DNS::check_lsc_block_2d_restrictions<
1994-
T, BlockWidth, BlockHeight, NBlocks, Transposed, Transformed,
1999+
RawT, BlockWidth, BlockHeight, NBlocks, Transposed, Transformed,
19952000
__ESIMD_DNS::block_2d_op::prefetch>();
19962001
static_assert(!Transposed || !Transformed,
19972002
"Transposed and transformed is not supported");
1998-
constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L2H>()
1999-
<< 17;
2000-
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
2001-
constexpr uint32_t base_desc = 0x2000003;
2002-
constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
2003-
constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
2004-
constexpr uint32_t exDesc = 0x0;
2005-
constexpr uint32_t desc =
2006-
base_desc | cache_mask | transformMask | transposeMask | dataSizeMask;
2007-
constexpr uint8_t execSize = 1;
2008-
constexpr uint8_t sfid = 0xF;
2009-
constexpr uint8_t numDst = (N * sizeof(T)) / 64;
2010-
__ESIMD_NS::raw_send<execSize, sfid, numDst>(payload.get_raw_data(), exDesc,
2011-
desc);
2003+
2004+
__ESIMD_NS::simd<RawT, N> oldDst;
2005+
constexpr uint16_t Mask = 1;
2006+
constexpr CacheVectorT Cache = {static_cast<uint8_t>(L1H),
2007+
static_cast<uint8_t>(L2H)};
2008+
2009+
__esimd_lsc_prefetch_descriptor<RawT, NBlocks, BlockWidth, BlockHeight, 0, 0,
2010+
N>(Mask, payload.get_raw_data().data(),
2011+
oldDst.data(), Cache);
20122012
}
20132013

20142014
/// A variation of \c 2D stateless block store \c with parameters passed as
@@ -2033,27 +2033,21 @@ template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
20332033
ESIMD_INLINE SYCL_ESIMD_FUNCTION void
20342034
lsc_store_2d(config_2d_mem_access<T, BlockWidth, BlockHeight, NBlocks> &payload,
20352035
__ESIMD_NS::simd<T, N> Data) {
2036+
using RawT = __ESIMD_DNS::__raw_t<T>;
20362037
__ESIMD_DNS::check_lsc_block_2d_restrictions<
2037-
T, BlockWidth, BlockHeight, NBlocks, false, false,
2038+
RawT, BlockWidth, BlockHeight, NBlocks, false, false,
20382039
__ESIMD_DNS::block_2d_op::store>();
20392040
using PropertyListT = __ESIMD_DNS::make_L1_L2_properties_t<L1H, L2H>;
2041+
using CacheVectorT = __ESIMD_DNS::vector_type_t<uint8_t, 2>;
20402042
__ESIMD_DNS::check_cache_hints<__ESIMD_DNS::cache_action::store,
20412043
PropertyListT>();
20422044

2043-
constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask<L1H, L2H>()
2044-
<< 17;
2045-
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
2046-
constexpr uint32_t base_desc = 0x2000007;
2047-
2048-
constexpr uint32_t exDesc = 0x0;
2049-
constexpr uint32_t desc = base_desc | cache_mask | dataSizeMask;
2050-
constexpr uint8_t execSize = 1;
2051-
constexpr uint8_t sfid = 0xF;
2052-
constexpr uint8_t numSrc0 = 0x1;
2053-
constexpr uint8_t numSrc1 = (N * sizeof(T)) / 64;
2045+
constexpr uint16_t Mask = 1;
2046+
constexpr CacheVectorT Cache = {static_cast<uint8_t>(L1H),
2047+
static_cast<uint8_t>(L2H)};
20542048

2055-
__ESIMD_NS::raw_sends<execSize, sfid, numSrc0, numSrc1>(
2056-
payload.get_raw_data(), Data, exDesc, desc);
2049+
__esimd_lsc_store_descriptor<RawT, NBlocks, BlockWidth, BlockHeight, 0, 0, N>(
2050+
Mask, payload.get_raw_data().data(), Data.data(), Cache);
20572051
}
20582052

20592053
namespace detail {

sycl/test-e2e/ESIMD/lsc/lsc_load_2d_compare.cpp

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88
// REQUIRES: arch-intel_gpu_pvc
9+
// REQUIRES-INTEL-DRIVER: lin: 30508
910
// RUN: %{build} -o %t.out
1011
// RUN: %{run} %t.out
1112

@@ -20,15 +21,16 @@ using bf16 = sycl::ext::oneapi::bfloat16;
2021
using namespace sycl;
2122
using namespace sycl::ext::intel::esimd;
2223
using namespace sycl::ext::intel::experimental::esimd;
23-
template <typename T> bool test() {
24+
template <typename T, bool Transposed = false, bool Transformed = false>
25+
bool test() {
2426
sycl::queue Q(sycl::gpu_selector_v);
2527
auto dev = Q.get_device();
2628
std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
2729
<< "\n";
2830

2931
constexpr int TM = 8;
3032
constexpr int TN = 8;
31-
constexpr int NBLOCKS = 2;
33+
constexpr int NBLOCKS = Transposed ? 1 : 2;
3234
constexpr int WIDTH = 2 * TN;
3335
constexpr int HEIGHT = TM;
3436
constexpr int PITCH = WIDTH;
@@ -49,11 +51,12 @@ template <typename T> bool test() {
4951
A, WIDTH * sizeof(T) - 1, HEIGHT - 1, PITCH * sizeof(T) - 1, 0, 0);
5052

5153
simd<T, NBLOCKS * TM * TN> tmp =
52-
lsc_load_2d<T, TN, TM, NBLOCKS, false, false>(my_config);
53-
simd<T, NBLOCKS * TM * TN> tmp1 = lsc_load_2d<T, TN, TM, NBLOCKS>(
54-
my_config.get_data_pointer(), my_config.get_surface_width(),
55-
my_config.get_surface_height(), my_config.get_surface_pitch(),
56-
my_config.get_x(), my_config.get_y());
54+
lsc_load_2d<T, TN, TM, NBLOCKS, Transposed, Transformed>(my_config);
55+
simd<T, NBLOCKS * TM * TN> tmp1 =
56+
lsc_load_2d<T, TN, TM, NBLOCKS, Transposed, Transformed>(
57+
my_config.get_data_pointer(), my_config.get_surface_width(),
58+
my_config.get_surface_height(), my_config.get_surface_pitch(),
59+
my_config.get_x(), my_config.get_y());
5760

5861
tmp.copy_to(C);
5962
tmp1.copy_to(C1);
@@ -77,6 +80,12 @@ int main() {
7780
result |= test<uint8_t>();
7881
result |= test<sycl::half>();
7982

83+
result |= test<float, true>();
84+
result |= test<uint32_t, true>();
85+
86+
result |= test<uint16_t, false, true>();
87+
result |= test<uint8_t, false, true>();
88+
8089
std::cout << (result ? "FAILED" : "passed") << std::endl;
8190
return 0;
8291
}

sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_compare.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88
// REQUIRES: arch-intel_gpu_pvc
9+
// REQUIRES-INTEL-DRIVER: lin: 30508
910
// RUN: %{build} -o %t.out
1011
// RUN: %{run} %t.out
1112

sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_smoke.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88
// REQUIRES: arch-intel_gpu_pvc
9+
// REQUIRES-INTEL-DRIVER: lin: 30508
910
// RUN: %{build} -o %t.out
1011
// RUN: %{run} %t.out
1112

0 commit comments

Comments
 (0)