Skip to content

Commit c63f802

Browse files
authored
[ESIMD] Add capability to specify 64 bit offsets to esimd memory functions (#7411)
1 parent 422d2b5 commit c63f802

File tree

2 files changed

+357
-55
lines changed

2 files changed

+357
-55
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 216 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <sycl/ext/intel/esimd/detail/types.hpp>
1616
#include <sycl/ext/intel/esimd/detail/util.hpp>
1717
#include <sycl/ext/intel/esimd/simd.hpp>
18+
#include <sycl/ext/intel/esimd/simd_view.hpp>
1819
#include <sycl/half_type.hpp>
1920

2021
#include <cstdint>
@@ -119,15 +120,18 @@ __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
119120
/// @tparam N Number of elements to read; can be \c 1, \c 2, \c 4, \c 8, \c 16
120121
/// or \c 32.
121122
/// @param p The base address.
122-
/// @param offsets the vector of 32-bit offsets in bytes. For each lane \c i,
123-
/// ((byte*)p + offsets[i]) must be element size aligned.
123+
/// @param offsets the vector of 32-bit or 64-bit offsets in bytes. For each
124+
/// lane \c i, ((byte*)p + offsets[i]) must be element size aligned.
124125
/// @param mask The access mask, defaults to all 1s.
125126
/// @return A vector of elements read. Elements in masked out lanes are
126127
/// undefined.
127128
///
128-
template <typename Tx, int N, class T = detail::__raw_t<Tx>>
129-
__ESIMD_API std::enable_if_t<detail::isPowerOf2(N, 32), simd<Tx, N>>
130-
gather(const Tx *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
129+
template <typename Tx, int N, typename Toffset>
130+
__ESIMD_API simd<Tx, N> gather(const Tx *p, simd<Toffset, N> offsets,
131+
simd_mask<N> mask = 1) {
132+
using T = detail::__raw_t<Tx>;
133+
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
134+
static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N");
131135
simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
132136
simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
133137
addrs = addrs + offsets_i;
@@ -148,6 +152,29 @@ gather(const Tx *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
148152
mask.data());
149153
}
150154

155+
/// A variation of \c gather API with \c offsets represented as \c simd_view
156+
/// object.
157+
///
158+
/// @tparam Tx Element type, must be of size 4 or less.
159+
/// @tparam N Number of elements to read; can be \c 1, \c 2, \c 4, \c 8, \c 16
160+
/// or \c 32.
161+
/// @param p The base address.
162+
/// @param offsets the simd_view of 32-bit or 64-bit offsets in bytes. For each
163+
/// lane \c i, ((byte*)p + offsets[i]) must be element size aligned.
164+
/// @param mask The access mask, defaults to all 1s.
165+
/// @return A vector of elements read. Elements in masked out lanes are
166+
/// undefined.
167+
///
168+
template <typename Tx, int N, typename Toffset,
169+
typename RegionTy = region1d_t<Toffset, N, 1>>
170+
__ESIMD_API simd<Tx, N> gather(const Tx *p,
171+
simd_view<Toffset, RegionTy> offsets,
172+
simd_mask<N> mask = 1) {
173+
using T = detail::__raw_t<Tx>;
174+
using Ty = typename simd_view<Toffset, RegionTy>::element_type;
175+
return gather<Tx, N>(p, simd<Ty, N>(offsets), mask);
176+
}
177+
151178
/// Writes ("scatters") elements of the input vector to different memory
152179
/// locations. Each memory location is base address plus an offset - a
153180
/// value of the corresponding element in the input offset vector. Access to
@@ -156,15 +183,17 @@ gather(const Tx *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
156183
/// @tparam N Number of elements to write; can be \c 1, \c 2, \c 4, \c 8, \c 16
157184
/// or \c 32.
158185
/// @param p The base address.
159-
/// @param offsets A vector of 32-bit offsets in bytes. For each lane \c i,
160-
/// ((byte*)p + offsets[i]) must be element size aligned.
186+
/// @param offsets A vector of 32-bit or 64-bit offsets in bytes. For each lane
187+
/// \c i, ((byte*)p + offsets[i]) must be element size aligned.
161188
/// @param vals The vector to scatter.
162189
/// @param mask The access mask, defaults to all 1s.
163190
///
164-
template <typename Tx, int N, class T = detail::__raw_t<Tx>>
165-
__ESIMD_API std::enable_if_t<detail::isPowerOf2(N, 32)>
166-
scatter(Tx *p, simd<uint32_t, N> offsets, simd<Tx, N> vals,
167-
simd_mask<N> mask = 1) {
191+
template <typename Tx, int N, typename Toffset>
192+
__ESIMD_API void scatter(Tx *p, simd<Toffset, N> offsets, simd<Tx, N> vals,
193+
simd_mask<N> mask = 1) {
194+
using T = detail::__raw_t<Tx>;
195+
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
196+
static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N");
168197
simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
169198
simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
170199
addrs = addrs + offsets_i;
@@ -186,6 +215,27 @@ scatter(Tx *p, simd<uint32_t, N> offsets, simd<Tx, N> vals,
186215
addrs.data(), vals.data(), mask.data());
187216
}
188217

218+
/// A variation of \c scatter API with \c offsets represented as \c simd_view
219+
/// object.
220+
///
221+
/// @tparam Tx Element type, must be of size 4 or less.
222+
/// @tparam N Number of elements to write; can be \c 1, \c 2, \c 4, \c 8, \c 16
223+
/// or \c 32.
224+
/// @param p The base address.
225+
/// @param offsets A simd_view of 32-bit or 64-bit offsets in bytes. For each
226+
/// lane \c i, ((byte*)p + offsets[i]) must be element size aligned.
227+
/// @param vals The vector to scatter.
228+
/// @param mask The access mask, defaults to all 1s.
229+
///
230+
template <typename Tx, int N, typename Toffset,
231+
typename RegionTy = region1d_t<Toffset, N, 1>>
232+
__ESIMD_API void scatter(Tx *p, simd_view<Toffset, RegionTy> offsets,
233+
simd<Tx, N> vals, simd_mask<N> mask = 1) {
234+
using T = detail::__raw_t<Tx>;
235+
using Ty = typename simd_view<Toffset, RegionTy>::element_type;
236+
scatter<Tx, N>(p, simd<Ty, N>(offsets), vals, mask);
237+
}
238+
189239
/// Loads a contiguous block of memory from given memory address and returns
190240
/// the loaded data as a vector. Actual code generated depends on the
191241
/// alignment parameter.
@@ -539,24 +589,52 @@ __ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
539589
/// vector). Must be 8, 16 or 32.
540590
/// @tparam RGBAMask A pixel's channel mask.
541591
/// @param p The USM base pointer representing memory address of the access.
542-
/// @param offsets Byte offsets of the pixels relative to the base pointer.
592+
/// @param offsets vector of byte offsets of the pixels relative to the base
593+
/// pointer.
543594
/// @param mask Memory access mask. Pixels with zero corresponding mask's
544595
/// predicate are not accessed. Their values in the resulting vector are
545596
/// undefined.
546597
/// @return Read data - up to N*4 values of type \c T.
547598
///
548599
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
549-
int N>
550-
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
551-
simd<T, N * get_num_channels_enabled(RGBAMask)>>
552-
gather_rgba(const T *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
600+
int N, typename Toffset>
601+
__ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
602+
gather_rgba(const T *p, simd<Toffset, N> offsets, simd_mask<N> mask = 1) {
603+
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
604+
static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
605+
static_assert(sizeof(T) == 4, "Unsupported size of type T");
553606
simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
554607
simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
555608
addrs = addrs + offsets_i;
556609
return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
557610
addrs.data(), mask.data());
558611
}
559612

613+
/// A variation of \c gather_rgba API with \c offsets represented as
614+
/// \c simd_view object.
615+
///
616+
/// @tparam T Element type of the returned vector. Must be 4 bytes in size.
617+
/// @tparam N Number of pixels to access (matches the size of the \c offsets
618+
/// vector). Must be 8, 16 or 32.
619+
/// @tparam RGBAMask A pixel's channel mask.
620+
/// @param p The USM base pointer representing memory address of the access.
621+
/// @param offsets simd_view of byte offsets of the pixels relative to the base
622+
/// pointer.
623+
/// @param mask Memory access mask. Pixels with zero corresponding mask's
624+
/// predicate are not accessed. Their values in the resulting vector are
625+
/// undefined.
626+
/// @return Read data - up to N*4 values of type \c T.
627+
///
628+
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
629+
int N, typename Toffset,
630+
typename RegionTy = region1d_t<Toffset, N, 1>>
631+
__ESIMD_API simd<T, N * get_num_channels_enabled(RGBAMask)>
632+
gather_rgba(const T *p, simd_view<Toffset, RegionTy> offsets,
633+
simd_mask<N> mask = 1) {
634+
using Ty = typename simd_view<Toffset, RegionTy>::element_type;
635+
return gather_rgba<RGBAMask, T, N>(p, simd<Ty, N>(offsets), mask);
636+
}
637+
560638
template <typename T, int N, rgba_channel_mask RGBAMask>
561639
__SYCL_DEPRECATED("use gather_rgba<rgba_channel_mask>()")
562640
__ESIMD_API std::enable_if_t<
@@ -592,17 +670,21 @@ template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
592670
/// @tparam RGBAMask A pixel's channel mask.
593671
/// @param p The USM base pointer representing memory address of the access.
594672
/// @param vals values to be written.
595-
/// @param offsets Byte offsets of the pixels relative to the base pointer.
673+
/// @param offsets vector of byte offsets of the pixels relative to the base
674+
/// pointer.
596675
/// @param mask Memory access mask. Pixels with zero corresponding mask's
597676
/// predicate are not accessed. Their values in the resulting vector are
598677
/// undefined.
599678
///
600679
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
601-
int N>
602-
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4>
603-
scatter_rgba(T *p, simd<uint32_t, N> offsets,
680+
int N, typename Toffset>
681+
__ESIMD_API void
682+
scatter_rgba(T *p, simd<Toffset, N> offsets,
604683
simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
605684
simd_mask<N> mask = 1) {
685+
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
686+
static_assert((N == 8 || N == 16 || N == 32), "Unsupported value of N");
687+
static_assert(sizeof(T) == 4, "Unsupported size of type T");
606688
detail::validate_rgba_write_channel_mask<RGBAMask>();
607689
simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
608690
simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
@@ -611,6 +693,32 @@ scatter_rgba(T *p, simd<uint32_t, N> offsets,
611693
addrs.data(), vals.data(), mask.data());
612694
}
613695

696+
/// A variation of \c scatter_rgba API with \c offsets represented as
697+
/// \c simd_view object.
698+
///
699+
/// @tparam T Element type of the vector to write. Must be 4 bytes in size.
700+
/// @tparam N Number of pixels to access (matches the size of the \c offsets
701+
/// vector). Must be 8, 16 or 32.
702+
/// @tparam RGBAMask A pixel's channel mask.
703+
/// @param p The USM base pointer representing memory address of the access.
704+
/// @param vals values to be written.
705+
/// @param offsets simd_view of byte offsets of the pixels relative to the base
706+
/// pointer.
707+
/// @param mask Memory access mask. Pixels with zero corresponding mask's
708+
/// predicate are not accessed, i.e. nothing is written for the
709+
/// corresponding pixels.
710+
///
711+
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
712+
int N, typename Toffset,
713+
typename RegionTy = region1d_t<Toffset, N, 1>>
714+
__ESIMD_API void
715+
scatter_rgba(T *p, simd_view<Toffset, RegionTy> offsets,
716+
simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
717+
simd_mask<N> mask = 1) {
718+
using Ty = typename simd_view<Toffset, RegionTy>::element_type;
719+
scatter_rgba<RGBAMask, T, N>(p, simd<Ty, N>(offsets), vals, mask);
720+
}
721+
614722
template <typename T, int N, rgba_channel_mask RGBAMask>
615723
__SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
616724
__ESIMD_API std::
@@ -770,15 +878,16 @@ constexpr void check_atomic() {
770878
/// @tparam Tx The vector element type.
771879
/// @tparam N The number of memory locations to update.
772880
/// @param p The USM pointer.
773-
/// @param offset The vector of 32-bit offsets in bytes.
881+
/// @param offset The vector of 32-bit or 64-bit offsets in bytes.
774882
/// @param mask Operation mask, only locations with non-zero in the
775883
/// corresponding mask element are updated.
776884
/// @return A vector of the old values at the memory locations before the
777885
/// update.
778886
///
779-
template <atomic_op Op, typename Tx, int N>
780-
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
887+
template <atomic_op Op, typename Tx, int N, typename Toffset>
888+
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
781889
simd_mask<N> mask) {
890+
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
782891
detail::check_atomic<Op, Tx, N, 0>();
783892
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
784893
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
@@ -787,6 +896,29 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
787896
return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
788897
}
789898

899+
/// A variation of \c atomic_update API with \c offsets represented as
900+
/// \c simd_view object.
901+
///
902+
/// @tparam Op The atomic operation - can be \c atomic_op::inc or
903+
/// atomic_op::dec.
904+
/// @tparam Tx The vector element type.
905+
/// @tparam N The number of memory locations to update.
906+
/// @param p The USM pointer.
907+
/// @param offsets The simd_view of 32-bit or 64-bit offsets in bytes.
908+
/// @param mask Operation mask, only locations with non-zero in the
909+
/// corresponding mask element are updated.
910+
/// @return A vector of the old values at the memory locations before the
911+
/// update.
912+
///
913+
template <atomic_op Op, typename Tx, int N, typename Toffset,
914+
typename RegionTy = region1d_t<Toffset, N, 1>>
915+
__ESIMD_API simd<Tx, N> atomic_update(Tx *p,
916+
simd_view<Toffset, RegionTy> offsets,
917+
simd_mask<N> mask = 1) {
918+
using Ty = typename simd_view<Toffset, RegionTy>::element_type;
919+
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), mask);
920+
}
921+
790922
/// @anchor usm_atomic_update1
791923
/// @brief Single-argument variant of the atomic update operation.
792924
///
@@ -803,16 +935,17 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
803935
/// @tparam Tx The vector element type.
804936
/// @tparam N The number of memory locations to update.
805937
/// @param p The USM pointer.
806-
/// @param offset The vector of 32-bit offsets in bytes.
938+
/// @param offset The vector of 32-bit or 64-bit offsets in bytes.
807939
/// @param src0 The additional argument.
808940
/// @param mask Operation mask, only locations with non-zero in the
809941
/// corresponding mask element are updated.
810942
/// @return A vector of the old values at the memory locations before the
811943
/// update.
812944
///
813-
template <atomic_op Op, typename Tx, int N>
814-
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
945+
template <atomic_op Op, typename Tx, int N, typename Toffset>
946+
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
815947
simd<Tx, N> src0, simd_mask<N> mask) {
948+
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
816949
if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
817950
(Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
818951
// Auto-convert FP atomics to LSC version. Warning is given - see enum.
@@ -823,12 +956,40 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
823956
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
824957
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
825958
vAddr += offset_i1;
959+
826960
using T = typename detail::__raw_t<Tx>;
827961
return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),
828962
mask.data());
829963
}
830964
}
831965

966+
/// A variation of \c atomic_update API with \c offsets represented as
967+
/// \c simd_view object.
968+
///
969+
/// @tparam Op The atomic operation - can be one of the following:
970+
/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
971+
/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
972+
/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
973+
/// \c atomic_op::fmax, \c atomic_op::fmin.
974+
/// @tparam Tx The vector element type.
975+
/// @tparam N The number of memory locations to update.
976+
/// @param p The USM pointer.
977+
/// @param offsets The simd_view of 32-bit or 64-bit offsets in bytes.
978+
/// @param src0 The additional argument.
979+
/// @param mask Operation mask, only locations with non-zero in the
980+
/// corresponding mask element are updated.
981+
/// @return A vector of the old values at the memory locations before the
982+
/// update.
983+
///
984+
template <atomic_op Op, typename Tx, int N, typename Toffset,
985+
typename RegionTy = region1d_t<Toffset, N, 1>>
986+
__ESIMD_API simd<Tx, N> atomic_update(Tx *p,
987+
simd_view<Toffset, RegionTy> offsets,
988+
simd<Tx, N> src0, simd_mask<N> mask) {
989+
using Ty = typename simd_view<Toffset, RegionTy>::element_type;
990+
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, mask);
991+
}
992+
832993
/// @anchor usm_atomic_update2
833994
/// Atomically updates \c N memory locations represented by a USM pointer and
834995
/// a vector of offsets relative to the pointer, and returns a vector of old
@@ -840,18 +1001,19 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
8401001
/// @tparam Tx The vector element type.
8411002
/// @tparam N The number of memory locations to update.
8421003
/// @param p The USM pointer.
843-
/// @param offset The vector of 32-bit offsets in bytes.
1004+
/// @param offset The vector of 32-bit or 64-bit offsets in bytes.
8441005
/// @param src0 The first additional argument (new value).
8451006
/// @param src1 The second additional argument (expected value).
8461007
/// @param mask Operation mask, only locations with non-zero in the
8471008
/// corresponding mask element are updated.
8481009
/// @return A vector of the old values at the memory locations before the
8491010
/// update.
8501011
///
851-
template <atomic_op Op, typename Tx, int N>
852-
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
1012+
template <atomic_op Op, typename Tx, int N, typename Toffset>
1013+
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
8531014
simd<Tx, N> src0, simd<Tx, N> src1,
8541015
simd_mask<N> mask) {
1016+
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
8551017
if constexpr (Op == atomic_op::fcmpwr) {
8561018
// Auto-convert FP atomics to LSC version. Warning is given - see enum.
8571019
return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
@@ -867,6 +1029,31 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<unsigned, N> offset,
8671029
}
8681030
}
8691031

1032+
/// A variation of \c atomic_update API with \c offsets represented as
1033+
/// \c simd_view object.
1034+
///
1035+
/// @tparam Op The atomic operation - can be one of the following:
1036+
/// \c atomic_op::cmpxchg, \c atomic_op::fcmpwr.
1037+
/// @tparam Tx The vector element type.
1038+
/// @tparam N The number of memory locations to update.
1039+
/// @param p The USM pointer.
1040+
/// @param offsets The simd_view of 32-bit or 64-bit offsets in bytes.
1041+
/// @param src0 The first additional argument (new value).
1042+
/// @param src1 The second additional argument (expected value).
1043+
/// @param mask Operation mask, only locations with non-zero in the
1044+
/// corresponding mask element are updated.
1045+
/// @return A vector of the old values at the memory locations before the
1046+
/// update.
1047+
///
1048+
template <atomic_op Op, typename Tx, int N, typename Toffset,
1049+
typename RegionTy = region1d_t<Toffset, N, 1>>
1050+
__ESIMD_API simd<Tx, N>
1051+
atomic_update(Tx *p, simd_view<Toffset, RegionTy> offsets, simd<Tx, N> src0,
1052+
simd<Tx, N> src1, simd_mask<N> mask) {
1053+
using Ty = typename simd_view<Toffset, RegionTy>::element_type;
1054+
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, src1, mask);
1055+
}
1056+
8701057
/// @} sycl_esimd_memory_atomics
8711058

8721059
/// @addtogroup sycl_esimd_memory

0 commit comments

Comments
 (0)