Skip to content

Commit 0b4c123

Browse files
authored
[ESIMD] Implement accessor based gather_rgba/scatter_rgba (#6131)
* [ESIMD] Implement accessor based gather_rgba/scatter_rgba Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
1 parent f232cfd commit 0b4c123

File tree

4 files changed

+129
-34
lines changed

4 files changed

+129
-34
lines changed

llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,15 @@ class ESIMDIntrinDescTable {
418418
{"gather_masked_scaled2",
419419
{"gather.masked.scaled2", {t(3), t(4), aSI(0), a(1), a(2), ai1(3)}}},
420420

421+
// arg0: i32 channel mask, CONSTANT
422+
// arg1: i16 scale, CONSTANT
423+
// arg2: i32 surface index
424+
// arg3: i32 global offset in bytes
425+
// arg4: vXi32 element offset in bytes
426+
// arg5: vXi1 predicate (overloaded)
427+
{"gather4_masked_scaled2",
428+
{"gather4.masked.scaled2", {t(2), t(4), aSI(0), a(1), a(2), ai1(3)}}},
429+
421430
// arg0: vXi1 predicate (overloaded)
422431
// arg1: i32 log2 num blocks, CONSTANT (0/1/2 for num blocks 1/2/4)
423432
// arg2: i16 scale, CONSTANT

sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -738,14 +738,15 @@ __esimd_oword_ld(SurfIndAliasTy surf_ind, uint32_t addr)
738738
}
739739
#endif // __SYCL_DEVICE_ONLY__
740740

741-
// gather4 scaled from a surface/SLM
742-
template <typename Ty, int N, typename SurfIndAliasTy,
743-
__ESIMD_NS::rgba_channel_mask Mask, int16_t Scale = 0>
741+
// gather4 scaled masked from a surface/SLM
742+
template <typename Ty, int N, __ESIMD_NS::rgba_channel_mask Mask,
743+
typename SurfIndAliasTy, int16_t Scale = 0>
744744
__ESIMD_INTRIN
745745
__ESIMD_DNS::vector_type_t<Ty, N * get_num_channels_enabled(Mask)>
746-
__esimd_gather4_scaled(__ESIMD_DNS::simd_mask_storage_t<N> pred,
747-
SurfIndAliasTy surf_ind, int global_offset,
748-
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets)
746+
__esimd_gather4_masked_scaled2(
747+
SurfIndAliasTy surf_ind, int global_offset,
748+
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
749+
__ESIMD_DNS::simd_mask_storage_t<N> pred)
749750
#ifdef __SYCL_DEVICE_ONLY__
750751
;
751752
#else

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 110 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,7 @@ __ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
506506
/// R1 R2 ... Rn A1 A2 ... An
507507
/// @endcode
508508
///
509-
/// @tparam Tx Element type of the returned vector. Must be 4 bytes in size.
509+
/// @tparam T Element type of the returned vector. Must be 4 bytes in size.
510510
/// @tparam N Number of pixels to access (matches the size of the \c offsets
511511
/// vector). Must be 8, 16 or 32.
512512
/// @tparam Mask A pixel's channel mask.
@@ -517,16 +517,26 @@ __ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
517517
/// undefined.
518518
/// @return Read data - up to N*4 values of type \c T.
519519
///
520-
template <typename Tx, int N, rgba_channel_mask Mask,
521-
class T = detail::__raw_t<Tx>>
522-
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
523-
simd<Tx, N * get_num_channels_enabled(Mask)>>
524-
gather_rgba(const Tx *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
525-
520+
template <rgba_channel_mask RGBAMask, typename T, int N>
521+
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
522+
simd<T, N * get_num_channels_enabled(RGBAMask)>>
523+
gather_rgba(const T *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
526524
simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
527525
simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
528526
addrs = addrs + offsets_i;
529-
return __esimd_svm_gather4_scaled<T, N, Mask>(addrs.data(), mask.data());
527+
return __esimd_svm_gather4_scaled<detail::__raw_t<T>, N, RGBAMask>(
528+
addrs.data(), mask.data());
529+
}
530+
531+
template <typename T, int N, rgba_channel_mask RGBAMask>
532+
__SYCL_DEPRECATED("use gather_rgba<rgba_channel_mask>()")
533+
__ESIMD_API std::enable_if_t<
534+
(N == 8 || N == 16 || N == 32) && sizeof(T) == 4,
535+
simd<T, N * get_num_channels_enabled(
536+
RGBAMask)>> gather_rgba(const T *p,
537+
simd<uint32_t, N> offsets,
538+
simd_mask<N> mask = 1) {
539+
return gather_rgba<RGBAMask>(p, offsets, mask);
530540
}
531541

532542
namespace detail {
@@ -541,35 +551,110 @@ template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
541551
/// @anchor usm_scatter_rgba
542552
/// Transpose and scatter pixels to given memory locations defined by the base
543553
/// pointer \c p and \c offsets. Up to 4 32-bit data elements may be accessed at
544-
/// each address depending on the channel mask \c Mask template parameter. Each
554+
/// each address depending on the channel mask \c RGBAMask. Each
545555
/// pixel's address must be 4 byte aligned. This is basically an inverse
546556
/// operation for gather_rgba. Unlike \c gather_rgba, this function imposes
547557
/// restrictions on possible \c RGBAMask template argument values. It can only be
548558
/// one of the following: \c ABGR, \c BGR, \c GR, \c R.
549559
///
550-
/// @tparam Tx Element type of the returned vector. Must be 4 bytes in size.
560+
/// @tparam T Element type of the returned vector. Must be 4 bytes in size.
551561
/// @tparam N Number of pixels to access (matches the size of the \c offsets
552562
/// vector). Must be 8, 16 or 32.
553-
/// @tparam Mask A pixel's channel mask.
563+
/// @tparam RGBAMask A pixel's channel mask.
554564
/// @param p The USM base pointer representing memory address of the access.
555565
/// @param vals values to be written.
556566
/// @param offsets Byte offsets of the pixels relative to the base pointer.
557567
/// @param mask Memory access mask. Pixels with zero corresponding mask's
558568
/// predicate are not accessed. Their values in the resulting vector are
559569
/// undefined.
560570
///
561-
template <typename Tx, int N, rgba_channel_mask Mask,
562-
class T = detail::__raw_t<Tx>>
563-
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)>
564-
scatter_rgba(Tx *p, simd<uint32_t, N> offsets,
565-
simd<Tx, N * get_num_channels_enabled(Mask)> vals,
571+
template <rgba_channel_mask RGBAMask, typename T, int N>
572+
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4>
573+
scatter_rgba(T *p, simd<uint32_t, N> offsets,
574+
simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
566575
simd_mask<N> mask = 1) {
567-
detail::validate_rgba_write_channel_mask<Mask>();
576+
detail::validate_rgba_write_channel_mask<RGBAMask>();
568577
simd<uint64_t, N> offsets_i = convert<uint64_t>(offsets);
569578
simd<uint64_t, N> addrs(reinterpret_cast<uint64_t>(p));
570579
addrs = addrs + offsets_i;
571-
__esimd_svm_scatter4_scaled<T, N, Mask>(addrs.data(), vals.data(),
572-
mask.data());
580+
__esimd_svm_scatter4_scaled<detail::__raw_t<T>, N, RGBAMask>(
581+
addrs.data(), vals.data(), mask.data());
582+
}
583+
584+
template <typename T, int N, rgba_channel_mask RGBAMask>
585+
__SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
586+
__ESIMD_API std::
587+
enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4> scatter_rgba(
588+
T *p, simd<uint32_t, N> offsets,
589+
simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
590+
simd_mask<N> mask = 1) {
591+
scatter_rgba<RGBAMask>(p, offsets, vals, mask);
592+
}
593+
594+
/// Gather and transpose pixels from the given memory locations defined by the
595+
/// base specified by \c acc, the global offset \c global_offset and a vector of
596+
/// offsets \c offsets. Up to 4 32-bit data elements may be accessed at each
597+
/// address depending on the channel mask \c RGBAMask. Each pixel's address must
598+
/// be 4-byte aligned.
599+
/// For usage examples, see \ref usm_gather_rgba above, the only difference
600+
/// would be the usage of an accessor instead of a usm pointer.
601+
///
602+
/// @tparam RGBAMask A pixel's channel mask.
603+
/// @tparam AccessorT The accessor type for the memory to be loaded/gathered.
604+
/// The returned vector elements match the accessor data type. The loaded
605+
/// elements must be 4 bytes in size.
606+
/// @tparam N Number of pixels to access (matches the size of the \c offsets
607+
/// vector). Must be 8, 16 or 32.
608+
/// @param acc The accessor representing memory address of the access.
609+
/// @param offsets Byte offsets of the pixels relative to the base pointer.
610+
/// @param global_offset Byte offset common to all accessed pixels.
611+
/// @param mask Memory access mask. Pixels with zero corresponding mask's
612+
/// predicate are not accessed. Their values in the resulting vector are
613+
/// undefined.
614+
/// @return Read data - up to N*4 values of type \c T.
615+
///
616+
template <rgba_channel_mask RGBAMask, typename AccessorT, int N,
617+
typename T = typename AccessorT::value_type>
618+
__ESIMD_API std::enable_if_t<((N == 8 || N == 16 || N == 32) &&
619+
sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
620+
simd<T, N * get_num_channels_enabled(RGBAMask)>>
621+
gather_rgba(AccessorT acc, simd<uint32_t, N> offsets,
622+
uint32_t global_offset = 0, simd_mask<N> mask = 1) {
623+
// TODO (performance) use hardware-supported scale once BE supports it
624+
constexpr uint32_t Scale = 0;
625+
const auto SI = get_surface_index(acc);
626+
return __esimd_gather4_masked_scaled2<detail::__raw_t<T>, N, RGBAMask,
627+
decltype(SI), Scale>(
628+
SI, global_offset, offsets.data(), mask.data());
629+
}
630+
631+
/// Scatter data to the memory addressed by accessor \c acc, offset common
632+
/// for all stored elements \c global_offset and per-element offsets \c offsets.
633+
/// See @ref usm_scatter_rgba for information about
634+
/// the operation semantics and parameter restrictions/interdependencies.
635+
/// @tparam RGBAMask Pixel's channel mask.
636+
/// @tparam AccessorT The accessor type for the memory to be stored/scattered.
637+
/// The stored vector elements must match the accessor data type. The stored
638+
/// elements must be 4 bytes in size.
639+
/// @tparam N The number of elements to access.
640+
/// @param offsets Byte offsets of each element.
641+
/// @param vals values to be written.
642+
/// @param global_offset Byte offset common to all accessed pixels.
643+
/// @param mask Operation mask. All-1 by default.
644+
///
645+
template <rgba_channel_mask RGBAMask, typename AccessorT, int N,
646+
typename T = typename AccessorT::value_type>
647+
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && sizeof(T) == 4 &&
648+
!std::is_pointer_v<AccessorT>>
649+
scatter_rgba(AccessorT acc, simd<uint32_t, N> offsets,
650+
simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
651+
uint32_t global_offset = 0, simd_mask<N> mask = 1) {
652+
detail::validate_rgba_write_channel_mask<RGBAMask>();
653+
// TODO (performance) use hardware-supported scale once BE supports it
654+
constexpr uint32_t Scale = 0;
655+
const auto SI = get_surface_index(acc);
656+
__esimd_scatter4_scaled<T, N, decltype(SI), RGBAMask, Scale>(
657+
mask.data(), SI, global_offset, offsets.data(), vals.data());
573658
}
574659

575660
/// @} sycl_esimd_memory
@@ -871,19 +956,19 @@ __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
871956
/// operation semantics and parameter restrictions/interdependencies.
872957
/// @tparam T The element type of the returned vector.
873958
/// @tparam N The number of elements to access.
874-
/// @tparam Mask Pixel's channel mask.
959+
/// @tparam RGBAMask Pixel's channel mask.
875960
/// @param offsets Byte offsets within the SLM of each element.
876961
/// @param mask Operation mask. All-1 by default.
877962
/// @return Gathered data as an \c N - element vector.
878963
///
879-
template <typename T, int N, rgba_channel_mask Mask>
964+
template <typename T, int N, rgba_channel_mask RGBAMask>
880965
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
881-
simd<T, N * get_num_channels_enabled(Mask)>>
966+
simd<T, N * get_num_channels_enabled(RGBAMask)>>
882967
slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
883968

884-
const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
885-
return __esimd_gather4_scaled<T, N, decltype(si), Mask>(
886-
mask.data(), si, 0 /*global_offset*/, offsets.data());
969+
const auto SI = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker());
970+
return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
971+
SI, 0 /*global_offset*/, offsets.data(), mask.data());
887972
}
888973

889974
/// Gather data from the Shared Local Memory at specified \c offsets and return

sycl/test/esimd/gather_scatter_rgba.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ void kernel(int *ptr) SYCL_ESIMD_FUNCTION {
1717
simd<uint32_t, 32> offsets(0, sizeof(int) * 4);
1818
simd<int, 32 * 4> v1(0, 1);
1919

20-
auto v0 = gather_rgba<int, 32, rgba_channel_mask::ABGR>(ptr, offsets);
20+
auto v0 = gather_rgba<rgba_channel_mask::ABGR>(ptr, offsets);
2121

2222
v0 = v0 + v1;
2323

24-
scatter_rgba<int, 32, rgba_channel_mask::ABGR>(ptr, offsets, v0);
24+
scatter_rgba<rgba_channel_mask::ABGR>(ptr, offsets, v0);
2525
}
2626

2727
constexpr int AGR_N_CHANNELS = 3;
@@ -33,5 +33,5 @@ void kernel1(int *ptr, simd<int, 32 * AGR_N_CHANNELS> v) SYCL_ESIMD_FUNCTION {
3333
// expected-error-re@* {{static_assert failed{{.*}}Only ABGR, BGR, GR, R channel masks are valid in write operations}}
3434
// expected-note@* {{in instantiation }}
3535
// expected-note@+1 {{in instantiation }}
36-
scatter_rgba<int, 32, rgba_channel_mask::AGR>(ptr, offsets, v);
36+
scatter_rgba<rgba_channel_mask::AGR>(ptr, offsets, v);
3737
}

0 commit comments

Comments
 (0)