@@ -506,7 +506,7 @@ __ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
506
506
// / R1 R2 ... Rn A1 A2 ... An
507
507
// / @endcode
508
508
// /
509
- // / @tparam Tx Element type of the returned vector. Must be 4 bytes in size.
509
+ // / @tparam T Element type of the returned vector. Must be 4 bytes in size.
510
510
// / @tparam N Number of pixels to access (matches the size of the \c offsets
511
511
// / vector). Must be 8, 16 or 32.
512
512
// / @tparam Mask A pixel's channel mask.
@@ -517,16 +517,26 @@ __ESIMD_API void scalar_store(AccessorTy acc, uint32_t offset, T val) {
517
517
// / undefined.
518
518
// / @return Read data - up to N*4 values of type \c T.
519
519
// /
520
- template <typename Tx, int N, rgba_channel_mask Mask,
521
- class T = detail::__raw_t <Tx>>
522
- __ESIMD_API std::enable_if_t <(N == 8 || N == 16 || N == 32 ) && (sizeof (T) == 4 ),
523
- simd<Tx, N * get_num_channels_enabled (Mask)>>
524
- gather_rgba(const Tx *p, simd<uint32_t , N> offsets, simd_mask<N> mask = 1 ) {
525
-
520
+ template <rgba_channel_mask RGBAMask, typename T, int N>
521
+ __ESIMD_API std::enable_if_t <(N == 8 || N == 16 || N == 32 ) && sizeof (T) == 4 ,
522
+ simd<T, N * get_num_channels_enabled (RGBAMask)>>
523
+ gather_rgba(const T *p, simd<uint32_t , N> offsets, simd_mask<N> mask = 1 ) {
526
524
simd<uint64_t , N> offsets_i = convert<uint64_t >(offsets);
527
525
simd<uint64_t , N> addrs (reinterpret_cast <uint64_t >(p));
528
526
addrs = addrs + offsets_i;
529
- return __esimd_svm_gather4_scaled<T, N, Mask>(addrs.data (), mask.data ());
527
+ return __esimd_svm_gather4_scaled<detail::__raw_t <T>, N, RGBAMask>(
528
+ addrs.data (), mask.data ());
529
+ }
530
+
531
+ template <typename T, int N, rgba_channel_mask RGBAMask>
532
+ __SYCL_DEPRECATED (" use gather_rgba<rgba_channel_mask>()" )
533
+ __ESIMD_API std::enable_if_t<
534
+ (N == 8 || N == 16 || N == 32 ) && sizeof(T) == 4,
535
+ simd<T, N * get_num_channels_enabled(
536
+ RGBAMask)>> gather_rgba(const T *p,
537
+ simd<uint32_t , N> offsets,
538
+ simd_mask<N> mask = 1 ) {
539
+ return gather_rgba<RGBAMask>(p, offsets, mask);
530
540
}
531
541
532
542
namespace detail {
@@ -541,35 +551,110 @@ template <rgba_channel_mask M> static void validate_rgba_write_channel_mask() {
541
551
// / @anchor usm_scatter_rgba
542
552
// / Transpose and scatter pixels to given memory locations defined by the base
543
553
// / pointer \c p and \c offsets. Up to 4 32-bit data elements may be accessed at
544
- // / each address depending on the channel mask \c Mask template parameter . Each
554
+ // / each address depending on the channel mask \c RGBAMask . Each
545
555
// / pixel's address must be 4 byte aligned. This is basically an inverse
546
556
// / operation for gather_rgba. Unlike \c gather_rgba, this function imposes
547
557
// / restrictions on possible \c RGBAMask template argument values. It can only be
548
558
// / one of the following: \c ABGR, \c BGR, \c GR, \c R.
549
559
// /
550
- // / @tparam Tx Element type of the returned vector. Must be 4 bytes in size.
560
+ // / @tparam T Element type of the written vector. Must be 4 bytes in size.
551
561
// / @tparam N Number of pixels to access (matches the size of the \c offsets
552
562
// / vector). Must be 8, 16 or 32.
553
- // / @tparam Mask A pixel's channel mask.
563
+ // / @tparam RGBAMask A pixel's channel mask.
554
564
// / @param p The USM base pointer representing memory address of the access.
555
565
// / @param vals values to be written.
556
566
// / @param offsets Byte offsets of the pixels relative to the base pointer.
557
567
// / @param mask Memory access mask. Pixels with zero corresponding mask's
558
568
// / predicate are not accessed. Their values in the resulting vector are
559
569
// / undefined.
560
570
// /
561
- template <typename Tx, int N, rgba_channel_mask Mask,
562
- class T = detail::__raw_t <Tx>>
563
- __ESIMD_API std::enable_if_t <(N == 8 || N == 16 || N == 32 ) && (sizeof (T) == 4 )>
564
- scatter_rgba (Tx *p, simd<uint32_t , N> offsets,
565
- simd<Tx, N * get_num_channels_enabled (Mask)> vals,
571
+ template <rgba_channel_mask RGBAMask, typename T, int N>
572
+ __ESIMD_API std::enable_if_t <(N == 8 || N == 16 || N == 32 ) && sizeof (T) == 4 >
573
+ scatter_rgba (T *p, simd<uint32_t , N> offsets,
574
+ simd<T, N * get_num_channels_enabled (RGBAMask)> vals,
566
575
simd_mask<N> mask = 1) {
567
- detail::validate_rgba_write_channel_mask<Mask >();
576
+ detail::validate_rgba_write_channel_mask<RGBAMask >();
568
577
simd<uint64_t , N> offsets_i = convert<uint64_t >(offsets);
569
578
simd<uint64_t , N> addrs (reinterpret_cast <uint64_t >(p));
570
579
addrs = addrs + offsets_i;
571
- __esimd_svm_scatter4_scaled<T, N, Mask>(addrs.data (), vals.data (),
572
- mask.data ());
580
+ __esimd_svm_scatter4_scaled<detail::__raw_t <T>, N, RGBAMask>(
581
+ addrs.data (), vals.data (), mask.data ());
582
+ }
583
+
584
+ template <typename T, int N, rgba_channel_mask RGBAMask>
585
+ __SYCL_DEPRECATED (" use scatter_rgba<rgba_channel_mask>()" )
586
+ __ESIMD_API std::
587
+ enable_if_t<(N == 8 || N == 16 || N == 32 ) && sizeof(T) == 4> scatter_rgba(
588
+ T *p, simd<uint32_t , N> offsets,
589
+ simd<T, N * get_num_channels_enabled (RGBAMask)> vals,
590
+ simd_mask<N> mask = 1) {
591
+ scatter_rgba<RGBAMask>(p, offsets, vals, mask);
592
+ }
593
+
594
+ // / Gather and transpose pixels from the given memory locations defined by the
595
+ // / base specified by \c acc, the global offset \c global_offset and a vector of
596
+ // / offsets \c offsets. Up to 4 32-bit data elements may be accessed at each
597
+ // / address depending on the channel mask \c RGBAMask. Each pixel's address must
598
+ // / be 4-byte aligned.
599
+ // / For usage examples, see \ref usm_gather_rgba above, the only difference
600
+ // / would be the usage of an accessor instead of a usm pointer.
601
+ // /
602
+ // / @tparam RGBAMask A pixel's channel mask.
603
+ // / @tparam AccessorT The accessor type for the memory to be loaded/gathered.
604
+ // / The returned vector elements match the accessor data type. The loaded
605
+ // / elements must be 4 bytes in size.
606
+ // / @tparam N Number of pixels to access (matches the size of the \c offsets
607
+ // / vector). Must be 8, 16 or 32.
608
+ // / @param acc The accessor representing memory address of the access.
609
+ // / @param offsets Byte offsets of the pixels relative to the base pointer.
610
+ // / @param global_offset Byte offset common to all accessed pixels.
611
+ // / @param mask Memory access mask. Pixels with zero corresponding mask's
612
+ // / predicate are not accessed. Their values in the resulting vector are
613
+ // / undefined.
614
+ // / @return Read data - up to N*4 values of type \c T.
615
+ // /
616
+ template <rgba_channel_mask RGBAMask, typename AccessorT, int N,
617
+ typename T = typename AccessorT::value_type>
618
+ __ESIMD_API std::enable_if_t <((N == 8 || N == 16 || N == 32 ) &&
619
+ sizeof(T) == 4 && !std::is_pointer_v<AccessorT>),
620
+ simd<T, N * get_num_channels_enabled(RGBAMask)>>
621
+ gather_rgba(AccessorT acc, simd<uint32_t , N> offsets,
622
+ uint32_t global_offset = 0 , simd_mask<N> mask = 1 ) {
623
+ // TODO (performance) use hardware-supported scale once BE supports it
624
+ constexpr uint32_t Scale = 0 ;
625
+ const auto SI = get_surface_index (acc);
626
+ return __esimd_gather4_masked_scaled2<detail::__raw_t <T>, N, RGBAMask,
627
+ decltype (SI), Scale>(
628
+ SI, global_offset, offsets.data (), mask.data ());
629
+ }
630
+
631
+ // / Scatter data to the memory addressed by accessor \c acc, offset common
632
+ // / for all stored elements \c global_offset and per-element offsets \c offsets.
633
+ // / See @ref usm_scatter_rgba for information about
634
+ // / the operation semantics and parameter restrictions/interdependencies.
635
+ // / @tparam RGBAMask Pixel's channel mask.
636
+ // / @tparam AccessorT The accessor type for the memory to be stored/scattered.
637
+ // / The written vector elements must match the accessor data type. The stored
638
+ // / elements must be 4 bytes in size.
639
+ // / @tparam N The number of elements to access.
640
+ // / @param offsets Byte offsets of each element.
641
+ // / @param vals values to be written.
642
+ // / @param global_offset Byte offset common to all accessed pixels.
643
+ // / @param mask Operation mask. All-1 by default.
644
+ // /
645
+ template <rgba_channel_mask RGBAMask, typename AccessorT, int N,
646
+ typename T = typename AccessorT::value_type>
647
+ __ESIMD_API std::enable_if_t <(N == 8 || N == 16 || N == 32 ) && sizeof(T) == 4 &&
648
+ !std::is_pointer_v<AccessorT>>
649
+ scatter_rgba(AccessorT acc, simd<uint32_t , N> offsets,
650
+ simd<T, N * get_num_channels_enabled (RGBAMask)> vals,
651
+ uint32_t global_offset = 0, simd_mask<N> mask = 1) {
652
+ detail::validate_rgba_write_channel_mask<RGBAMask>();
653
+ // TODO (performance) use hardware-supported scale once BE supports it
654
+ constexpr uint32_t Scale = 0 ;
655
+ const auto SI = get_surface_index (acc);
656
+ __esimd_scatter4_scaled<T, N, decltype (SI), RGBAMask, Scale>(
657
+ mask.data (), SI, global_offset, offsets.data (), vals.data ());
573
658
}
574
659
575
660
// / @} sycl_esimd_memory
@@ -871,19 +956,19 @@ __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
871
956
// / operation semantics and parameter restrictions/interdependencies.
872
957
// / @tparam T The element type of the returned vector.
873
958
// / @tparam N The number of elements to access.
874
- // / @tparam Mask Pixel's channel mask.
959
+ // / @tparam RGBAMask Pixel's channel mask.
875
960
// / @param offsets Byte offsets within the SLM of each element.
876
961
// / @param mask Operation mask. All-1 by default.
877
962
// / @return Gathered data as an \c N - element vector.
878
963
// /
879
- template <typename T, int N, rgba_channel_mask Mask >
964
+ template <typename T, int N, rgba_channel_mask RGBAMask >
880
965
__ESIMD_API std::enable_if_t <(N == 8 || N == 16 || N == 32 ) && (sizeof (T) == 4 ),
881
- simd<T, N * get_num_channels_enabled (Mask )>>
966
+ simd<T, N * get_num_channels_enabled (RGBAMask )>>
882
967
slm_gather_rgba(simd<uint32_t , N> offsets, simd_mask<N> mask = 1 ) {
883
968
884
- const auto si = __ESIMD_GET_SURF_HANDLE (detail::LocalAccessorMarker ());
885
- return __esimd_gather4_scaled <T, N, decltype (si), Mask >(
886
- mask. data (), si, 0 /* global_offset*/ , offsets.data ());
969
+ const auto SI = __ESIMD_GET_SURF_HANDLE (detail::LocalAccessorMarker ());
970
+ return __esimd_gather4_masked_scaled2 <T, N, RGBAMask >(
971
+ SI, 0 /* global_offset*/ , offsets. data (), mask .data ());
887
972
}
888
973
889
974
// / Gather data from the Shared Local Memory at specified \c offsets and return
0 commit comments