@@ -449,7 +449,8 @@ __ESIMD_API std::enable_if_t<!std::is_pointer<AccessorTy>::value,
449
449
lsc_gather (AccessorTy acc, __ESIMD_NS::simd<uint32_t , N> offsets,
450
450
__ESIMD_NS::simd_mask<N> pred = 1 ) {
451
451
#ifdef __ESIMD_FORCE_STATELESS_MEM
452
- return lsc_gather<T, N, DS, L1H>(acc.get_pointer ().get (), offsets, pred);
452
+ return lsc_gather<T, NElts, DS, L1H, L3H>(acc.get_pointer ().get (), offsets,
453
+ pred);
453
454
#else
454
455
detail::check_lsc_vector_size<NElts>();
455
456
detail::check_lsc_data_size<T, DS>();
@@ -478,11 +479,11 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
478
479
// / given address, where S is a byte size of an "element" defined by the \c DS
479
480
// / template parameter. The maximum size of accessed block is 512 bytes for PVC
480
481
// / and 256 bytes for ACM (DG2).
481
- // / When \? DS equals \? lsc_data_size::u64, the address must be 8-byte aligned,
482
+ // / When \c DS equals \c lsc_data_size::u64, the address must be 8-byte aligned,
482
483
// / otherwise it must be 4-byte aligned. Allowed values for the data size are
483
- // / \? lsc_data_size::u32 and \? lsc_data_size::u64. Allowed NElts values are
484
+ // / \c lsc_data_size::u32 and \c lsc_data_size::u64. Allowed NElts values are
484
485
// / 1, 2, 3, 4, 8, 16, 32, 64.
485
- // / Note that to access 512 bytes, DS must be \? lsc_data_size::u64 and \c NElts
486
+ // / Note that to access 512 bytes, DS must be \c lsc_data_size::u64 and \c NElts
486
487
// / must be 64.
487
488
// /
488
489
// / @tparam T is element type.
@@ -518,9 +519,19 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
518
519
constexpr detail::lsc_vector_size _VS =
519
520
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
520
521
if constexpr (SmallIntFactor == 1 ) {
521
- return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset,
522
- _DS, _VS, _Transposed, N>(pred.data (),
523
- addrs.data ());
522
+ if constexpr (_DS == lsc_data_size::u32 ) {
523
+ __ESIMD_NS::simd<uint32_t , NElts> result =
524
+ __esimd_lsc_load_stateless<uint32_t , L1H, L3H, _AddressScale,
525
+ _ImmOffset, lsc_data_size::u32 , _VS,
526
+ _Transposed, N>(pred.data (), addrs.data ());
527
+ return result.template bit_cast_view <T>();
528
+ } else {
529
+ __ESIMD_NS::simd<uint64_t , NElts> result =
530
+ __esimd_lsc_load_stateless<uint64_t , L1H, L3H, _AddressScale,
531
+ _ImmOffset, lsc_data_size::u64 , _VS,
532
+ _Transposed, N>(pred.data (), addrs.data ());
533
+ return result.template bit_cast_view <T>();
534
+ }
524
535
} else {
525
536
__ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> result =
526
537
__esimd_lsc_load_stateless<uint32_t , L1H, L3H, _AddressScale,
@@ -582,11 +593,20 @@ lsc_block_load(AccessorTy acc, uint32_t offset,
582
593
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
583
594
584
595
if constexpr (SmallIntFactor == 1 ) {
585
- return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
586
- _VS, _Transposed, N>(pred.data (),
587
- offsets.data (), si);
596
+ if constexpr (_DS == lsc_data_size::u32 ) {
597
+ __ESIMD_NS::simd<uint32_t , NElts> result =
598
+ __esimd_lsc_load_bti<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
599
+ lsc_data_size::u32 , _VS, _Transposed, N>(
600
+ pred.data (), offsets.data (), si);
601
+ return result.template bit_cast_view <T>();
602
+ } else {
603
+ __ESIMD_NS::simd<uint64_t , NElts> result =
604
+ __esimd_lsc_load_bti<uint64_t , L1H, L3H, _AddressScale, _ImmOffset,
605
+ lsc_data_size::u64 , _VS, _Transposed, N>(
606
+ pred.data (), offsets.data (), si);
607
+ return result.template bit_cast_view <T>();
608
+ }
588
609
} else {
589
-
590
610
__ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> result =
591
611
__esimd_lsc_load_bti<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
592
612
lsc_data_size::u32 , _VS, _Transposed, N>(
@@ -904,8 +924,8 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
904
924
__ESIMD_NS::simd<T, N * NElts> vals,
905
925
__ESIMD_NS::simd_mask<N> pred = 1 ) {
906
926
#ifdef __ESIMD_FORCE_STATELESS_MEM
907
- lsc_scatter<T, NElts, DS, L1H>(__ESIMD_DNS::accessorToPointer<T>(acc),
908
- offsets, pred);
927
+ lsc_scatter<T, NElts, DS, L1H, L3H >(__ESIMD_DNS::accessorToPointer<T>(acc),
928
+ offsets, vals , pred);
909
929
#else
910
930
detail::check_lsc_vector_size<NElts>();
911
931
detail::check_lsc_data_size<T, DS>();
@@ -967,13 +987,23 @@ __ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
967
987
constexpr detail::lsc_vector_size _VS =
968
988
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
969
989
if constexpr (SmallIntFactor == 1 ) {
970
-
971
- __esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
972
- _VS, _Transposed, N>(pred.data (), addrs.data (),
973
- vals.data ());
990
+ if constexpr (_DS == lsc_data_size::u32 ) {
991
+ __esimd_lsc_store_stateless<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
992
+ _DS, _VS, _Transposed, N>(
993
+ pred.data (), addrs.data (),
994
+ sycl::bit_cast<__ESIMD_DNS::vector_type_t <uint32_t , NElts>>(
995
+ vals.data ()));
996
+ } else {
997
+ __esimd_lsc_store_stateless<uint64_t , L1H, L3H, _AddressScale, _ImmOffset,
998
+ _DS, _VS, _Transposed, N>(
999
+ pred.data (), addrs.data (),
1000
+ sycl::bit_cast<__ESIMD_DNS::vector_type_t <uint64_t , NElts>>(
1001
+ vals.data ()));
1002
+ }
974
1003
} else {
975
- __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> tmp =
976
- vals.template bit_cast_view <uint32_t >();
1004
+ __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> tmp = sycl::bit_cast<
1005
+ __ESIMD_DNS::vector_type_t <uint32_t , NElts / SmallIntFactor>>(
1006
+ vals.data ());
977
1007
978
1008
__esimd_lsc_store_stateless<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
979
1009
lsc_data_size::u32 , _VS, _Transposed, N>(
@@ -1010,7 +1040,7 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
1010
1040
__ESIMD_NS::simd<T, NElts> vals,
1011
1041
__ESIMD_NS::simd_mask<1 > pred = 1 ) {
1012
1042
#ifdef __ESIMD_FORCE_STATELESS_MEM
1013
- lsc_block_store<T, NElts, DS, L1H>(
1043
+ lsc_block_store<T, NElts, DS, L1H, L3H >(
1014
1044
__ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred);
1015
1045
#else
1016
1046
detail::check_lsc_data_size<T, DS>();
@@ -1033,15 +1063,29 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
1033
1063
constexpr detail::lsc_vector_size _VS =
1034
1064
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
1035
1065
if constexpr (SmallIntFactor > 1 ) {
1036
- __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> Tmp =
1037
- vals.template bit_cast_view <uint32_t >();
1038
1066
__esimd_lsc_store_bti<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
1039
1067
lsc_data_size::u32 , _VS, _Transposed, N>(
1040
- pred.data (), offsets.data (), Tmp.data (), si);
1068
+ pred.data (), offsets.data (),
1069
+ sycl::bit_cast<
1070
+ __ESIMD_DNS::vector_type_t <uint32_t , NElts / SmallIntFactor>>(
1071
+ vals.data ()),
1072
+ si);
1041
1073
} else {
1042
- __esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
1043
- _Transposed, N>(pred.data (), offsets.data (),
1044
- vals.data (), si);
1074
+ if constexpr (_DS == lsc_data_size::u32 ) {
1075
+ __esimd_lsc_store_bti<uint32_t , L1H, L3H, _AddressScale, _ImmOffset, _DS,
1076
+ _VS, _Transposed, N>(
1077
+ pred.data (), offsets.data (),
1078
+ sycl::bit_cast<__ESIMD_DNS::vector_type_t <uint32_t , NElts>>(
1079
+ vals.data ()),
1080
+ si);
1081
+ } else {
1082
+ __esimd_lsc_store_bti<uint64_t , L1H, L3H, _AddressScale, _ImmOffset, _DS,
1083
+ _VS, _Transposed, N>(
1084
+ pred.data (), offsets.data (),
1085
+ sycl::bit_cast<__ESIMD_DNS::vector_type_t <uint64_t , NElts>>(
1086
+ vals.data ()),
1087
+ si);
1088
+ }
1045
1089
}
1046
1090
#endif
1047
1091
}
0 commit comments