@@ -910,22 +910,22 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
910
910
* Table structure for a 16x 8-bit entry table.
911
911
*/
912
912
struct vtable8_16x8 {
913
- vint8 t0;
913
+ svuint8_8_t t0;
914
914
};
915
915
916
916
/*
917
917
* Table structure for a 32x 8-bit entry table.
918
918
*/
919
919
struct vtable8_32x8 {
920
- vint8 t0;
920
+ svuint8_8_t t0;
921
921
};
922
922
923
923
/*
924
924
* Table structure for a 64x 8-bit entry table.
925
925
*/
926
926
struct vtable8_64x8 {
927
- vint8 t0;
928
- vint8 t1;
927
+ svuint8_8_t t0;
928
+ svuint8_8_t t1;
929
929
};
930
930
931
931
/**
@@ -936,7 +936,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
936
936
const uint8_t * data
937
937
) {
938
938
// Top half of register will be zeros
939
- table.t0 = vint8 ( svld1_u8 (svptrue_pat_b8 (SV_VL16), data) );
939
+ table.t0 = svld1_u8 (svptrue_pat_b8 (SV_VL16), data);
940
940
}
941
941
942
942
/**
@@ -946,7 +946,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
946
946
vtable8_32x8& table,
947
947
const uint8_t * data
948
948
) {
949
- table.t0 = vint8 ( svld1_u8 (svptrue_b8 (), data) );
949
+ table.t0 = svld1_u8 (svptrue_b8 (), data);
950
950
}
951
951
952
952
/**
@@ -956,8 +956,8 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
956
956
vtable8_64x8& table,
957
957
const uint8_t * data
958
958
) {
959
- table.t0 = vint8 ( svld1_u8 (svptrue_b8 (), data) );
960
- table.t1 = vint8 ( svld1_u8 (svptrue_b8 (), data + 32 ) );
959
+ table.t0 = svld1_u8 (svptrue_b8 (), data);
960
+ table.t1 = svld1_u8 (svptrue_b8 (), data + 32 );
961
961
}
962
962
963
963
/**
@@ -969,11 +969,9 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
969
969
) {
970
970
// Set index byte above max index for unused bytes so table lookup returns zero
971
971
svint32_8_t idx_masked = svorr_s32_x (svptrue_b32 (), idx.m , svdup_s32 (0xFFFFFF00 ));
972
-
973
972
svuint8_8_t idx_bytes = svreinterpret_u8_s32 (idx_masked);
974
- svuint8_8_t tbl_bytes = svreinterpret_u8_s32 (tbl.t0 .m );
975
- svuint8_8_t result = svtbl_u8 (tbl_bytes, idx_bytes);
976
973
974
+ svuint8_8_t result = svtbl_u8 (tbl.t0 , idx_bytes);
977
975
return vint8 (svreinterpret_s32_u8 (result));
978
976
}
979
977
@@ -986,40 +984,32 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
986
984
) {
987
985
// Set index byte above max index for unused bytes so table lookup returns zero
988
986
svint32_8_t idx_masked = svorr_s32_x (svptrue_b32 (), idx.m , svdup_s32 (0xFFFFFF00 ));
989
-
990
987
svuint8_8_t idx_bytes = svreinterpret_u8_s32 (idx_masked);
991
- svuint8_8_t tbl_bytes = svreinterpret_u8_s32 (tbl.t0 .m );
992
- svuint8_8_t result = svtbl_u8 (tbl_bytes, idx_bytes);
993
988
989
+ svuint8_8_t result = svtbl_u8 (tbl.t0 , idx_bytes);
994
990
return vint8 (svreinterpret_s32_u8 (result));
995
991
}
996
992
997
993
/* *
998
994
* @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
995
+ *
996
+ * Future: SVE2 can directly do svtbl2_u8() for a two register table.
999
997
*/
1000
998
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit (
1001
999
const vtable8_64x8& tbl,
1002
1000
vint8 idx
1003
1001
) {
1004
1002
// Set index byte above max index for unused bytes so table lookup returns zero
1005
- svint32_8_t literal32 = svdup_s32 (32 );
1006
- svbool_8_t idx_lo_select = svcmplt (svptrue_b32 (), idx.m , literal32);
1007
- svint32_8_t idx_lo_masked = svorr_s32_x (svptrue_b32 (), idx.m , svdup_s32 (0xFFFFFF00 ));
1008
- svint32_8_t idx_hi_masked = svorr_s32_x (svptrue_b32 (), idx.m - literal32, svdup_s32 (0xFFFFFF00 ));
1003
+ svint32_8_t idxm = svorr_s32_x (svptrue_b32 (), idx.m , svdup_s32 (0xFFFFFF00 ));
1009
1004
1010
- svuint8_8_t idx_lo_bytes = svreinterpret_u8_s32 (idx_lo_masked );
1011
- svuint8_8_t idx_hi_bytes = svreinterpret_u8_s32 (idx_hi_masked );
1005
+ svuint8_8_t idxm8 = svreinterpret_u8_s32 (idxm );
1006
+ svuint8_8_t t0_lookup = svtbl_u8 (tbl. t0 , idxm8 );
1012
1007
1013
- svuint8_8_t tbl0_bytes = svreinterpret_u8_s32 (tbl. t0 . m );
1014
- svuint8_8_t tbl1_bytes = svreinterpret_u8_s32 (tbl.t1 . m );
1008
+ idxm8 = svsub_u8_x ( svptrue_b8 (), idxm8, svdup_u8 ( 32 ) );
1009
+ svuint8_8_t t1_lookup = svtbl_u8 (tbl.t1 , idxm8 );
1015
1010
1016
- svint32_8_t t0_lookup = svreinterpret_s32_u8 (svtbl_u8 (tbl0_bytes, idx_lo_bytes));
1017
- svint32_8_t t1_lookup = svreinterpret_s32_u8 (svtbl_u8 (tbl1_bytes, idx_hi_bytes));
1018
-
1019
- svint32_8_t result = svsel_s32 (idx_lo_select, t0_lookup, t1_lookup);
1020
-
1021
- // Future: SVE2 can directly do svtbl2_u8() for a two register table
1022
- return vint8 (result);
1011
+ svuint8_8_t result = svorr_u8_x (svptrue_b32 (), t0_lookup, t1_lookup);
1012
+ return vint8 (svreinterpret_s32_u8 (result));
1023
1013
}
1024
1014
1025
1015
/**
0 commit comments