Skip to content

Commit e371306

Browse files
authored
Simplify SVE vtable lookup (#494)
This has no impact on compression, but gives a small (0.5%) improvement to decompression performance.
1 parent 91e99dc commit e371306

File tree

2 files changed

+21
-30
lines changed

2 files changed

+21
-30
lines changed

Source/UnitTest/test_decode.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ TEST(decode, decode12x12)
6565

6666
status = astcenc_decompress_image(context, data, 16, &image, &swizzle, 0);
6767
EXPECT_EQ(status, ASTCENC_SUCCESS);
68-
68+
#if 0
6969
for (int y = 0; y < 12; y++)
7070
{
7171
for (int x = 0; x < 12; x++)
@@ -74,6 +74,7 @@ TEST(decode, decode12x12)
7474
printf("[%2dx%2d] = %03d, %03d, %03d, %03d\n", x, y, pixel[0], pixel[1], pixel[2], pixel[3]);
7575
}
7676
}
77+
#endif
7778
}
7879

7980
}

Source/astcenc_vecmathlib_sve_8.h

Lines changed: 19 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -910,22 +910,22 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
910910
* Table structure for a 16x 8-bit entry table.
911911
*/
912912
struct vtable8_16x8 {
913-
vint8 t0;
913+
svuint8_8_t t0;
914914
};
915915

916916
/*
917917
* Table structure for a 32x 8-bit entry table.
918918
*/
919919
struct vtable8_32x8 {
920-
vint8 t0;
920+
svuint8_8_t t0;
921921
};
922922

923923
/*
924924
* Table structure for a 64x 8-bit entry table.
925925
*/
926926
struct vtable8_64x8 {
927-
vint8 t0;
928-
vint8 t1;
927+
svuint8_8_t t0;
928+
svuint8_8_t t1;
929929
};
930930

931931
/**
@@ -936,7 +936,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
936936
const uint8_t* data
937937
) {
938938
// Top half of register will be zeros
939-
table.t0 = vint8(svld1_u8(svptrue_pat_b8(SV_VL16), data));
939+
table.t0 = svld1_u8(svptrue_pat_b8(SV_VL16), data);
940940
}
941941

942942
/**
@@ -946,7 +946,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
946946
vtable8_32x8& table,
947947
const uint8_t* data
948948
) {
949-
table.t0 = vint8(svld1_u8(svptrue_b8(), data));
949+
table.t0 = svld1_u8(svptrue_b8(), data);
950950
}
951951

952952
/**
@@ -956,8 +956,8 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
956956
vtable8_64x8& table,
957957
const uint8_t* data
958958
) {
959-
table.t0 = vint8(svld1_u8(svptrue_b8(), data));
960-
table.t1 = vint8(svld1_u8(svptrue_b8(), data + 32));
959+
table.t0 = svld1_u8(svptrue_b8(), data);
960+
table.t1 = svld1_u8(svptrue_b8(), data + 32);
961961
}
962962

963963
/**
@@ -969,11 +969,9 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
969969
) {
970970
// Set index byte above max index for unused bytes so table lookup returns zero
971971
svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
972-
973972
svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
974-
svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m);
975-
svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes);
976973

974+
svuint8_8_t result = svtbl_u8(tbl.t0, idx_bytes);
977975
return vint8(svreinterpret_s32_u8(result));
978976
}
979977

@@ -986,40 +984,32 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
986984
) {
987985
// Set index byte above max index for unused bytes so table lookup returns zero
988986
svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
989-
990987
svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
991-
svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m);
992-
svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes);
993988

989+
svuint8_8_t result = svtbl_u8(tbl.t0, idx_bytes);
994990
return vint8(svreinterpret_s32_u8(result));
995991
}
996992

997993
/**
998994
* @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
995+
*
996+
* Future: SVE2 can directly do svtbl2_u8() for a two register table.
999997
*/
1000998
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
1001999
const vtable8_64x8& tbl,
10021000
vint8 idx
10031001
) {
10041002
// Set index byte above max index for unused bytes so table lookup returns zero
1005-
svint32_8_t literal32 = svdup_s32(32);
1006-
svbool_8_t idx_lo_select = svcmplt(svptrue_b32(), idx.m, literal32);
1007-
svint32_8_t idx_lo_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
1008-
svint32_8_t idx_hi_masked = svorr_s32_x(svptrue_b32(), idx.m - literal32, svdup_s32(0xFFFFFF00));
1003+
svint32_8_t idxm = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
10091004

1010-
svuint8_8_t idx_lo_bytes = svreinterpret_u8_s32(idx_lo_masked);
1011-
svuint8_8_t idx_hi_bytes = svreinterpret_u8_s32(idx_hi_masked);
1005+
svuint8_8_t idxm8 = svreinterpret_u8_s32(idxm);
1006+
svuint8_8_t t0_lookup = svtbl_u8(tbl.t0, idxm8);
10121007

1013-
svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(tbl.t0.m);
1014-
svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(tbl.t1.m);
1008+
idxm8 = svsub_u8_x(svptrue_b8(), idxm8, svdup_u8(32));
1009+
svuint8_8_t t1_lookup = svtbl_u8(tbl.t1, idxm8);
10151010

1016-
svint32_8_t t0_lookup = svreinterpret_s32_u8(svtbl_u8(tbl0_bytes, idx_lo_bytes));
1017-
svint32_8_t t1_lookup = svreinterpret_s32_u8(svtbl_u8(tbl1_bytes, idx_hi_bytes));
1018-
1019-
svint32_8_t result = svsel_s32(idx_lo_select, t0_lookup, t1_lookup);
1020-
1021-
// Future: SVE2 can directly do svtbl2_u8() for a two register table
1022-
return vint8(result);
1011+
svuint8_8_t result = svorr_u8_x(svptrue_b8(), t0_lookup, t1_lookup); // 8-bit lane OR: predicate must cover every byte, "_x" leaves inactive lanes unspecified
1012+
return vint8(svreinterpret_s32_u8(result));
10231013
}
10241014

10251015
/**

0 commit comments

Comments
 (0)