From 1e748f916a78277749af2a6d44e30f9a1fd9ee65 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Fri, 9 Aug 2024 23:02:39 +0100 Subject: [PATCH 1/8] New vtable API for x86 --- Source/UnitTest/test_simd.cpp | 183 ++++++++++++------ Source/astcenc_decompress_symbolic.cpp | 27 +-- .../astcenc_ideal_endpoints_and_weights.cpp | 19 +- Source/astcenc_vecmathlib.h | 12 ++ Source/astcenc_vecmathlib_avx2_8.h | 128 +++++++----- Source/astcenc_vecmathlib_none_4.h | 120 ++++++------ Source/astcenc_vecmathlib_sse_4.h | 129 +++++++----- Test/astc_profile_valgrind.py | 2 +- 8 files changed, 388 insertions(+), 232 deletions(-) diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp index f857c3550..dcfa4e644 100644 --- a/Source/UnitTest/test_simd.cpp +++ b/Source/UnitTest/test_simd.cpp @@ -1947,43 +1947,78 @@ TEST(vmask4, not) } /** @brief Test vint4 table permute. */ -TEST(vint4, vtable_8bt_32bi_32entry) +TEST(vint4, vtable4_16x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); + uint8_t data[16] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + }; - vint4 table0p, table1p; - vtable_prepare(table0, table1, table0p, table1p); + vtable4_16x8 table; + vtable_prepare(table, data); - vint4 index(0, 7, 4, 31); + vint4 index(0, 7, 4, 15); - vint4 result = vtable_8bt_32bi(table0p, table1p, index); + vint4 result = vtable_lookup(table, index); - EXPECT_EQ(result.lane<0>(), 3); - EXPECT_EQ(result.lane<1>(), 4); - EXPECT_EQ(result.lane<2>(), 7); - EXPECT_EQ(result.lane<3>(), 28); + EXPECT_EQ(result.lane<0>(), 0); + EXPECT_EQ(result.lane<1>(), 7); + EXPECT_EQ(result.lane<2>(), 4); + EXPECT_EQ(result.lane<3>(), 15); } /** @brief Test vint4 table permute. */ -TEST(vint4, vtable_8bt_32bi_64entry) +TEST(vint4, vtable4_32x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); - vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f); - vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f); + uint8_t data[32] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + }; + + vtable4_32x8 table; + vtable_prepare(table, data); + + vint4 index(0, 7, 4, 31); + + vint4 result = vtable_lookup(table, index); - vint4 table0p, table1p, table2p, table3p; - vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p); + EXPECT_EQ(result.lane<0>(), 0); + EXPECT_EQ(result.lane<1>(), 7); + EXPECT_EQ(result.lane<2>(), 4); + EXPECT_EQ(result.lane<3>(), 31); +} + +/** @brief Test vint4 table permute. 
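+ *
+ * A minimal sketch of the new 64-entry API exercised below (names as
+ * introduced by this patch); the data array is an identity table, so each
+ * lookup is expected to return its own index:
+ *
+ *     uint8_t data[64] = { ... };        // bytes 0x00 .. 0x3f
+ *     vtable4_64x8 table;
+ *     vtable_prepare(table, data);       // pack the raw bytes for lookups
+ *     vint4 result = vtable_lookup(table, vint4(0, 7, 38, 63));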
*/ +TEST(vint4, vtable4_64x8) +{ + uint8_t data[64] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f + }; + + vtable4_64x8 table; + vtable_prepare(table, data); vint4 index(0, 7, 38, 63); - vint4 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index); + vint4 result = vtable_lookup(table, index); + + uint8_t* hack = reinterpret_cast(&table); + std::cout << "38: " << hack[38] << "\n"; + std::cout << "63: " << hack[63] << "\n"; - EXPECT_EQ(result.lane<0>(), 3); - EXPECT_EQ(result.lane<1>(), 4); - EXPECT_EQ(result.lane<2>(), 37); - EXPECT_EQ(result.lane<3>(), 60); + EXPECT_EQ(result.lane<0>(), 0); + EXPECT_EQ(result.lane<1>(), 7); + EXPECT_EQ(result.lane<2>(), 38); + EXPECT_EQ(result.lane<3>(), 63); } /** @brief Test vint4 rgba byte interleave. */ @@ -3657,57 +3692,95 @@ TEST(vmask8, not) } /** @brief Test vint8 table permute. */ -TEST(vint8, vtable_8bt_32bi_32entry) +TEST(vint8, vtable8_16x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); + uint8_t data[16] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + }; - vint8 table0p, table1p; - vtable_prepare(table0, table1, table0p, table1p); + vtable8_16x8 table; + vtable_prepare(table, data); - vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31); + vint8 index = vint8_lit(0, 7, 4, 15, 1, 2, 14, 4); - vint8 result = vtable_8bt_32bi(table0p, table1p, index); + vint8 result = vtable_lookup(table, index); alignas(32) int ra[8]; store(result, ra); - EXPECT_EQ(ra[0], 3); - EXPECT_EQ(ra[1], 4); - EXPECT_EQ(ra[2], 7); - EXPECT_EQ(ra[3], 12); - EXPECT_EQ(ra[4], 19); - EXPECT_EQ(ra[5], 23); - EXPECT_EQ(ra[6], 20); - EXPECT_EQ(ra[7], 28); + EXPECT_EQ(ra[0], 0); + EXPECT_EQ(ra[1], 7); + EXPECT_EQ(ra[2], 4); + EXPECT_EQ(ra[3], 15); + EXPECT_EQ(ra[4], 1); + EXPECT_EQ(ra[5], 2); + EXPECT_EQ(ra[6], 14); + EXPECT_EQ(ra[7], 4); } -/** @brief Test vint4 table permute. */ -TEST(vint8, vtable_8bt_32bi_64entry) +/** @brief Test vint8 table permute. */ +TEST(vint8, vtable8_32x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); - vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f); - vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f); + uint8_t data[32] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + }; + + vtable8_32x8 table; + vtable_prepare(table, data); + + vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31); - vint8 table0p, table1p, table2p, table3p; - vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p); + vint8 result = vtable_lookup(table, index); + + alignas(32) int ra[8]; + store(result, ra); + + EXPECT_EQ(ra[0], 0); + EXPECT_EQ(ra[1], 7); + EXPECT_EQ(ra[2], 4); + EXPECT_EQ(ra[3], 15); + EXPECT_EQ(ra[4], 16); + EXPECT_EQ(ra[5], 20); + EXPECT_EQ(ra[6], 23); + EXPECT_EQ(ra[7], 31); +} + +/** @brief Test vint8 table permute. 
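+ *
+ * The 8-wide variant follows the same pattern, assuming the vint8 type
+ * provided by the AVX2 and SVE backends:
+ *
+ *     vtable8_64x8 table;
+ *     vtable_prepare(table, data);       // data points at a 64-byte table like the one below
+ *     vint8 result = vtable_lookup(table, vint8_lit(0, 7, 4, 15, 16, 20, 38, 63));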
*/ +TEST(vint8, vtable8_64x8) +{ + uint8_t data[64] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f + }; + + vtable8_64x8 table; + vtable_prepare(table, data); vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 38, 63); - vint8 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index); + vint8 result = vtable_lookup(table, index); alignas(32) int ra[8]; store(result, ra); - EXPECT_EQ(ra[0], 3); - EXPECT_EQ(ra[1], 4); - EXPECT_EQ(ra[2], 7); - EXPECT_EQ(ra[3], 12); - EXPECT_EQ(ra[4], 19); - EXPECT_EQ(ra[5], 23); - EXPECT_EQ(ra[6], 37); - EXPECT_EQ(ra[7], 60); + EXPECT_EQ(ra[0], 0); + EXPECT_EQ(ra[1], 7); + EXPECT_EQ(ra[2], 4); + EXPECT_EQ(ra[3], 15); + EXPECT_EQ(ra[4], 16); + EXPECT_EQ(ra[5], 20); + EXPECT_EQ(ra[6], 38); + EXPECT_EQ(ra[7], 63); } #endif diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp index 902a3f3e9..ef9165db8 100644 --- a/Source/astcenc_decompress_symbolic.cpp +++ b/Source/astcenc_decompress_symbolic.cpp @@ -98,13 +98,8 @@ void unpack_weights( if (!is_dual_plane) { // Build full 64-entry weight lookup table - vint4 tab0 = vint4::load(scb.weights + 0); - vint4 tab1 = vint4::load(scb.weights + 16); - vint4 tab2 = vint4::load(scb.weights + 32); - vint4 tab3 = vint4::load(scb.weights + 48); - - vint tab0p, tab1p, tab2p, tab3p; - vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p); + vtable_64x8 table; + vtable_prepare(table, scb.weights); for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) { @@ -118,7 +113,7 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int; + summed_value += vtable_lookup(table, texel_weights) * texel_weights_int; } store(lsr<4>(summed_value), weights_plane1 + i); @@ -128,16 +123,12 @@ void unpack_weights( { // Build a 32-entry weight lookup table per plane // Plane 1 - vint4 tab0_plane1 = vint4::load(scb.weights + 0); - vint4 tab1_plane1 = vint4::load(scb.weights + 16); - vint tab0_plane1p, tab1_plane1p; - vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p); + vtable_32x8 tab_plane1; + vtable_prepare(tab_plane1, scb.weights); // Plane 2 - vint4 tab0_plane2 = vint4::load(scb.weights + 32); - vint4 tab1_plane2 = vint4::load(scb.weights + 48); - vint tab0_plane2p, tab1_plane2p; - vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p); + vtable_32x8 tab_plane2; + vtable_prepare(tab_plane2, scb.weights + 32); for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) { @@ -153,8 +144,8 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int; - sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int; + sum_plane1 += vtable_lookup(tab_plane1, texel_weights) * texel_weights_int; + sum_plane2 += vtable_lookup(tab_plane2, texel_weights) * texel_weights_int; } store(lsr<4>(sum_plane1), weights_plane1 + i); diff 
--git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index 3442464d5..8a09c6f67 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -1023,9 +1023,8 @@ void compute_quantized_weights_for_decimation( // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements if (get_quant_level(quant_level) <= 16) { - vint4 tab0 = vint4::load(qat.quant_to_unquant); - vint tab0p; - vtable_prepare(tab0, tab0p); + vtable_16x8 table; + vtable_prepare(table, qat.quant_to_unquant); for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { @@ -1038,8 +1037,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_8bt_32bi(tab0p, weightl); - vint ixhi = vtable_8bt_32bi(tab0p, weighth); + vint ixli = vtable_lookup(table, weightl); + vint ixhi = vtable_lookup(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); @@ -1055,10 +1054,8 @@ void compute_quantized_weights_for_decimation( } else { - vint4 tab0 = vint4::load(qat.quant_to_unquant + 0); - vint4 tab1 = vint4::load(qat.quant_to_unquant + 16); - vint tab0p, tab1p; - vtable_prepare(tab0, tab1, tab0p, tab1p); + vtable_32x8 table; + vtable_prepare(table, qat.quant_to_unquant); for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { @@ -1071,8 +1068,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); - vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); + vint ixli = vtable_lookup(table, weightl); + vint ixhi = vtable_lookup(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h index 628755619..07b933206 100644 --- a/Source/astcenc_vecmathlib.h +++ b/Source/astcenc_vecmathlib.h @@ -96,6 +96,10 @@ using vint = vint8; using vmask = vmask8; + using vtable_16x8 = vtable8_16x8; + using vtable_32x8 = vtable8_32x8; + using vtable_64x8 = vtable8_64x8; + constexpr auto loada = vfloat8::loada; constexpr auto load1 = vfloat8::load1; @@ -111,6 +115,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; @@ -185,6 +193,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; #endif diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index 347abf83c..c3ebbba29 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ -971,98 +971,138 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) return vfloat8(_mm256_castsi256_ps(a.m)); } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable8_16x8 { + vint8 t0; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable8_32x8 { + vint8 t0; + vint8 t1; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable8_64x8 { + vint8 t0; + vint8 t1; + vint8 t2; + vint8 t3; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. 
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_16x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_32x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + // Direct lookup for first row + vint4 d0 = vint4::load(data); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); - __m128i t1n = _mm_xor_si128(t0.m, t1.m); - t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); + // XOR with previous rows for subsequent rows + vint4 d1 = vint4::load(data + 16); + d1 = d1 ^ d0; + table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); + vtable8_64x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + vint4 d1 = vint4::load(data + 16); + vint4 d2 = vint4::load(data + 32); + vint4 d3 = vint4::load(data + 48); - __m128i t1n = _mm_xor_si128(t0.m, t1.m); - t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); + table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); + table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m)); + table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m)); - __m128i t2n = _mm_xor_si128(t1.m, t2.m); - t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n)); - - __m128i t3n = _mm_xor_si128(t2.m, t3.m); - t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n)); + table.t3 = table.t3 ^ table.t2; + table.t2 = table.t2 ^ table.t1; + table.t1 = table.t1 ^ table.t0; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_16x8& tbl, + vint8 idx +) { // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast(0xFFFFFF00))); - __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx); return vint8(result); } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. 
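+ *
+ * Note that _mm256_shuffle_epi8 only honours the low four bits of each index
+ * byte unless the byte MSB is set, so indices 16..31 alias back into t0. The
+ * prepare step stores t1 as (row1 ^ row0), which lets the two shuffle results
+ * be XOR-ed together: the aliased t0 byte cancels and the row1 byte remains.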
*/ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_32x8& tbl, + vint8 idx +) { // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast(0xFFFFFF00))); - __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - __m256i result2 = _mm256_shuffle_epi8(t1.m, idxx); + __m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx); result = _mm256_xor_si256(result, result2); return vint8(result); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_64x8& tbl, + vint8 idx +) { // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast(0xFFFFFF00))); - __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - __m256i result2 = _mm256_shuffle_epi8(t1.m, idxx); + __m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx); result = _mm256_xor_si256(result, result2); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - result2 = _mm256_shuffle_epi8(t2.m, idxx); + result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx); result = _mm256_xor_si256(result, result2); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - result2 = _mm256_shuffle_epi8(t3.m, idxx); + result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx); result = _mm256_xor_si256(result, result2); return vint8(result); diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h index 55981e4c3..9f4f4aed1 100644 --- a/Source/astcenc_vecmathlib_none_4.h +++ b/Source/astcenc_vecmathlib_none_4.h @@ -1067,84 +1067,94 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a) return r; } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable4_16x8 { + const uint8_t* data; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { + const uint8_t* data; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { + const uint8_t* data; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ - t0p = t0; - t1p = t1; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. 
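+ *
+ * The scalar reference implementation only stores the data pointer; a lookup
+ * then reduces to indexing the raw byte array per lane, e.g.
+ *
+ *     result.lane<0>() == table.data[idx.lane<0>()]
+ *
+ * so no repacking of the table is needed here.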
*/ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + vtable4_64x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ - uint8_t table[16]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_16x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } - /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ - uint8_t table[32]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, t1.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_32x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ - uint8_t table[64]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, t1.m, 4 * sizeof(int)); - std::memcpy(table + 32, t2.m, 4 * sizeof(int)); - std::memcpy(table + 48, t3.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_64x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } /** diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 111d9b9e5..5e260a8da 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -1037,63 +1037,92 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) return vfloat4(_mm_castsi128_ps(v.m)); } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable4_16x8 { + vint4 t0; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { + vint4 t0; + vint4 t1; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { + vint4 t0; + vint4 t1; + vint4 t2; + vint4 t3; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. 
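+ *
+ * For SSE4.1 the second row is stored as (row1 ^ row0) so the pshufb-based
+ * lookup can XOR away the aliased row0 bytes; the pre-SSE4.1 path keeps the
+ * rows as loaded and the lookup falls back to byte-wise indexing.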
*/ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); + #if ASTCENC_SSE >= 41 - t0p = t0; - t1p = t0 ^ t1; -#else - t0p = t0; - t1p = t1; + table.t1 = table.t1 ^ table.t0; #endif } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ + vtable4_64x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); + table.t2 = vint4::load(data + 32); + table.t3 = vint4::load(data + 48); + #if ASTCENC_SSE >= 41 - t0p = t0; - t1p = t0 ^ t1; - t2p = t1 ^ t2; - t3p = t2 ^ t3; -#else - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + table.t3 = table.t3 ^ table.t2; + table.t2 = table.t2 ^ table.t1; + table.t1 = table.t1 ^ table.t0; #endif } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_16x8& tbl, + vint4 idx +) { #if ASTCENC_SSE >= 41 // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - __m128i result = _mm_shuffle_epi8(t0.m, idxx); + __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); return vint4(result); #else uint8_t table[16]; - std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); + std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], @@ -1103,26 +1132,28 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_32x8& tbl, + vint4 idx +) { #if ASTCENC_SSE >= 41 // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - __m128i result = _mm_shuffle_epi8(t0.m, idxx); + __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - __m128i result2 = _mm_shuffle_epi8(t1.m, idxx); + __m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx); result = _mm_xor_si128(result, result2); return vint4(result); #else uint8_t table[32]; - std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, &t1.m, 4 * sizeof(int)); + std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int)); + std::memcpy(table + 16, &tbl.t1.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], @@ -1132,36 +1163,38 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
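+ *
+ * On the pre-SSE4.1 path the four vint4 rows are copied into a 64-byte
+ * scratch array and each lane is fetched with a plain array index; the
+ * SSE4.1 path repeats the shuffle/XOR scheme once per 16-byte row.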
*/ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_64x8& tbl, + vint4 idx +) { #if ASTCENC_SSE >= 41 // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - __m128i result = _mm_shuffle_epi8(t0.m, idxx); + __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - __m128i result2 = _mm_shuffle_epi8(t1.m, idxx); + __m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx); result = _mm_xor_si128(result, result2); idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - result2 = _mm_shuffle_epi8(t2.m, idxx); + result2 = _mm_shuffle_epi8(tbl.t2.m, idxx); result = _mm_xor_si128(result, result2); idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - result2 = _mm_shuffle_epi8(t3.m, idxx); + result2 = _mm_shuffle_epi8(tbl.t3.m, idxx); result = _mm_xor_si128(result, result2); return vint4(result); #else uint8_t table[64]; - std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, &t1.m, 4 * sizeof(int)); - std::memcpy(table + 32, &t2.m, 4 * sizeof(int)); - std::memcpy(table + 48, &t3.m, 4 * sizeof(int)); + std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int)); + std::memcpy(table + 16, &tbl.t1.m, 4 * sizeof(int)); + std::memcpy(table + 32, &tbl.t2.m, 4 * sizeof(int)); + std::memcpy(table + 48, &tbl.t3.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], diff --git a/Test/astc_profile_valgrind.py b/Test/astc_profile_valgrind.py index eb9a82a45..3cd08445b 100644 --- a/Test/astc_profile_valgrind.py +++ b/Test/astc_profile_valgrind.py @@ -125,7 +125,7 @@ def run_pass(image, noStartup, encoder, blocksize, quality): if noStartup: args = ["gprof2dot", "--format=callgrind", "--output=out.dot", "callgrind.txt", - "-s", "-z", "compress_block(astcenc_contexti const&, image_block const&, physical_compressed_block&, compression_working_buffers&)"] + "-s", "-z", "compress_block(astcenc_contexti const&, image_block const&, unsigned char*, compression_working_buffers&)"] else: args = ["gprof2dot", "--format=callgrind", "--output=out.dot", "callgrind.txt", "-s", "-z", "main"] From 50dd7a43bf72cbc8de516ace8dcb352b79a40cb9 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Fri, 9 Aug 2024 23:25:39 +0100 Subject: [PATCH 2/8] New vtable API for Arm --- Source/astcenc_vecmathlib.h | 8 +++ Source/astcenc_vecmathlib_neon_4.h | 102 ++++++++++++++++++--------- Source/astcenc_vecmathlib_sve_8.h | 109 ++++++++++++++++------------- 3 files changed, 136 insertions(+), 83 deletions(-) diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h index 07b933206..b41f6fa3a 100644 --- a/Source/astcenc_vecmathlib.h +++ b/Source/astcenc_vecmathlib.h @@ -146,6 +146,10 @@ using vint = vint8; using vmask = vmask8; + using vtable_16x8 = vtable8_16x8; + using vtable_32x8 = vtable8_32x8; + using vtable_64x8 = vtable8_64x8; + constexpr auto loada = vfloat8::loada; constexpr auto load1 = vfloat8::load1; @@ -161,6 +165,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index e15d4f9ef..c5d43c14d 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -939,44 +939,74 @@ 
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) return vfloat4(vreinterpretq_f32_s32(v.m)); } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable4_16x8 { + vint4 t0; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { + vint4 t0; + vint4 t1; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { + vint4 t0; + vint4 t1; + vint4 t2; + vint4 t3; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); } - /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ - t0p = t0; - t1p = t1; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + vtable4_64x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); + table.t2 = vint4::load(data + 32); + table.t3 = vint4::load(data + 48); } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_16x8& tbl, + vint4 idx +) { int8x16_t table { - vreinterpretq_s8_s32(t0.m) + vreinterpretq_s8_s32(tbl.t0.m) }; // Set index byte above max index for unused bytes so table lookup returns zero @@ -987,13 +1017,15 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_32x8& tbl, + vint4 idx +) { int8x16x2_t table { - vreinterpretq_s8_s32(t0.m), - vreinterpretq_s8_s32(t1.m) + vreinterpretq_s8_s32(tbl.t0.m), + vreinterpretq_s8_s32(tbl.t1.m) }; // Set index byte above max index for unused bytes so table lookup returns zero @@ -1004,15 +1036,17 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
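+ *
+ * All four 16-byte rows are handed to a single vqtbl4q instruction, so the
+ * 64-entry lookup needs no cross-register blending; index bytes forced to
+ * 0xFF by the mask fall outside the table and read back as zero.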
*/ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_64x8& tbl, + vint4 idx +) { int8x16x4_t table { - vreinterpretq_s8_s32(t0.m), - vreinterpretq_s8_s32(t1.m), - vreinterpretq_s8_s32(t2.m), - vreinterpretq_s8_s32(t3.m) + vreinterpretq_s8_s32(tbl.t0.m), + vreinterpretq_s8_s32(tbl.t1.m), + vreinterpretq_s8_s32(tbl.t2.m), + vreinterpretq_s8_s32(tbl.t3.m) }; // Set index byte above max index for unused bytes so table lookup returns zero diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h index 169f28cb1..a1eeb18c3 100644 --- a/Source/astcenc_vecmathlib_sve_8.h +++ b/Source/astcenc_vecmathlib_sve_8.h @@ -906,90 +906,101 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) return vfloat8(svreinterpret_f32_s32(a.m)); } -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. +/* + * Table structure for a 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) -{ - t0p = vint8(svdup_neonq_f32(t0.m)); -} +struct vtable8_16x8 { + vint8 t0; +}; -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. +/* + * Table structure for a 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p) -{ - // 8-wide SVE uses a single table register, so t1 is unused - (void)t1p; +struct vtable8_32x8 { + vint8 t0; +}; - svfloat32_8_t t0v = svdup_neonq_f32(t0.m); - svfloat32_8_t t1v = svdup_neonq_f32(t1.m); +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable8_64x8 { + vint8 t0; + vint8 t1; +}; - t0p = vint8(svext_f32(t0v, t1v, 4)); +/** + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_16x8& table, + const uint8_t* data +) { + // Top half of register will be zeros + table.t0 = vint8(svld1_u8(svptrue_pat_b8(SV_VL16), data)); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) -{ - // 8-wide SVE uses a two table registers, so t2 and t3 are unused - (void)t2p; - (void)t3p; - - svfloat32_8_t t0v = svdup_neonq_f32(t0.m); - svfloat32_8_t t1v = svdup_neonq_f32(t1.m); - svfloat32_8_t t2v = svdup_neonq_f32(t2.m); - svfloat32_8_t t3v = svdup_neonq_f32(t3.m); + vtable8_32x8& table, + const uint8_t* data +) { + table.t0 = vint8(svld1_u8(svptrue_b8(), data)); +} - t0p = vint8(svext_f32(t0v, t1v, 4)); - t1p = vint8(svext_f32(t2v, t3v, 4)); +/** + * @brief Prepare a vtable lookup table 64x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_64x8& table, + const uint8_t* data +) { + table.t0 = vint8(svld1_u8(svptrue_b8(), data)); + table.t1 = vint8(svld1_u8(svptrue_b8(), data + 32)); } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. 
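+ *
+ * The whole 16-byte table sits in the low half of the 256-bit register (the
+ * prepare step zeroes the top half), and svtbl_u8 returns zero for any index
+ * byte outside the vector, which also covers the 0xFF-masked lanes.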
*/ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_16x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); - svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m); + svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m); svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes); return vint8(svreinterpret_s32_u8(result)); } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx) -{ - // 8-wide SVE uses a single table register, so t1 is unused - (void)t1; - +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_32x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); - svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m); + svuint8_8_t tbl_bytes = svreinterpret_u8_s32(table.t0.m); svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes); return vint8(svreinterpret_s32_u8(result)); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx) -{ - // 8-wide SVE uses a two table registers, so t2 and t3 are unused - (void)t2; - (void)t3; - +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_64x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t literal32 = svdup_s32(32); svbool_8_t idx_lo_select = svcmplt(svptrue_b32(), idx.m, literal32); @@ -999,8 +1010,8 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3 svuint8_8_t idx_lo_bytes = svreinterpret_u8_s32(idx_lo_masked); svuint8_8_t idx_hi_bytes = svreinterpret_u8_s32(idx_hi_masked); - svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(t0.m); - svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(t1.m); + svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(tbl.t0.m); + svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(tbl.t1.m); svint32_8_t t0_lookup = svreinterpret_s32_u8(svtbl_u8(tbl0_bytes, idx_lo_bytes)); svint32_8_t t1_lookup = svreinterpret_s32_u8(svtbl_u8(tbl1_bytes, idx_hi_bytes)); From cd251739f686ee378112b33f2138c36d4d86e6d1 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Fri, 9 Aug 2024 23:45:25 +0100 Subject: [PATCH 3/8] Fix self-review --- Source/astcenc_vecmathlib_avx2_8.h | 12 +++--- Source/astcenc_vecmathlib_neon_4.h | 55 ++++++++++--------------- Source/astcenc_vecmathlib_sse_4.h | 64 ++++++++++++++++-------------- 3 files changed, 63 insertions(+), 68 deletions(-) diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index c3ebbba29..3ed01a0b8 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ -1005,6 +1005,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( ) { // AVX2 tables duplicate table entries in each 128-bit half-register vint4 d0 = vint4::load(data); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); } @@ -1016,14 +1017,14 @@ ASTCENC_SIMD_INLINE void vtable_prepare( 
const uint8_t* data ) { // AVX2 tables duplicate table entries in each 128-bit half-register - // Direct lookup for first row vint4 d0 = vint4::load(data); - table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); - - // XOR with previous rows for subsequent rows vint4 d1 = vint4::load(data + 16); - d1 = d1 ^ d0; + + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); + + // XOR chain the high rows to allow table emulation + table.t1 = table.t1 ^ table.t0; } /** @@ -1044,6 +1045,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m)); table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m)); + // XOR chain the high rows to allow table emulation table.t3 = table.t3 ^ table.t2; table.t2 = table.t2 ^ table.t1; table.t1 = table.t1 ^ table.t0; diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index c5d43c14d..b12fff3ff 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -943,25 +943,21 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) * Table structure for a 16x 8-bit entry table. */ struct vtable4_16x8 { - vint4 t0; + int8x16_t t0; }; /* * Table structure for a 32x 8-bit entry table. */ struct vtable4_32x8 { - vint4 t0; - vint4 t1; + int8x16x2_t t01; }; /* * Table structure for a 64x 8-bit entry table. */ struct vtable4_64x8 { - vint4 t0; - vint4 t1; - vint4 t2; - vint4 t3; + int8x16x4_t t0123; }; /** @@ -971,7 +967,8 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_16x8& table, const uint8_t* data ) { - table.t0 = vint4::load(data); + vint4 t0 = vint4::load(data); + table.t0 = vreinterpretq_s8_s32(t0.m); } /** @@ -981,8 +978,11 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_32x8& table, const uint8_t* data ) { - table.t0 = vint4::load(data); - table.t1 = vint4::load(data + 16); + vint4 t0 = vint4::load(data); + vint4 t1 = vint4::load(data + 16); + + table.t01[0] = vreinterpretq_s8_s32(t0.m); + table.t01[1] = vreinterpretq_s8_s32(t1.m); } /** @@ -992,10 +992,15 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_64x8& table, const uint8_t* data ) { - table.t0 = vint4::load(data); - table.t1 = vint4::load(data + 16); - table.t2 = vint4::load(data + 32); - table.t3 = vint4::load(data + 48); + vint4 t0 = vint4::load(data); + vint4 t1 = vint4::load(data + 16); + vint4 t2 = vint4::load(data + 32); + vint4 t3 = vint4::load(data + 48); + + table.t0123[0] = vreinterpretq_s8_s32(t0.m); + table.t0123[1] = vreinterpretq_s8_s32(t1.m); + table.t0123[2] = vreinterpretq_s8_s32(t2.m); + table.t0123[3] = vreinterpretq_s8_s32(t3.m); } /** @@ -1005,15 +1010,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( const vtable4_16x8& tbl, vint4 idx ) { - int8x16_t table { - vreinterpretq_s8_s32(tbl.t0.m) - }; - // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(tbl.t0, idx_bytes))); } /** @@ -1023,16 +1024,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( const vtable4_32x8& tbl, vint4 idx ) { - int8x16x2_t table { - vreinterpretq_s8_s32(tbl.t0.m), - vreinterpretq_s8_s32(tbl.t1.m) - }; - // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = 
vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table.t01, idx_bytes))); } /** @@ -1042,18 +1038,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( const vtable4_64x8& tbl, vint4 idx ) { - int8x16x4_t table { - vreinterpretq_s8_s32(tbl.t0.m), - vreinterpretq_s8_s32(tbl.t1.m), - vreinterpretq_s8_s32(tbl.t2.m), - vreinterpretq_s8_s32(tbl.t3.m) - }; - // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table.t0123, idx_bytes))); } /** diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 5e260a8da..02c487820 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -1041,25 +1041,37 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) * Table structure for a 16x 8-bit entry table. */ struct vtable4_16x8 { +#if ASTCENC_SSE >= 41 vint4 t0; +#else + const uint8_t* data; +#endif }; /* * Table structure for a 32x 8-bit entry table. */ struct vtable4_32x8 { +#if ASTCENC_SSE >= 41 vint4 t0; vint4 t1; +#else + const uint8_t* data; +#endif }; /* * Table structure for a 64x 8-bit entry table. */ struct vtable4_64x8 { +#if ASTCENC_SSE >= 41 vint4 t0; vint4 t1; vint4 t2; vint4 t3; +#else + const uint8_t* data; +#endif }; /** @@ -1069,7 +1081,11 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_16x8& table, const uint8_t* data ) { +#if ASTCENC_SSE >= 41 table.t0 = vint4::load(data); +#else + table.data = data; +#endif } /** @@ -1079,11 +1095,13 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_32x8& table, const uint8_t* data ) { +#if ASTCENC_SSE >= 41 table.t0 = vint4::load(data); table.t1 = vint4::load(data + 16); -#if ASTCENC_SSE >= 41 table.t1 = table.t1 ^ table.t0; +#else + table.data = data; #endif } @@ -1094,15 +1112,17 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_64x8& table, const uint8_t* data ) { +#if ASTCENC_SSE >= 41 table.t0 = vint4::load(data); table.t1 = vint4::load(data + 16); table.t2 = vint4::load(data + 32); table.t3 = vint4::load(data + 48); -#if ASTCENC_SSE >= 41 table.t3 = table.t3 ^ table.t2; table.t2 = table.t2 ^ table.t1; table.t1 = table.t1 ^ table.t0; +#else + table.data = data; #endif } @@ -1120,14 +1140,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); return vint4(result); #else - uint8_t table[16]; - - std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); + return vint4(tbl.data[idx.lane<0>()], + tbl.data[idx.lane<1>()], + tbl.data[idx.lane<2>()], + tbl.data[idx.lane<3>()]); #endif } @@ -1150,15 +1166,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( return vint4(result); #else - uint8_t table[32]; - - std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, &tbl.t1.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); + return vint4(tbl.data[idx.lane<0>()], + tbl.data[idx.lane<1>()], + tbl.data[idx.lane<2>()], + tbl.data[idx.lane<3>()]); #endif } @@ -1189,17 +1200,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( return vint4(result); #else - uint8_t table[64]; - - std::memcpy(table + 0, 
&tbl.t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, &tbl.t1.m, 4 * sizeof(int)); - std::memcpy(table + 32, &tbl.t2.m, 4 * sizeof(int)); - std::memcpy(table + 48, &tbl.t3.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); + return vint4(tbl.data[idx.lane<0>()], + tbl.data[idx.lane<1>()], + tbl.data[idx.lane<2>()], + tbl.data[idx.lane<3>()]); #endif } From a252f3007329fa8e551270647621bb5ec64eb36a Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Fri, 9 Aug 2024 23:50:42 +0100 Subject: [PATCH 4/8] Make API explicitly 32-bit return --- Source/UnitTest/test_simd.cpp | 12 ++++++------ Source/astcenc_decompress_symbolic.cpp | 6 +++--- Source/astcenc_ideal_endpoints_and_weights.cpp | 8 ++++---- Source/astcenc_vecmathlib_avx2_8.h | 6 +++--- Source/astcenc_vecmathlib_neon_4.h | 6 +++--- Source/astcenc_vecmathlib_none_4.h | 6 +++--- Source/astcenc_vecmathlib_sse_4.h | 6 +++--- Source/astcenc_vecmathlib_sve_8.h | 6 +++--- 8 files changed, 28 insertions(+), 28 deletions(-) diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp index dcfa4e644..24fb63f53 100644 --- a/Source/UnitTest/test_simd.cpp +++ b/Source/UnitTest/test_simd.cpp @@ -1959,7 +1959,7 @@ TEST(vint4, vtable4_16x8) vint4 index(0, 7, 4, 15); - vint4 result = vtable_lookup(table, index); + vint4 result = vtable_lookup_32bit(table, index); EXPECT_EQ(result.lane<0>(), 0); EXPECT_EQ(result.lane<1>(), 7); @@ -1982,7 +1982,7 @@ TEST(vint4, vtable4_32x8) vint4 index(0, 7, 4, 31); - vint4 result = vtable_lookup(table, index); + vint4 result = vtable_lookup_32bit(table, index); EXPECT_EQ(result.lane<0>(), 0); EXPECT_EQ(result.lane<1>(), 7); @@ -2009,7 +2009,7 @@ TEST(vint4, vtable4_64x8) vint4 index(0, 7, 38, 63); - vint4 result = vtable_lookup(table, index); + vint4 result = vtable_lookup_32bit(table, index); uint8_t* hack = reinterpret_cast(&table); std::cout << "38: " << hack[38] << "\n"; @@ -3704,7 +3704,7 @@ TEST(vint8, vtable8_16x8) vint8 index = vint8_lit(0, 7, 4, 15, 1, 2, 14, 4); - vint8 result = vtable_lookup(table, index); + vint8 result = vtable_lookup_32bit(table, index); alignas(32) int ra[8]; store(result, ra); @@ -3734,7 +3734,7 @@ TEST(vint8, vtable8_32x8) vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31); - vint8 result = vtable_lookup(table, index); + vint8 result = vtable_lookup_32bit(table, index); alignas(32) int ra[8]; store(result, ra); @@ -3768,7 +3768,7 @@ TEST(vint8, vtable8_64x8) vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 38, 63); - vint8 result = vtable_lookup(table, index); + vint8 result = vtable_lookup_32bit(table, index); alignas(32) int ra[8]; store(result, ra); diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp index ef9165db8..e7791eef6 100644 --- a/Source/astcenc_decompress_symbolic.cpp +++ b/Source/astcenc_decompress_symbolic.cpp @@ -113,7 +113,7 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - summed_value += vtable_lookup(table, texel_weights) * texel_weights_int; + summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int; } store(lsr<4>(summed_value), weights_plane1 + i); @@ -144,8 +144,8 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - sum_plane1 += vtable_lookup(tab_plane1, texel_weights) * texel_weights_int; - sum_plane2 += vtable_lookup(tab_plane2, 
texel_weights) * texel_weights_int; + sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int; + sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int; } store(lsr<4>(sum_plane1), weights_plane1 + i); diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index 8a09c6f67..ec680dd5e 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -1037,8 +1037,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_lookup(table, weightl); - vint ixhi = vtable_lookup(table, weighth); + vint ixli = vtable_lookup_32bit(table, weightl); + vint ixhi = vtable_lookup_32bit(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); @@ -1068,8 +1068,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_lookup(table, weightl); - vint ixhi = vtable_lookup(table, weighth); + vint ixli = vtable_lookup_32bit(table, weightl); + vint ixhi = vtable_lookup_32bit(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index 3ed01a0b8..7c75818ae 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ -1054,7 +1054,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( /** * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_16x8& tbl, vint8 idx ) { @@ -1068,7 +1068,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup( /** * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_32x8& tbl, vint8 idx ) { @@ -1086,7 +1086,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup( /** * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_64x8& tbl, vint8 idx ) { diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index b12fff3ff..beea3425f 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -1006,7 +1006,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( /** * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_16x8& tbl, vint4 idx ) { @@ -1020,7 +1020,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_32x8& tbl, vint4 idx ) { @@ -1034,7 +1034,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
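+ *
+ * The _32bit suffix makes explicit that each 8-bit table entry is returned
+ * zero-extended into a 32-bit lane of the result vector.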
*/ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_64x8& tbl, vint4 idx ) { diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h index 9f4f4aed1..4646e84ad 100644 --- a/Source/astcenc_vecmathlib_none_4.h +++ b/Source/astcenc_vecmathlib_none_4.h @@ -1121,7 +1121,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( /** * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_16x8& table, vint4 idx ) { @@ -1134,7 +1134,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_32x8& table, vint4 idx ) { @@ -1147,7 +1147,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_64x8& table, vint4 idx ) { diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 02c487820..5c726b6ab 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -1129,7 +1129,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( /** * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_16x8& tbl, vint4 idx ) { @@ -1150,7 +1150,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_32x8& tbl, vint4 idx ) { @@ -1176,7 +1176,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_64x8& tbl, vint4 idx ) { diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h index a1eeb18c3..6cd6ed046 100644 --- a/Source/astcenc_vecmathlib_sve_8.h +++ b/Source/astcenc_vecmathlib_sve_8.h @@ -963,7 +963,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( /** * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_16x8& tbl, vint8 idx ) { @@ -980,7 +980,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup( /** * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_32x8& tbl, vint8 idx ) { @@ -997,7 +997,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup( /** * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
*/ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_64x8& tbl, vint8 idx ) { From 3aea47327a8c7f3da655b76f284749a2ec2518a8 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sat, 10 Aug 2024 09:10:00 +0100 Subject: [PATCH 5/8] Neon build fixes --- Source/astcenc_vecmathlib_neon_4.h | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index beea3425f..faaff52e2 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -967,8 +967,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_16x8& table, const uint8_t* data ) { - vint4 t0 = vint4::load(data); - table.t0 = vreinterpretq_s8_s32(t0.m); + table.t01 = vldq_u8(data); } /** @@ -978,11 +977,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_32x8& table, const uint8_t* data ) { - vint4 t0 = vint4::load(data); - vint4 t1 = vint4::load(data + 16); - - table.t01[0] = vreinterpretq_s8_s32(t0.m); - table.t01[1] = vreinterpretq_s8_s32(t1.m); + table.t01 = vld2q_u8(data); } /** @@ -992,15 +987,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_64x8& table, const uint8_t* data ) { - vint4 t0 = vint4::load(data); - vint4 t1 = vint4::load(data + 16); - vint4 t2 = vint4::load(data + 32); - vint4 t3 = vint4::load(data + 48); - - table.t0123[0] = vreinterpretq_s8_s32(t0.m); - table.t0123[1] = vreinterpretq_s8_s32(t1.m); - table.t0123[2] = vreinterpretq_s8_s32(t2.m); - table.t0123[3] = vreinterpretq_s8_s32(t3.m); + table.t0123 = vld4q_u8(data); } /** @@ -1028,7 +1015,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table.t01, idx_bytes))); + return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(tbl.t01, idx_bytes))); } /** @@ -1042,7 +1029,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table.t0123, idx_bytes))); + return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(tbl.t0123, idx_bytes))); } /** From 3520617a9b7e6a176006713831b64cc064bcaadf Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sat, 10 Aug 2024 09:13:36 +0100 Subject: [PATCH 6/8] Neon build fixes --- Source/astcenc_vecmathlib_neon_4.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index faaff52e2..9be70d4e6 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -943,21 +943,21 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) * Table structure for a 16x 8-bit entry table. */ struct vtable4_16x8 { - int8x16_t t0; + uint8x16_t t0; }; /* * Table structure for a 32x 8-bit entry table. */ struct vtable4_32x8 { - int8x16x2_t t01; + uint8x16x2_t t01; }; /* * Table structure for a 64x 8-bit entry table. 
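+ *
+ * The element type is unsigned so the members match the uint8x16_t values
+ * returned by the vld1q_u8 loads and consumed by the vqtbl*q_u8 lookups.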
*/ struct vtable4_64x8 { - int8x16x4_t t0123; + uint8x16x4_t t0123; }; /** @@ -967,7 +967,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_16x8& table, const uint8_t* data ) { - table.t01 = vldq_u8(data); + table.t0 = vld1q_u8(data); } /** @@ -1001,7 +1001,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(tbl.t0, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes))); } /** @@ -1015,7 +1015,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(tbl.t01, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes))); } /** @@ -1029,7 +1029,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(tbl.t0123, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes))); } /** From 1a8e6bff78d10183fc3a3305e7c1970dcfa65688 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sat, 10 Aug 2024 09:32:13 +0100 Subject: [PATCH 7/8] Fix Neon test failure --- Source/astcenc_vecmathlib_neon_4.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index 9be70d4e6..c7ff01289 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -977,7 +977,10 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_32x8& table, const uint8_t* data ) { - table.t01 = vld2q_u8(data); + table.t01 = uint8x16x2_t { + vld1q_u8(data), + vld1q_u8(data + 16) + }; } /** @@ -987,7 +990,12 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_64x8& table, const uint8_t* data ) { - table.t0123 = vld4q_u8(data); + table.t0123 = uint8x16x4_t { + vld1q_u8(data), + vld1q_u8(data + 16), + vld1q_u8(data + 32), + vld1q_u8(data + 48) + }; } /** From ed9836221898e6f406daf4e9512188edbe13968f Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sat, 10 Aug 2024 18:15:49 +0000 Subject: [PATCH 8/8] Fix SVE build issue --- Source/astcenc_vecmathlib_sve_8.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h index 6cd6ed046..9b48411b3 100644 --- a/Source/astcenc_vecmathlib_sve_8.h +++ b/Source/astcenc_vecmathlib_sve_8.h @@ -988,7 +988,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); - svuint8_8_t tbl_bytes = svreinterpret_u8_s32(table.t0.m); + svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m); svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes); return vint8(svreinterpret_s32_u8(result));