From 1e748f916a78277749af2a6d44e30f9a1fd9ee65 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Fri, 9 Aug 2024 23:02:39 +0100 Subject: [PATCH 1/8] New vtable API for x86 --- Source/UnitTest/test_simd.cpp | 183 ++++++++++++------ Source/astcenc_decompress_symbolic.cpp | 27 +-- .../astcenc_ideal_endpoints_and_weights.cpp | 19 +- Source/astcenc_vecmathlib.h | 12 ++ Source/astcenc_vecmathlib_avx2_8.h | 128 +++++++----- Source/astcenc_vecmathlib_none_4.h | 120 ++++++------ Source/astcenc_vecmathlib_sse_4.h | 129 +++++++----- Test/astc_profile_valgrind.py | 2 +- 8 files changed, 388 insertions(+), 232 deletions(-) diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp index f857c3550..dcfa4e644 100644 --- a/Source/UnitTest/test_simd.cpp +++ b/Source/UnitTest/test_simd.cpp @@ -1947,43 +1947,78 @@ TEST(vmask4, not) } /** @brief Test vint4 table permute. */ -TEST(vint4, vtable_8bt_32bi_32entry) +TEST(vint4, vtable4_16x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); + uint8_t data[16] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + }; - vint4 table0p, table1p; - vtable_prepare(table0, table1, table0p, table1p); + vtable4_16x8 table; + vtable_prepare(table, data); - vint4 index(0, 7, 4, 31); + vint4 index(0, 7, 4, 15); - vint4 result = vtable_8bt_32bi(table0p, table1p, index); + vint4 result = vtable_lookup(table, index); - EXPECT_EQ(result.lane<0>(), 3); - EXPECT_EQ(result.lane<1>(), 4); - EXPECT_EQ(result.lane<2>(), 7); - EXPECT_EQ(result.lane<3>(), 28); + EXPECT_EQ(result.lane<0>(), 0); + EXPECT_EQ(result.lane<1>(), 7); + EXPECT_EQ(result.lane<2>(), 4); + EXPECT_EQ(result.lane<3>(), 15); } /** @brief Test vint4 table permute. */ -TEST(vint4, vtable_8bt_32bi_64entry) +TEST(vint4, vtable4_32x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); - vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f); - vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f); + uint8_t data[32] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + }; + + vtable4_32x8 table; + vtable_prepare(table, data); + + vint4 index(0, 7, 4, 31); + + vint4 result = vtable_lookup(table, index); - vint4 table0p, table1p, table2p, table3p; - vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p); + EXPECT_EQ(result.lane<0>(), 0); + EXPECT_EQ(result.lane<1>(), 7); + EXPECT_EQ(result.lane<2>(), 4); + EXPECT_EQ(result.lane<3>(), 31); +} + +/** @brief Test vint4 table permute. 
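+ *
+ * A minimal sketch of the new 64-entry API exercised below (names as
+ * introduced by this patch); the data array is an identity table, so each
+ * lookup is expected to return its own index:
+ *
+ *     uint8_t data[64] = { ... };        // bytes 0x00 .. 0x3f
+ *     vtable4_64x8 table;
+ *     vtable_prepare(table, data);       // pack the raw bytes for lookups
+ *     vint4 result = vtable_lookup(table, vint4(0, 7, 38, 63));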
*/ +TEST(vint4, vtable4_64x8) +{ + uint8_t data[64] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f + }; + + vtable4_64x8 table; + vtable_prepare(table, data); vint4 index(0, 7, 38, 63); - vint4 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index); + vint4 result = vtable_lookup(table, index); + + uint8_t* hack = reinterpret_cast(&table); + std::cout << "38: " << hack[38] << "\n"; + std::cout << "63: " << hack[63] << "\n"; - EXPECT_EQ(result.lane<0>(), 3); - EXPECT_EQ(result.lane<1>(), 4); - EXPECT_EQ(result.lane<2>(), 37); - EXPECT_EQ(result.lane<3>(), 60); + EXPECT_EQ(result.lane<0>(), 0); + EXPECT_EQ(result.lane<1>(), 7); + EXPECT_EQ(result.lane<2>(), 38); + EXPECT_EQ(result.lane<3>(), 63); } /** @brief Test vint4 rgba byte interleave. */ @@ -3657,57 +3692,95 @@ TEST(vmask8, not) } /** @brief Test vint8 table permute. */ -TEST(vint8, vtable_8bt_32bi_32entry) +TEST(vint8, vtable8_16x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); + uint8_t data[16] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + }; - vint8 table0p, table1p; - vtable_prepare(table0, table1, table0p, table1p); + vtable8_16x8 table; + vtable_prepare(table, data); - vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31); + vint8 index = vint8_lit(0, 7, 4, 15, 1, 2, 14, 4); - vint8 result = vtable_8bt_32bi(table0p, table1p, index); + vint8 result = vtable_lookup(table, index); alignas(32) int ra[8]; store(result, ra); - EXPECT_EQ(ra[0], 3); - EXPECT_EQ(ra[1], 4); - EXPECT_EQ(ra[2], 7); - EXPECT_EQ(ra[3], 12); - EXPECT_EQ(ra[4], 19); - EXPECT_EQ(ra[5], 23); - EXPECT_EQ(ra[6], 20); - EXPECT_EQ(ra[7], 28); + EXPECT_EQ(ra[0], 0); + EXPECT_EQ(ra[1], 7); + EXPECT_EQ(ra[2], 4); + EXPECT_EQ(ra[3], 15); + EXPECT_EQ(ra[4], 1); + EXPECT_EQ(ra[5], 2); + EXPECT_EQ(ra[6], 14); + EXPECT_EQ(ra[7], 4); } -/** @brief Test vint4 table permute. */ -TEST(vint8, vtable_8bt_32bi_64entry) +/** @brief Test vint8 table permute. */ +TEST(vint8, vtable8_32x8) { - vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f); - vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f); - vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f); + uint8_t data[32] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + }; + + vtable8_32x8 table; + vtable_prepare(table, data); + + vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31); - vint8 table0p, table1p, table2p, table3p; - vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p); + vint8 result = vtable_lookup(table, index); + + alignas(32) int ra[8]; + store(result, ra); + + EXPECT_EQ(ra[0], 0); + EXPECT_EQ(ra[1], 7); + EXPECT_EQ(ra[2], 4); + EXPECT_EQ(ra[3], 15); + EXPECT_EQ(ra[4], 16); + EXPECT_EQ(ra[5], 20); + EXPECT_EQ(ra[6], 23); + EXPECT_EQ(ra[7], 31); +} + +/** @brief Test vint8 table permute. 
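+ *
+ * The 8-wide variant follows the same pattern, assuming the vint8 type
+ * provided by the AVX2 and SVE backends:
+ *
+ *     vtable8_64x8 table;
+ *     vtable_prepare(table, data);       // data points at a 64-byte table like the one below
+ *     vint8 result = vtable_lookup(table, vint8_lit(0, 7, 4, 15, 16, 20, 38, 63));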
*/ +TEST(vint8, vtable8_64x8) +{ + uint8_t data[64] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f + }; + + vtable8_64x8 table; + vtable_prepare(table, data); vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 38, 63); - vint8 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index); + vint8 result = vtable_lookup(table, index); alignas(32) int ra[8]; store(result, ra); - EXPECT_EQ(ra[0], 3); - EXPECT_EQ(ra[1], 4); - EXPECT_EQ(ra[2], 7); - EXPECT_EQ(ra[3], 12); - EXPECT_EQ(ra[4], 19); - EXPECT_EQ(ra[5], 23); - EXPECT_EQ(ra[6], 37); - EXPECT_EQ(ra[7], 60); + EXPECT_EQ(ra[0], 0); + EXPECT_EQ(ra[1], 7); + EXPECT_EQ(ra[2], 4); + EXPECT_EQ(ra[3], 15); + EXPECT_EQ(ra[4], 16); + EXPECT_EQ(ra[5], 20); + EXPECT_EQ(ra[6], 38); + EXPECT_EQ(ra[7], 63); } #endif diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp index 902a3f3e9..ef9165db8 100644 --- a/Source/astcenc_decompress_symbolic.cpp +++ b/Source/astcenc_decompress_symbolic.cpp @@ -98,13 +98,8 @@ void unpack_weights( if (!is_dual_plane) { // Build full 64-entry weight lookup table - vint4 tab0 = vint4::load(scb.weights + 0); - vint4 tab1 = vint4::load(scb.weights + 16); - vint4 tab2 = vint4::load(scb.weights + 32); - vint4 tab3 = vint4::load(scb.weights + 48); - - vint tab0p, tab1p, tab2p, tab3p; - vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p); + vtable_64x8 table; + vtable_prepare(table, scb.weights); for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) { @@ -118,7 +113,7 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int; + summed_value += vtable_lookup(table, texel_weights) * texel_weights_int; } store(lsr<4>(summed_value), weights_plane1 + i); @@ -128,16 +123,12 @@ void unpack_weights( { // Build a 32-entry weight lookup table per plane // Plane 1 - vint4 tab0_plane1 = vint4::load(scb.weights + 0); - vint4 tab1_plane1 = vint4::load(scb.weights + 16); - vint tab0_plane1p, tab1_plane1p; - vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p); + vtable_32x8 tab_plane1; + vtable_prepare(tab_plane1, scb.weights); // Plane 2 - vint4 tab0_plane2 = vint4::load(scb.weights + 32); - vint4 tab1_plane2 = vint4::load(scb.weights + 48); - vint tab0_plane2p, tab1_plane2p; - vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p); + vtable_32x8 tab_plane2; + vtable_prepare(tab_plane2, scb.weights + 32); for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) { @@ -153,8 +144,8 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int; - sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int; + sum_plane1 += vtable_lookup(tab_plane1, texel_weights) * texel_weights_int; + sum_plane2 += vtable_lookup(tab_plane2, texel_weights) * texel_weights_int; } store(lsr<4>(sum_plane1), weights_plane1 + i); diff 
--git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index 3442464d5..8a09c6f67 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -1023,9 +1023,8 @@ void compute_quantized_weights_for_decimation( // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements if (get_quant_level(quant_level) <= 16) { - vint4 tab0 = vint4::load(qat.quant_to_unquant); - vint tab0p; - vtable_prepare(tab0, tab0p); + vtable_16x8 table; + vtable_prepare(table, qat.quant_to_unquant); for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { @@ -1038,8 +1037,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_8bt_32bi(tab0p, weightl); - vint ixhi = vtable_8bt_32bi(tab0p, weighth); + vint ixli = vtable_lookup(table, weightl); + vint ixhi = vtable_lookup(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); @@ -1055,10 +1054,8 @@ void compute_quantized_weights_for_decimation( } else { - vint4 tab0 = vint4::load(qat.quant_to_unquant + 0); - vint4 tab1 = vint4::load(qat.quant_to_unquant + 16); - vint tab0p, tab1p; - vtable_prepare(tab0, tab1, tab0p, tab1p); + vtable_32x8 table; + vtable_prepare(table, qat.quant_to_unquant); for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { @@ -1071,8 +1068,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); - vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); + vint ixli = vtable_lookup(table, weightl); + vint ixhi = vtable_lookup(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h index 628755619..07b933206 100644 --- a/Source/astcenc_vecmathlib.h +++ b/Source/astcenc_vecmathlib.h @@ -96,6 +96,10 @@ using vint = vint8; using vmask = vmask8; + using vtable_16x8 = vtable8_16x8; + using vtable_32x8 = vtable8_32x8; + using vtable_64x8 = vtable8_64x8; + constexpr auto loada = vfloat8::loada; constexpr auto load1 = vfloat8::load1; @@ -111,6 +115,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; @@ -185,6 +193,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; #endif diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index 347abf83c..c3ebbba29 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ -971,98 +971,138 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) return vfloat8(_mm256_castsi256_ps(a.m)); } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable8_16x8 { + vint8 t0; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable8_32x8 { + vint8 t0; + vint8 t1; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable8_64x8 { + vint8 t0; + vint8 t1; + vint8 t2; + vint8 t3; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. 
+ * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_16x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_32x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + // Direct lookup for first row + vint4 d0 = vint4::load(data); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); - __m128i t1n = _mm_xor_si128(t0.m, t1.m); - t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); + // XOR with previous rows for subsequent rows + vint4 d1 = vint4::load(data + 16); + d1 = d1 ^ d0; + table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) -{ - // AVX2 duplicates the table within each 128-bit lane - __m128i t0n = t0.m; - t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); + vtable8_64x8& table, + const uint8_t* data +) { + // AVX2 tables duplicate table entries in each 128-bit half-register + vint4 d0 = vint4::load(data); + vint4 d1 = vint4::load(data + 16); + vint4 d2 = vint4::load(data + 32); + vint4 d3 = vint4::load(data + 48); - __m128i t1n = _mm_xor_si128(t0.m, t1.m); - t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); + table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); + table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m)); + table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m)); - __m128i t2n = _mm_xor_si128(t1.m, t2.m); - t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n)); - - __m128i t3n = _mm_xor_si128(t2.m, t3.m); - t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n)); + table.t3 = table.t3 ^ table.t2; + table.t2 = table.t2 ^ table.t1; + table.t1 = table.t1 ^ table.t0; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_16x8& tbl, + vint8 idx +) { // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast(0xFFFFFF00))); - __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx); return vint8(result); } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. 
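+ *
+ * Note that _mm256_shuffle_epi8 only honours the low four bits of each index
+ * byte unless the byte MSB is set, so indices 16..31 alias back into t0. The
+ * prepare step stores t1 as (row1 ^ row0), which lets the two shuffle results
+ * be XOR-ed together: the aliased t0 byte cancels and the row1 byte remains.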
*/ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_32x8& tbl, + vint8 idx +) { // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast(0xFFFFFF00))); - __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - __m256i result2 = _mm256_shuffle_epi8(t1.m, idxx); + __m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx); result = _mm256_xor_si256(result, result2); return vint8(result); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_64x8& tbl, + vint8 idx +) { // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast(0xFFFFFF00))); - __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + __m256i result = _mm256_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - __m256i result2 = _mm256_shuffle_epi8(t1.m, idxx); + __m256i result2 = _mm256_shuffle_epi8(tbl.t1.m, idxx); result = _mm256_xor_si256(result, result2); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - result2 = _mm256_shuffle_epi8(t2.m, idxx); + result2 = _mm256_shuffle_epi8(tbl.t2.m, idxx); result = _mm256_xor_si256(result, result2); idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); - result2 = _mm256_shuffle_epi8(t3.m, idxx); + result2 = _mm256_shuffle_epi8(tbl.t3.m, idxx); result = _mm256_xor_si256(result, result2); return vint8(result); diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h index 55981e4c3..9f4f4aed1 100644 --- a/Source/astcenc_vecmathlib_none_4.h +++ b/Source/astcenc_vecmathlib_none_4.h @@ -1067,84 +1067,94 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a) return r; } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable4_16x8 { + const uint8_t* data; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { + const uint8_t* data; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { + const uint8_t* data; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ - t0p = t0; - t1p = t1; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. 
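+ *
+ * The scalar reference implementation only stores the data pointer; a lookup
+ * then reduces to indexing the raw byte array per lane, e.g.
+ *
+ *     result.lane<0>() == table.data[idx.lane<0>()]
+ *
+ * so no repacking of the table is needed here.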
*/ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + vtable4_64x8& table, + const uint8_t* data +) { + table.data = data; } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ - uint8_t table[16]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_16x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } - /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ - uint8_t table[32]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, t1.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_32x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ - uint8_t table[64]; - - std::memcpy(table + 0, t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, t1.m, 4 * sizeof(int)); - std::memcpy(table + 32, t2.m, 4 * sizeof(int)); - std::memcpy(table + 48, t3.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_64x8& table, + vint4 idx +) { + return vint4(table.data[idx.lane<0>()], + table.data[idx.lane<1>()], + table.data[idx.lane<2>()], + table.data[idx.lane<3>()]); } /** diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 111d9b9e5..5e260a8da 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -1037,63 +1037,92 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) return vfloat4(_mm_castsi128_ps(v.m)); } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable4_16x8 { + vint4 t0; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { + vint4 t0; + vint4 t1; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { + vint4 t0; + vint4 t1; + vint4 t2; + vint4 t3; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. 
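+ *
+ * For SSE4.1 the second row is stored as (row1 ^ row0) so the pshufb-based
+ * lookup can XOR away the aliased row0 bytes; the pre-SSE4.1 path keeps the
+ * rows as loaded and the lookup falls back to byte-wise indexing.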
*/ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); + #if ASTCENC_SSE >= 41 - t0p = t0; - t1p = t0 ^ t1; -#else - t0p = t0; - t1p = t1; + table.t1 = table.t1 ^ table.t0; #endif } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ + vtable4_64x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); + table.t2 = vint4::load(data + 32); + table.t3 = vint4::load(data + 48); + #if ASTCENC_SSE >= 41 - t0p = t0; - t1p = t0 ^ t1; - t2p = t1 ^ t2; - t3p = t2 ^ t3; -#else - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + table.t3 = table.t3 ^ table.t2; + table.t2 = table.t2 ^ table.t1; + table.t1 = table.t1 ^ table.t0; #endif } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_16x8& tbl, + vint4 idx +) { #if ASTCENC_SSE >= 41 // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - __m128i result = _mm_shuffle_epi8(t0.m, idxx); + __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); return vint4(result); #else uint8_t table[16]; - std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); + std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], @@ -1103,26 +1132,28 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_32x8& tbl, + vint4 idx +) { #if ASTCENC_SSE >= 41 // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - __m128i result = _mm_shuffle_epi8(t0.m, idxx); + __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - __m128i result2 = _mm_shuffle_epi8(t1.m, idxx); + __m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx); result = _mm_xor_si128(result, result2); return vint4(result); #else uint8_t table[32]; - std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, &t1.m, 4 * sizeof(int)); + std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int)); + std::memcpy(table + 16, &tbl.t1.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], @@ -1132,36 +1163,38 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
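+ *
+ * On the pre-SSE4.1 path the four vint4 rows are copied into a 64-byte
+ * scratch array and each lane is fetched with a plain array index; the
+ * SSE4.1 path repeats the shuffle/XOR scheme once per 16-byte row.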
*/ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_64x8& tbl, + vint4 idx +) { #if ASTCENC_SSE >= 41 // Set index byte MSB to 1 for unused bytes so shuffle returns zero __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast(0xFFFFFF00))); - __m128i result = _mm_shuffle_epi8(t0.m, idxx); + __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - __m128i result2 = _mm_shuffle_epi8(t1.m, idxx); + __m128i result2 = _mm_shuffle_epi8(tbl.t1.m, idxx); result = _mm_xor_si128(result, result2); idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - result2 = _mm_shuffle_epi8(t2.m, idxx); + result2 = _mm_shuffle_epi8(tbl.t2.m, idxx); result = _mm_xor_si128(result, result2); idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16)); - result2 = _mm_shuffle_epi8(t3.m, idxx); + result2 = _mm_shuffle_epi8(tbl.t3.m, idxx); result = _mm_xor_si128(result, result2); return vint4(result); #else uint8_t table[64]; - std::memcpy(table + 0, &t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, &t1.m, 4 * sizeof(int)); - std::memcpy(table + 32, &t2.m, 4 * sizeof(int)); - std::memcpy(table + 48, &t3.m, 4 * sizeof(int)); + std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int)); + std::memcpy(table + 16, &tbl.t1.m, 4 * sizeof(int)); + std::memcpy(table + 32, &tbl.t2.m, 4 * sizeof(int)); + std::memcpy(table + 48, &tbl.t3.m, 4 * sizeof(int)); return vint4(table[idx.lane<0>()], table[idx.lane<1>()], diff --git a/Test/astc_profile_valgrind.py b/Test/astc_profile_valgrind.py index eb9a82a45..3cd08445b 100644 --- a/Test/astc_profile_valgrind.py +++ b/Test/astc_profile_valgrind.py @@ -125,7 +125,7 @@ def run_pass(image, noStartup, encoder, blocksize, quality): if noStartup: args = ["gprof2dot", "--format=callgrind", "--output=out.dot", "callgrind.txt", - "-s", "-z", "compress_block(astcenc_contexti const&, image_block const&, physical_compressed_block&, compression_working_buffers&)"] + "-s", "-z", "compress_block(astcenc_contexti const&, image_block const&, unsigned char*, compression_working_buffers&)"] else: args = ["gprof2dot", "--format=callgrind", "--output=out.dot", "callgrind.txt", "-s", "-z", "main"] From 50dd7a43bf72cbc8de516ace8dcb352b79a40cb9 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Fri, 9 Aug 2024 23:25:39 +0100 Subject: [PATCH 2/8] New vtable API for Arm --- Source/astcenc_vecmathlib.h | 8 +++ Source/astcenc_vecmathlib_neon_4.h | 102 ++++++++++++++++++--------- Source/astcenc_vecmathlib_sve_8.h | 109 ++++++++++++++++------------- 3 files changed, 136 insertions(+), 83 deletions(-) diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h index 07b933206..b41f6fa3a 100644 --- a/Source/astcenc_vecmathlib.h +++ b/Source/astcenc_vecmathlib.h @@ -146,6 +146,10 @@ using vint = vint8; using vmask = vmask8; + using vtable_16x8 = vtable8_16x8; + using vtable_32x8 = vtable8_32x8; + using vtable_64x8 = vtable8_64x8; + constexpr auto loada = vfloat8::loada; constexpr auto load1 = vfloat8::load1; @@ -161,6 +165,10 @@ using vint = vint4; using vmask = vmask4; + using vtable_16x8 = vtable4_16x8; + using vtable_32x8 = vtable4_32x8; + using vtable_64x8 = vtable4_64x8; + constexpr auto loada = vfloat4::loada; constexpr auto load1 = vfloat4::load1; diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index e15d4f9ef..c5d43c14d 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -939,44 +939,74 @@ 
ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) return vfloat4(vreinterpretq_f32_s32(v.m)); } +/* + * Table structure for a 16x 8-bit entry table. + */ +struct vtable4_16x8 { + vint4 t0; +}; + +/* + * Table structure for a 32x 8-bit entry table. + */ +struct vtable4_32x8 { + vint4 t0; + vint4 t1; +}; + +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable4_64x8 { + vint4 t0; + vint4 t1; + vint4 t2; + vint4 t3; +}; + /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) -{ - t0p = t0; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_16x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); } - /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) -{ - t0p = t0; - t1p = t1; +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable4_32x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table 64x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) -{ - t0p = t0; - t1p = t1; - t2p = t2; - t3p = t3; + vtable4_64x8& table, + const uint8_t* data +) { + table.t0 = vint4::load(data); + table.t1 = vint4::load(data + 16); + table.t2 = vint4::load(data + 32); + table.t3 = vint4::load(data + 48); } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_16x8& tbl, + vint4 idx +) { int8x16_t table { - vreinterpretq_s8_s32(t0.m) + vreinterpretq_s8_s32(tbl.t0.m) }; // Set index byte above max index for unused bytes so table lookup returns zero @@ -987,13 +1017,15 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_32x8& tbl, + vint4 idx +) { int8x16x2_t table { - vreinterpretq_s8_s32(t0.m), - vreinterpretq_s8_s32(t1.m) + vreinterpretq_s8_s32(tbl.t0.m), + vreinterpretq_s8_s32(tbl.t1.m) }; // Set index byte above max index for unused bytes so table lookup returns zero @@ -1004,15 +1036,17 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
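+ *
+ * All four 16-byte rows are handed to a single vqtbl4q instruction, so the
+ * 64-entry lookup needs no cross-register blending; index bytes forced to
+ * 0xFF by the mask fall outside the table and read back as zero.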
*/ -ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) -{ +ASTCENC_SIMD_INLINE vint4 vtable_lookup( + const vtable4_64x8& tbl, + vint4 idx +) { int8x16x4_t table { - vreinterpretq_s8_s32(t0.m), - vreinterpretq_s8_s32(t1.m), - vreinterpretq_s8_s32(t2.m), - vreinterpretq_s8_s32(t3.m) + vreinterpretq_s8_s32(tbl.t0.m), + vreinterpretq_s8_s32(tbl.t1.m), + vreinterpretq_s8_s32(tbl.t2.m), + vreinterpretq_s8_s32(tbl.t3.m) }; // Set index byte above max index for unused bytes so table lookup returns zero diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h index 169f28cb1..a1eeb18c3 100644 --- a/Source/astcenc_vecmathlib_sve_8.h +++ b/Source/astcenc_vecmathlib_sve_8.h @@ -906,90 +906,101 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) return vfloat8(svreinterpret_f32_s32(a.m)); } -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. +/* + * Table structure for a 16x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) -{ - t0p = vint8(svdup_neonq_f32(t0.m)); -} +struct vtable8_16x8 { + vint8 t0; +}; -/** - * @brief Prepare a vtable lookup table for use with the native SIMD size. +/* + * Table structure for a 32x 8-bit entry table. */ -ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p) -{ - // 8-wide SVE uses a single table register, so t1 is unused - (void)t1p; +struct vtable8_32x8 { + vint8 t0; +}; - svfloat32_8_t t0v = svdup_neonq_f32(t0.m); - svfloat32_8_t t1v = svdup_neonq_f32(t1.m); +/* + * Table structure for a 64x 8-bit entry table. + */ +struct vtable8_64x8 { + vint8 t0; + vint8 t1; +}; - t0p = vint8(svext_f32(t0v, t1v, 4)); +/** + * @brief Prepare a vtable lookup table for 16x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_16x8& table, + const uint8_t* data +) { + // Top half of register will be zeros + table.t0 = vint8(svld1_u8(svptrue_pat_b8(SV_VL16), data)); } /** - * @brief Prepare a vtable lookup table for use with the native SIMD size. + * @brief Prepare a vtable lookup table for 32x 8-bit entry table. */ ASTCENC_SIMD_INLINE void vtable_prepare( - vint4 t0, vint4 t1, vint4 t2, vint4 t3, - vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) -{ - // 8-wide SVE uses a two table registers, so t2 and t3 are unused - (void)t2p; - (void)t3p; - - svfloat32_8_t t0v = svdup_neonq_f32(t0.m); - svfloat32_8_t t1v = svdup_neonq_f32(t1.m); - svfloat32_8_t t2v = svdup_neonq_f32(t2.m); - svfloat32_8_t t3v = svdup_neonq_f32(t3.m); + vtable8_32x8& table, + const uint8_t* data +) { + table.t0 = vint8(svld1_u8(svptrue_b8(), data)); +} - t0p = vint8(svext_f32(t0v, t1v, 4)); - t1p = vint8(svext_f32(t2v, t3v, 4)); +/** + * @brief Prepare a vtable lookup table 64x 8-bit entry table. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vtable8_64x8& table, + const uint8_t* data +) { + table.t0 = vint8(svld1_u8(svptrue_b8(), data)); + table.t1 = vint8(svld1_u8(svptrue_b8(), data + 32)); } /** - * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. 
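+ *
+ * The whole 16-byte table sits in the low half of the 256-bit register (the
+ * prepare step zeroes the top half), and svtbl_u8 returns zero for any index
+ * byte outside the vector, which also covers the 0xFF-masked lanes.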
*/ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx) -{ +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_16x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); - svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m); + svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m); svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes); return vint8(svreinterpret_s32_u8(result)); } /** - * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx) -{ - // 8-wide SVE uses a single table register, so t1 is unused - (void)t1; - +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_32x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); - svuint8_8_t tbl_bytes = svreinterpret_u8_s32(t0.m); + svuint8_8_t tbl_bytes = svreinterpret_u8_s32(table.t0.m); svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes); return vint8(svreinterpret_s32_u8(result)); } /** - * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx) -{ - // 8-wide SVE uses a two table registers, so t2 and t3 are unused - (void)t2; - (void)t3; - +ASTCENC_SIMD_INLINE vint8 vtable_lookup( + const vtable8_64x8& tbl, + vint8 idx +) { // Set index byte above max index for unused bytes so table lookup returns zero svint32_8_t literal32 = svdup_s32(32); svbool_8_t idx_lo_select = svcmplt(svptrue_b32(), idx.m, literal32); @@ -999,8 +1010,8 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3 svuint8_8_t idx_lo_bytes = svreinterpret_u8_s32(idx_lo_masked); svuint8_8_t idx_hi_bytes = svreinterpret_u8_s32(idx_hi_masked); - svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(t0.m); - svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(t1.m); + svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(tbl.t0.m); + svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(tbl.t1.m); svint32_8_t t0_lookup = svreinterpret_s32_u8(svtbl_u8(tbl0_bytes, idx_lo_bytes)); svint32_8_t t1_lookup = svreinterpret_s32_u8(svtbl_u8(tbl1_bytes, idx_hi_bytes)); From cd251739f686ee378112b33f2138c36d4d86e6d1 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Fri, 9 Aug 2024 23:45:25 +0100 Subject: [PATCH 3/8] Fix self-review --- Source/astcenc_vecmathlib_avx2_8.h | 12 +++--- Source/astcenc_vecmathlib_neon_4.h | 55 ++++++++++--------------- Source/astcenc_vecmathlib_sse_4.h | 64 ++++++++++++++++-------------- 3 files changed, 63 insertions(+), 68 deletions(-) diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index c3ebbba29..3ed01a0b8 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ -1005,6 +1005,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( ) { // AVX2 tables duplicate table entries in each 128-bit half-register vint4 d0 = vint4::load(data); + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); } @@ -1016,14 +1017,14 @@ ASTCENC_SIMD_INLINE void vtable_prepare( 
const uint8_t* data ) { // AVX2 tables duplicate table entries in each 128-bit half-register - // Direct lookup for first row vint4 d0 = vint4::load(data); - table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); - - // XOR with previous rows for subsequent rows vint4 d1 = vint4::load(data + 16); - d1 = d1 ^ d0; + + table.t0 = vint8(astcenc_mm256_set_m128i(d0.m, d0.m)); table.t1 = vint8(astcenc_mm256_set_m128i(d1.m, d1.m)); + + // XOR chain the high rows to allow table emulation + table.t1 = table.t1 ^ table.t0; } /** @@ -1044,6 +1045,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( table.t2 = vint8(astcenc_mm256_set_m128i(d2.m, d2.m)); table.t3 = vint8(astcenc_mm256_set_m128i(d3.m, d3.m)); + // XOR chain the high rows to allow table emulation table.t3 = table.t3 ^ table.t2; table.t2 = table.t2 ^ table.t1; table.t1 = table.t1 ^ table.t0; diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index c5d43c14d..b12fff3ff 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -943,25 +943,21 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) * Table structure for a 16x 8-bit entry table. */ struct vtable4_16x8 { - vint4 t0; + int8x16_t t0; }; /* * Table structure for a 32x 8-bit entry table. */ struct vtable4_32x8 { - vint4 t0; - vint4 t1; + int8x16x2_t t01; }; /* * Table structure for a 64x 8-bit entry table. */ struct vtable4_64x8 { - vint4 t0; - vint4 t1; - vint4 t2; - vint4 t3; + int8x16x4_t t0123; }; /** @@ -971,7 +967,8 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_16x8& table, const uint8_t* data ) { - table.t0 = vint4::load(data); + vint4 t0 = vint4::load(data); + table.t0 = vreinterpretq_s8_s32(t0.m); } /** @@ -981,8 +978,11 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_32x8& table, const uint8_t* data ) { - table.t0 = vint4::load(data); - table.t1 = vint4::load(data + 16); + vint4 t0 = vint4::load(data); + vint4 t1 = vint4::load(data + 16); + + table.t01[0] = vreinterpretq_s8_s32(t0.m); + table.t01[1] = vreinterpretq_s8_s32(t1.m); } /** @@ -992,10 +992,15 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_64x8& table, const uint8_t* data ) { - table.t0 = vint4::load(data); - table.t1 = vint4::load(data + 16); - table.t2 = vint4::load(data + 32); - table.t3 = vint4::load(data + 48); + vint4 t0 = vint4::load(data); + vint4 t1 = vint4::load(data + 16); + vint4 t2 = vint4::load(data + 32); + vint4 t3 = vint4::load(data + 48); + + table.t0123[0] = vreinterpretq_s8_s32(t0.m); + table.t0123[1] = vreinterpretq_s8_s32(t1.m); + table.t0123[2] = vreinterpretq_s8_s32(t2.m); + table.t0123[3] = vreinterpretq_s8_s32(t3.m); } /** @@ -1005,15 +1010,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( const vtable4_16x8& tbl, vint4 idx ) { - int8x16_t table { - vreinterpretq_s8_s32(tbl.t0.m) - }; - // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(tbl.t0, idx_bytes))); } /** @@ -1023,16 +1024,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( const vtable4_32x8& tbl, vint4 idx ) { - int8x16x2_t table { - vreinterpretq_s8_s32(tbl.t0.m), - vreinterpretq_s8_s32(tbl.t1.m) - }; - // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = 
vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table.t01, idx_bytes))); } /** @@ -1042,18 +1038,11 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( const vtable4_64x8& tbl, vint4 idx ) { - int8x16x4_t table { - vreinterpretq_s8_s32(tbl.t0.m), - vreinterpretq_s8_s32(tbl.t1.m), - vreinterpretq_s8_s32(tbl.t2.m), - vreinterpretq_s8_s32(tbl.t3.m) - }; - // Set index byte above max index for unused bytes so table lookup returns zero int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes))); + return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table.t0123, idx_bytes))); } /** diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 5e260a8da..02c487820 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -1041,25 +1041,37 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) * Table structure for a 16x 8-bit entry table. */ struct vtable4_16x8 { +#if ASTCENC_SSE >= 41 vint4 t0; +#else + const uint8_t* data; +#endif }; /* * Table structure for a 32x 8-bit entry table. */ struct vtable4_32x8 { +#if ASTCENC_SSE >= 41 vint4 t0; vint4 t1; +#else + const uint8_t* data; +#endif }; /* * Table structure for a 64x 8-bit entry table. */ struct vtable4_64x8 { +#if ASTCENC_SSE >= 41 vint4 t0; vint4 t1; vint4 t2; vint4 t3; +#else + const uint8_t* data; +#endif }; /** @@ -1069,7 +1081,11 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_16x8& table, const uint8_t* data ) { +#if ASTCENC_SSE >= 41 table.t0 = vint4::load(data); +#else + table.data = data; +#endif } /** @@ -1079,11 +1095,13 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_32x8& table, const uint8_t* data ) { +#if ASTCENC_SSE >= 41 table.t0 = vint4::load(data); table.t1 = vint4::load(data + 16); -#if ASTCENC_SSE >= 41 table.t1 = table.t1 ^ table.t0; +#else + table.data = data; #endif } @@ -1094,15 +1112,17 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_64x8& table, const uint8_t* data ) { +#if ASTCENC_SSE >= 41 table.t0 = vint4::load(data); table.t1 = vint4::load(data + 16); table.t2 = vint4::load(data + 32); table.t3 = vint4::load(data + 48); -#if ASTCENC_SSE >= 41 table.t3 = table.t3 ^ table.t2; table.t2 = table.t2 ^ table.t1; table.t1 = table.t1 ^ table.t0; +#else + table.data = data; #endif } @@ -1120,14 +1140,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( __m128i result = _mm_shuffle_epi8(tbl.t0.m, idxx); return vint4(result); #else - uint8_t table[16]; - - std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); + return vint4(tbl.data[idx.lane<0>()], + tbl.data[idx.lane<1>()], + tbl.data[idx.lane<2>()], + tbl.data[idx.lane<3>()]); #endif } @@ -1150,15 +1166,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( return vint4(result); #else - uint8_t table[32]; - - std::memcpy(table + 0, &tbl.t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, &tbl.t1.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); + return vint4(tbl.data[idx.lane<0>()], + tbl.data[idx.lane<1>()], + tbl.data[idx.lane<2>()], + tbl.data[idx.lane<3>()]); #endif } @@ -1189,17 +1200,10 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( return vint4(result); #else - uint8_t table[64]; - - std::memcpy(table + 0, 
&tbl.t0.m, 4 * sizeof(int)); - std::memcpy(table + 16, &tbl.t1.m, 4 * sizeof(int)); - std::memcpy(table + 32, &tbl.t2.m, 4 * sizeof(int)); - std::memcpy(table + 48, &tbl.t3.m, 4 * sizeof(int)); - - return vint4(table[idx.lane<0>()], - table[idx.lane<1>()], - table[idx.lane<2>()], - table[idx.lane<3>()]); + return vint4(tbl.data[idx.lane<0>()], + tbl.data[idx.lane<1>()], + tbl.data[idx.lane<2>()], + tbl.data[idx.lane<3>()]); #endif } From a252f3007329fa8e551270647621bb5ec64eb36a Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Fri, 9 Aug 2024 23:50:42 +0100 Subject: [PATCH 4/8] Make API explicitly 32-bit return --- Source/UnitTest/test_simd.cpp | 12 ++++++------ Source/astcenc_decompress_symbolic.cpp | 6 +++--- Source/astcenc_ideal_endpoints_and_weights.cpp | 8 ++++---- Source/astcenc_vecmathlib_avx2_8.h | 6 +++--- Source/astcenc_vecmathlib_neon_4.h | 6 +++--- Source/astcenc_vecmathlib_none_4.h | 6 +++--- Source/astcenc_vecmathlib_sse_4.h | 6 +++--- Source/astcenc_vecmathlib_sve_8.h | 6 +++--- 8 files changed, 28 insertions(+), 28 deletions(-) diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp index dcfa4e644..24fb63f53 100644 --- a/Source/UnitTest/test_simd.cpp +++ b/Source/UnitTest/test_simd.cpp @@ -1959,7 +1959,7 @@ TEST(vint4, vtable4_16x8) vint4 index(0, 7, 4, 15); - vint4 result = vtable_lookup(table, index); + vint4 result = vtable_lookup_32bit(table, index); EXPECT_EQ(result.lane<0>(), 0); EXPECT_EQ(result.lane<1>(), 7); @@ -1982,7 +1982,7 @@ TEST(vint4, vtable4_32x8) vint4 index(0, 7, 4, 31); - vint4 result = vtable_lookup(table, index); + vint4 result = vtable_lookup_32bit(table, index); EXPECT_EQ(result.lane<0>(), 0); EXPECT_EQ(result.lane<1>(), 7); @@ -2009,7 +2009,7 @@ TEST(vint4, vtable4_64x8) vint4 index(0, 7, 38, 63); - vint4 result = vtable_lookup(table, index); + vint4 result = vtable_lookup_32bit(table, index); uint8_t* hack = reinterpret_cast(&table); std::cout << "38: " << hack[38] << "\n"; @@ -3704,7 +3704,7 @@ TEST(vint8, vtable8_16x8) vint8 index = vint8_lit(0, 7, 4, 15, 1, 2, 14, 4); - vint8 result = vtable_lookup(table, index); + vint8 result = vtable_lookup_32bit(table, index); alignas(32) int ra[8]; store(result, ra); @@ -3734,7 +3734,7 @@ TEST(vint8, vtable8_32x8) vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31); - vint8 result = vtable_lookup(table, index); + vint8 result = vtable_lookup_32bit(table, index); alignas(32) int ra[8]; store(result, ra); @@ -3768,7 +3768,7 @@ TEST(vint8, vtable8_64x8) vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 38, 63); - vint8 result = vtable_lookup(table, index); + vint8 result = vtable_lookup_32bit(table, index); alignas(32) int ra[8]; store(result, ra); diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp index ef9165db8..e7791eef6 100644 --- a/Source/astcenc_decompress_symbolic.cpp +++ b/Source/astcenc_decompress_symbolic.cpp @@ -113,7 +113,7 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - summed_value += vtable_lookup(table, texel_weights) * texel_weights_int; + summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int; } store(lsr<4>(summed_value), weights_plane1 + i); @@ -144,8 +144,8 @@ void unpack_weights( vint texel_weights(di.texel_weights_tr[j] + i); vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); - sum_plane1 += vtable_lookup(tab_plane1, texel_weights) * texel_weights_int; - sum_plane2 += vtable_lookup(tab_plane2, 
texel_weights) * texel_weights_int; + sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int; + sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int; } store(lsr<4>(sum_plane1), weights_plane1 + i); diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index 8a09c6f67..ec680dd5e 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -1037,8 +1037,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_lookup(table, weightl); - vint ixhi = vtable_lookup(table, weighth); + vint ixli = vtable_lookup_32bit(table, weightl); + vint ixhi = vtable_lookup_32bit(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); @@ -1068,8 +1068,8 @@ void compute_quantized_weights_for_decimation( vint weightl = float_to_int(ix1); vint weighth = min(weightl + vint(1), steps_m1); - vint ixli = vtable_lookup(table, weightl); - vint ixhi = vtable_lookup(table, weighth); + vint ixli = vtable_lookup_32bit(table, weightl); + vint ixhi = vtable_lookup_32bit(table, weighth); vfloat ixl = int_to_float(ixli); vfloat ixh = int_to_float(ixhi); diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h index 3ed01a0b8..7c75818ae 100644 --- a/Source/astcenc_vecmathlib_avx2_8.h +++ b/Source/astcenc_vecmathlib_avx2_8.h @@ -1054,7 +1054,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( /** * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_16x8& tbl, vint8 idx ) { @@ -1068,7 +1068,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup( /** * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_32x8& tbl, vint8 idx ) { @@ -1086,7 +1086,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup( /** * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_64x8& tbl, vint8 idx ) { diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index b12fff3ff..beea3425f 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -1006,7 +1006,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( /** * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_16x8& tbl, vint4 idx ) { @@ -1020,7 +1020,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_32x8& tbl, vint4 idx ) { @@ -1034,7 +1034,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
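+ *
+ * The _32bit suffix makes explicit that each 8-bit table entry is returned
+ * zero-extended into a 32-bit lane of the result vector.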
*/ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_64x8& tbl, vint4 idx ) { diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h index 9f4f4aed1..4646e84ad 100644 --- a/Source/astcenc_vecmathlib_none_4.h +++ b/Source/astcenc_vecmathlib_none_4.h @@ -1121,7 +1121,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( /** * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_16x8& table, vint4 idx ) { @@ -1134,7 +1134,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_32x8& table, vint4 idx ) { @@ -1147,7 +1147,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_64x8& table, vint4 idx ) { diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h index 02c487820..5c726b6ab 100644 --- a/Source/astcenc_vecmathlib_sse_4.h +++ b/Source/astcenc_vecmathlib_sse_4.h @@ -1129,7 +1129,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( /** * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_16x8& tbl, vint4 idx ) { @@ -1150,7 +1150,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_32x8& tbl, vint4 idx ) { @@ -1176,7 +1176,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup( /** * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint4 vtable_lookup( +ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( const vtable4_64x8& tbl, vint4 idx ) { diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h index a1eeb18c3..6cd6ed046 100644 --- a/Source/astcenc_vecmathlib_sve_8.h +++ b/Source/astcenc_vecmathlib_sve_8.h @@ -963,7 +963,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( /** * @brief Perform a vtable lookup in a 16x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_16x8& tbl, vint8 idx ) { @@ -980,7 +980,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup( /** * @brief Perform a vtable lookup in a 32x 8-bit table with 32-bit indices. */ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_32x8& tbl, vint8 idx ) { @@ -997,7 +997,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup( /** * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices. 
*/ -ASTCENC_SIMD_INLINE vint8 vtable_lookup( +ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( const vtable8_64x8& tbl, vint8 idx ) { From 3aea47327a8c7f3da655b76f284749a2ec2518a8 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sat, 10 Aug 2024 09:10:00 +0100 Subject: [PATCH 5/8] Neon build fixes --- Source/astcenc_vecmathlib_neon_4.h | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index beea3425f..faaff52e2 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -967,8 +967,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_16x8& table, const uint8_t* data ) { - vint4 t0 = vint4::load(data); - table.t0 = vreinterpretq_s8_s32(t0.m); + table.t01 = vldq_u8(data); } /** @@ -978,11 +977,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_32x8& table, const uint8_t* data ) { - vint4 t0 = vint4::load(data); - vint4 t1 = vint4::load(data + 16); - - table.t01[0] = vreinterpretq_s8_s32(t0.m); - table.t01[1] = vreinterpretq_s8_s32(t1.m); + table.t01 = vld2q_u8(data); } /** @@ -992,15 +987,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_64x8& table, const uint8_t* data ) { - vint4 t0 = vint4::load(data); - vint4 t1 = vint4::load(data + 16); - vint4 t2 = vint4::load(data + 32); - vint4 t3 = vint4::load(data + 48); - - table.t0123[0] = vreinterpretq_s8_s32(t0.m); - table.t0123[1] = vreinterpretq_s8_s32(t1.m); - table.t0123[2] = vreinterpretq_s8_s32(t2.m); - table.t0123[3] = vreinterpretq_s8_s32(t3.m); + table.t0123 = vld4q_u8(data); } /** @@ -1028,7 +1015,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table.t01, idx_bytes))); + return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(tbl.t01, idx_bytes))); } /** @@ -1042,7 +1029,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table.t0123, idx_bytes))); + return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(tbl.t0123, idx_bytes))); } /** From 3520617a9b7e6a176006713831b64cc064bcaadf Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sat, 10 Aug 2024 09:13:36 +0100 Subject: [PATCH 6/8] Neon build fixes --- Source/astcenc_vecmathlib_neon_4.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index faaff52e2..9be70d4e6 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -943,21 +943,21 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) * Table structure for a 16x 8-bit entry table. */ struct vtable4_16x8 { - int8x16_t t0; + uint8x16_t t0; }; /* * Table structure for a 32x 8-bit entry table. */ struct vtable4_32x8 { - int8x16x2_t t01; + uint8x16x2_t t01; }; /* * Table structure for a 64x 8-bit entry table. 
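+ *
+ * The element type is unsigned so the members match the uint8x16_t values
+ * returned by the vld1q_u8 loads and consumed by the vqtbl*q_u8 lookups.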
*/ struct vtable4_64x8 { - int8x16x4_t t0123; + uint8x16x4_t t0123; }; /** @@ -967,7 +967,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_16x8& table, const uint8_t* data ) { - table.t01 = vldq_u8(data); + table.t0 = vld1q_u8(data); } /** @@ -1001,7 +1001,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(tbl.t0, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl1q_u8(tbl.t0, idx_bytes))); } /** @@ -1015,7 +1015,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(tbl.t01, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl2q_u8(tbl.t01, idx_bytes))); } /** @@ -1029,7 +1029,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_lookup_32bit( int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); - return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(tbl.t0123, idx_bytes))); + return vint4(vreinterpretq_s32_u8(vqtbl4q_u8(tbl.t0123, idx_bytes))); } /** From 1a8e6bff78d10183fc3a3305e7c1970dcfa65688 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sat, 10 Aug 2024 09:32:13 +0100 Subject: [PATCH 7/8] Fix Neon test failure --- Source/astcenc_vecmathlib_neon_4.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index 9be70d4e6..c7ff01289 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -977,7 +977,10 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_32x8& table, const uint8_t* data ) { - table.t01 = vld2q_u8(data); + table.t01 = uint8x16x2_t { + vld1q_u8(data), + vld1q_u8(data + 16) + }; } /** @@ -987,7 +990,12 @@ ASTCENC_SIMD_INLINE void vtable_prepare( vtable4_64x8& table, const uint8_t* data ) { - table.t0123 = vld4q_u8(data); + table.t0123 = uint8x16x4_t { + vld1q_u8(data), + vld1q_u8(data + 16), + vld1q_u8(data + 32), + vld1q_u8(data + 48) + }; } /** From ed9836221898e6f406daf4e9512188edbe13968f Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sat, 10 Aug 2024 18:15:49 +0000 Subject: [PATCH 8/8] Fix SVE build issue --- Source/astcenc_vecmathlib_sve_8.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h index 6cd6ed046..9b48411b3 100644 --- a/Source/astcenc_vecmathlib_sve_8.h +++ b/Source/astcenc_vecmathlib_sve_8.h @@ -988,7 +988,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit( svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00)); svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked); - svuint8_8_t tbl_bytes = svreinterpret_u8_s32(table.t0.m); + svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m); svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes); return vint8(svreinterpret_s32_u8(result));