From f716eabec7502ca84badbfeca1ba94a1e3d2dac3 Mon Sep 17 00:00:00 2001
From: Peter Harris
Date: Tue, 5 Nov 2024 12:40:12 +0000
Subject: [PATCH] Use SVE widening loads

---
 Docs/ChangeLog-5x.md               |  2 ++
 Source/astcenc_vecmathlib_neon_4.h | 13 +++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/Docs/ChangeLog-5x.md b/Docs/ChangeLog-5x.md
index d19f6224..166843fb 100644
--- a/Docs/ChangeLog-5x.md
+++ b/Docs/ChangeLog-5x.md
@@ -24,6 +24,8 @@ The 5.1.0 release is a maintenance release.
     byte-to-int index conversion.
   * **Optimization:** Added improved intrinsics sequence for SSE and AVX2
     `hmin()` and `hmax()`.
+  * **Optimization:** Added improved intrinsics sequence for `vint4(uint8_t*)`
+    on systems implementing Arm SVE.
 
 ## 5.0.0
 
diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h
index 9142a328..7cfd0a2f 100644
--- a/Source/astcenc_vecmathlib_neon_4.h
+++ b/Source/astcenc_vecmathlib_neon_4.h
@@ -194,10 +194,15 @@ struct vint4
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
 	{
-		// Cast is safe - NEON loads are allowed to be unaligned
-		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
-		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
-		m = vreinterpretq_s32_u32(vmovl_u16(t16));
+#if ASTCENC_SVE == 0
+		// Cast is safe - NEON loads are allowed to be unaligned
+		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
+		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
+		m = vreinterpretq_s32_u32(vmovl_u16(t16));
+#else
+		svint32_t data = svld1ub_s32(svptrue_pat_b32(SV_VL4), p);
+		m = svget_neonq(data);
+#endif
 	}
 
 	/**
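
For reference, a minimal standalone sketch of the SVE sequence introduced above. The helper name load4_u8_to_s32 and the main() harness are illustrative only, not part of the patch; it assumes a toolchain that provides arm_sve.h and the arm_neon_sve_bridge.h header (e.g. built with -march=armv8-a+sve), matching the build configuration implied by ASTCENC_SVE.

#include <arm_neon.h>
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>

#include <cstdint>
#include <cstdio>

// Hypothetical helper: load 4 bytes and zero-extend each to a 32-bit
// lane of a NEON vector, as the SVE path of vint4(const uint8_t*) does.
static int32x4_t load4_u8_to_s32(const uint8_t* p)
{
	// Predicate enabling only the first four 32-bit lanes
	svbool_t pred = svptrue_pat_b32(SV_VL4);

	// Widening load: each active lane loads one byte and
	// zero-extends it to 32 bits in a single instruction
	svint32_t data = svld1ub_s32(pred, p);

	// NEON-SVE bridge: reinterpret the low 128 bits of the SVE
	// register as a NEON int32x4_t (explicit form of svget_neonq)
	return svget_neonq_s32(data);
}

int main()
{
	uint8_t bytes[4] { 1, 2, 3, 254 };
	int32x4_t v = load4_u8_to_s32(bytes);

	// Expected output: 1 2 3 254
	std::printf("%d %d %d %d\n",
		vgetq_lane_s32(v, 0), vgetq_lane_s32(v, 1),
		vgetq_lane_s32(v, 2), vgetq_lane_s32(v, 3));
	return 0;
}

The point of the change is visible in the diff itself: the predicated widening load collapses the three-intrinsic NEON chain (duplicating load, u8-to-u16 widen, u16-to-u32 widen) into one load, and the bridge call should be free on hardware where the NEON register file aliases the low 128 bits of the SVE registers.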