From f716eabec7502ca84badbfeca1ba94a1e3d2dac3 Mon Sep 17 00:00:00 2001
From: Peter Harris
Date: Tue, 5 Nov 2024 12:40:12 +0000
Subject: [PATCH] Use SVE widening loads

---
 Docs/ChangeLog-5x.md               |  2 ++
 Source/astcenc_vecmathlib_neon_4.h | 13 +++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/Docs/ChangeLog-5x.md b/Docs/ChangeLog-5x.md
index d19f6224..166843fb 100644
--- a/Docs/ChangeLog-5x.md
+++ b/Docs/ChangeLog-5x.md
@@ -24,6 +24,8 @@ The 5.1.0 release is a maintenance release.
     byte-to-int index conversion.
   * **Optimization:** Added improved intrinsics sequence for SSE and AVX2
     `hmin()` and `hmax()`.
+  * **Optimization:** Added improved intrinsics sequence for `vint4(uint8_t*)`
+    on systems implementing Arm SVE.
 
 ## 5.0.0
 
diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h
index 9142a328..7cfd0a2f 100644
--- a/Source/astcenc_vecmathlib_neon_4.h
+++ b/Source/astcenc_vecmathlib_neon_4.h
@@ -194,10 +194,15 @@ struct vint4
 	 */
 	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
 	{
-		// Cast is safe - NEON loads are allowed to be unaligned
-		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
-		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
-		m = vreinterpretq_s32_u32(vmovl_u16(t16));
+#if ASTCENC_SVE == 0
+		// Cast is safe - NEON loads are allowed to be unaligned
+		uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
+		uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8)));
+		m = vreinterpretq_s32_u32(vmovl_u16(t16));
+#else
+		svint32_t data = svld1ub_s32(svptrue_pat_b32(SV_VL4), p);
+		m = svget_neonq(data);
+#endif
 	}
 
 	/**
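
For reference, a minimal standalone sketch of the SVE sequence introduced above. The helper name load4_u8_to_s32 and the main() harness are illustrative only, not part of the patch; it assumes a toolchain that provides arm_sve.h and the arm_neon_sve_bridge.h header (e.g. built with -march=armv8-a+sve), matching the build configuration implied by ASTCENC_SVE.

#include <arm_neon.h>
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>

#include <cstdint>
#include <cstdio>

// Hypothetical helper: load 4 bytes and zero-extend each to a 32-bit
// lane of a NEON vector, as the SVE path of vint4(const uint8_t*) does.
static int32x4_t load4_u8_to_s32(const uint8_t* p)
{
	// Predicate enabling only the first four 32-bit lanes
	svbool_t pred = svptrue_pat_b32(SV_VL4);

	// Widening load: each active lane loads one byte and
	// zero-extends it to 32 bits in a single instruction
	svint32_t data = svld1ub_s32(pred, p);

	// NEON-SVE bridge: reinterpret the low 128 bits of the SVE
	// register as a NEON int32x4_t (explicit form of svget_neonq)
	return svget_neonq_s32(data);
}

int main()
{
	uint8_t bytes[4] { 1, 2, 3, 254 };
	int32x4_t v = load4_u8_to_s32(bytes);

	// Expected output: 1 2 3 254
	std::printf("%d %d %d %d\n",
		vgetq_lane_s32(v, 0), vgetq_lane_s32(v, 1),
		vgetq_lane_s32(v, 2), vgetq_lane_s32(v, 3));
	return 0;
}

The point of the change is visible in the diff itself: the predicated widening load collapses the three-intrinsic NEON chain (duplicating load, u8-to-u16 widen, u16-to-u32 widen) into one load, and the bridge call should be free on hardware where the NEON register file aliases the low 128 bits of the SVE registers.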