diff --git a/Docs/ChangeLog-4x.md b/Docs/ChangeLog-4x.md index e4f601b0..0228f883 100644 --- a/Docs/ChangeLog-4x.md +++ b/Docs/ChangeLog-4x.md @@ -11,14 +11,21 @@ clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads. **Status:** In development -The 4.9.0 release is a minor maintenance release. +The 4.9.0 release is a small release adding support for Arm Scalable Vector +Extensions SIMD, as well as some minor bug fixes. * **General:** * **Bug fix:** Fixed incorrect return type in "None" vector library reference implementation. * **Bug fix:** Fixed sincos table index under/overflow. - * **Feature:** Added backend for Arm SVE fixed-width 256-bit builds. - * **Feature:** Added backend for Arm SVE fixed-width 128-bit builds. + * **Feature:** Changed `ASTCENC_ISA_NATIVE` builds to use `-march=native` and + `-mcpu=native`. + * **Feature:** Added backend for Arm SVE fixed-width 256-bit builds. These + can only run on hardware implementing 256-bit SVE. + * **Feature:** Added backend for Arm SVE 128-bit builds. These are portable + builds and can run on hardware implementing any SVE vector length, but the + explicit SVE use is augmented NEON and will only use the bottom 128 bits of + each SVE vector. * **Feature:** Optimized NEON mask `any()` and `all()` functions. * **Feature:** Migrated build and test to GitHub Actions pipelines. @@ -36,8 +43,9 @@ The 4.8.0 release is a minor maintenance release. language behavior, to improve support for deployment using Emscripten. * **Feature:** Builds using Clang can now build with undefined behavior sanitizer by setting `-DASTCENC_UBSAN=ON` on the CMake configure line. - * **Feature:** Updated to Wuffs library 0.3.4, which ignores tRNS alpha chunks - for type 4 (LA) and 6 (RGBA) PNGs, to improve compatibility with libpng. + * **Feature:** Updated to Wuffs library 0.3.4, which ignores tRNS alpha + chunks for type 4 (LA) and 6 (RGBA) PNGs, to improve compatibility with + libpng. 
## 4.7.0 @@ -49,8 +57,8 @@ the decompressor to match the Khronos specification. This fix includes the addition of explicit support for optimizing for `decode_unorm8` rounding. Reminder - the codec library API is not designed to be binary compatible across -versions. We always recommend rebuilding your client-side code using the updated -`astcenc.h` header. +versions. We always recommend rebuilding your client-side code using the +updated `astcenc.h` header. * **General:** * **Bug fix:** sRGB LDR decompression now uses the correct endpoint expansion diff --git a/Source/UnitTest/cmake_core.cmake b/Source/UnitTest/cmake_core.cmake index 1bf9ddad..61b3764f 100644 --- a/Source/UnitTest/cmake_core.cmake +++ b/Source/UnitTest/cmake_core.cmake @@ -117,7 +117,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128") # Enable SVE target_compile_options(${ASTCENC_TEST} PRIVATE - -march=armv8-a+sve -msve-vector-bits=128) + -march=armv8-a+sve) elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2") target_compile_definitions(${ASTCENC_TEST} diff --git a/Source/astcenc_mathlib.h b/Source/astcenc_mathlib.h index 959f9ba9..f6901539 100644 --- a/Source/astcenc_mathlib.h +++ b/Source/astcenc_mathlib.h @@ -74,7 +74,18 @@ #endif #ifndef ASTCENC_SVE - #define ASTCENC_SVE 0 + #if defined(__ARM_FEATURE_SVE) + #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256 + #define ASTCENC_SVE 8 + // Auto-detected SVE can only assume vector width of 4 is available, but + // must also allow for hardware being longer and so all use of intrinsics + // must explicitly use predicate masks to limit to 4-wide. 
+ #else + #define ASTCENC_SVE 4 + #endif + #else + #define ASTCENC_SVE 0 + #endif #endif // Force vector-sized SIMD alignment diff --git a/Source/astcenccli_entry2.cpp b/Source/astcenccli_entry2.cpp index 60fe6d4b..2434a692 100644 --- a/Source/astcenccli_entry2.cpp +++ b/Source/astcenccli_entry2.cpp @@ -57,8 +57,10 @@ int astcenc_main_veneer( int argc, char **argv ) { -#if ASTCENC_SVE != 0 - // svcntw() return compile-time length if used with -msve-vector-bits + // We don't need this check for 128-bit SVE, because that is compiled as + // VLA code, using predicate masks in the augmented NEON. +#if ASTCENC_SVE > 4 + // svcntw() returns compile-time length if used with -msve-vector-bits if (svcntw() != ASTCENC_SVE) { int bits = ASTCENC_SVE * 32; diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp index ad8ef3f2..f475b39c 100644 --- a/Source/astcenccli_toplevel_help.cpp +++ b/Source/astcenccli_toplevel_help.cpp @@ -585,6 +585,14 @@ void astcenc_print_header() unsigned int bits = static_cast(sizeof(void*) * 8); printf(astcenc_copyright_string, VERSION_STRING, bits, simdtype, pcnttype, f16ctype, YEAR_STRING); + + // If possible, print hint that 8-wide SVE could be used +#if ASTCENC_SVE == 4 + if (svcntw() == 8) + { + printf("Note: This CPU can support 256-bit SVE builds.\n"); + } +#endif } /* See header for documentation. 
*/ diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake index 1f6a4d73..b53bfa04 100644 --- a/Source/cmake_core.cmake +++ b/Source/cmake_core.cmake @@ -336,10 +336,13 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE) ASTCENC_F16C=0) # Enable SVE in the core library + # Note that for 128-bit SVE the generated code is actually + # vector-length agnostic, but any manual intrinsics used in the + # enhanced-NEON library use 128-bit data width predicates if (NOT ${ASTCENC_VENEER_TYPE}) target_compile_options(${ASTCENC_TARGET_NAME} PRIVATE - -march=armv8-a+sve -msve-vector-bits=128) + -march=armv8-a+sve) # Enable SVE without fixed vector length in the veneer elseif (${ASTCENC_VENEER_TYPE} EQUAL 2) @@ -429,6 +432,21 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE) $<${is_gnu_fe}:-mfma>) endif() + elseif(${ASTCENC_ISA_SIMD} MATCHES "native") + target_compile_definitions(${ASTCENC_TARGET_NAME} + PRIVATE) + + if (${ASTCENC_VENEER_TYPE} GREATER 0) + target_compile_options(${ASTCENC_TARGET_NAME} + PRIVATE + $<${is_gnu_fe}:-Wno-unused-command-line-argument>) + else() + target_compile_options(${ASTCENC_TARGET_NAME} + PRIVATE + $<${is_clangcl}:-mcpu=native -march=native> + $<${is_gnu_fe}:-mcpu=native -march=native> + $<${is_gnu_fe}:-Wno-unused-command-line-argument>) + endif() endif() endmacro()