ARM-software · solidpixel · Mar 21, 2025 · Mar 21, 2025 · Mar 21, 2025
diff --git a/Source/astcenc_averages_and_directions.cpp b/Source/astcenc_averages_and_directions.cpp
@@ -68,7 +68,7 @@ static void compute_partition_averages_rgb(
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -104,7 +104,7 @@ static void compute_partition_averages_rgb(
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -149,7 +149,7 @@ static void compute_partition_averages_rgb(
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -239,7 +239,7 @@ static void compute_partition_averages_rgba(
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -279,7 +279,7 @@ static void compute_partition_averages_rgba(
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -330,7 +330,7 @@ static void compute_partition_averages_rgba(
 		{
 			vint texel_partition(pi.partition_of_texel + i);
 
-			vmask lane_mask = lane_id < vint(texel_count);
+			vmask lane_mask = lane_id < vint_from_size(texel_count);
 			lane_id += vint(ASTCENC_SIMD_WIDTH);
 
 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
@@ -777,7 +777,7 @@ void compute_error_squared_rgba(
 		vint lane_ids = vint::lane_id();
 		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
-			vmask mask = lane_ids < vint(texel_count);
+			vmask mask = lane_ids < vint_from_size(texel_count);
 			const uint8_t* texel_idxs = texel_indexes + i;
 
 			vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
@@ -891,7 +891,7 @@ void compute_error_squared_rgb(
 		vint lane_ids = vint::lane_id();
 		for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 		{
-			vmask mask = lane_ids < vint(texel_count);
+			vmask mask = lane_ids < vint_from_size(texel_count);
 			const uint8_t* texel_idxs = texel_indexes + i;
 
 			vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);

diff --git a/Source/astcenc_mathlib.h b/Source/astcenc_mathlib.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -48,7 +48,7 @@
     #define ASTCENC_SSE 42
   #elif defined(__SSE4_1__)
     #define ASTCENC_SSE 41
-  #elif defined(__SSE2__)
+  #elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
     #define ASTCENC_SSE 20
   #else
     #define ASTCENC_SSE 0
@@ -68,7 +68,7 @@
 #endif
 
 #ifndef ASTCENC_NEON
-  #if defined(__aarch64__)
+  #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
     #define ASTCENC_NEON 1
   #else
     #define ASTCENC_NEON 0

diff --git a/Source/astcenc_pick_best_endpoint_format.cpp b/Source/astcenc_pick_best_endpoint_format.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2024 Arm Limited
+// Copyright 2011-2025 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -1292,9 +1292,12 @@ unsigned int compute_ideal_endpoint_formats(
 		vint vbest_error_index(-1);
 		vfloat vbest_ep_error(ERROR_CALC_DEFAULT);
 
-		start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
-		vint lane_ids = vint::lane_id() + vint(start_block_mode);
-		for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
+		// TODO: start_block_mode and end_block_mode should be size_t inputs, which
+		// would avoid some of this type conversion, but changing them propagates
+		// beyond this function and will need a bigger PR to fix.
+		size_t start_mode = round_down_to_simd_multiple_vla(start_block_mode);
+		vint lane_ids = vint::lane_id() + vint_from_size(start_mode);
+		for (size_t j = start_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
 		{
 			vfloat err = vfloat(errors_of_best_combination + j);
 			vmask mask = err < vbest_ep_error;

diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h
@@ -104,6 +104,7 @@ template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indic
 
 	constexpr auto loada = vfloat8::loada;
 	constexpr auto load1 = vfloat8::load1;
+	constexpr auto vint_from_size = vint8_from_size;
 
 #elif ASTCENC_SSE >= 20
 	// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
@@ -123,6 +124,7 @@ template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indic
 
 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
 
 #elif ASTCENC_SVE == 8
 	// Check the compiler is configured with fixed-length 256-bit SVE.
@@ -154,6 +156,7 @@ template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indic
 
 	constexpr auto loada = vfloat8::loada;
 	constexpr auto load1 = vfloat8::load1;
+	constexpr auto vint_from_size = vint8_from_size;
 
 #elif ASTCENC_NEON > 0
 	// If we have NEON expose 4-wide VLA.
@@ -173,6 +176,7 @@ template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indic
 
 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
 
 #else
 	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
@@ -209,6 +213,7 @@ template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indic
 
 	constexpr auto loada = vfloat4::loada;
 	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
 #endif
 
 /**

diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h
@@ -497,6 +497,15 @@ ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
 	return _mm256_cvtsi256_si32(hmax(a).m);
 }
 
+/**
+ * @brief Generate a vint8 from a size_t; asserts that @c a fits in an int.
+ */
+ASTCENC_SIMD_INLINE vint8 vint8_from_size(size_t a)
+{
+	assert(a <= static_cast<size_t>(std::numeric_limits<int>::max()));
+	return vint8(static_cast<int>(a));
+}
+
 /**
  * @brief Store a vector to a 16B aligned memory address.
  */

diff --git a/Source/astcenc_vecmathlib_common_4.h b/Source/astcenc_vecmathlib_common_4.h
@@ -31,6 +31,7 @@
 #endif
 
 #include <cstdio>
+#include <limits>
 
 // ============================================================================
 // vint4 operators and functions
@@ -117,6 +118,15 @@ ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
 	return hmin(a).lane<0>();
 }
 
+/**
+ * @brief Generate a vint4 from a size_t; asserts that @c a fits in an int.
+ */
+ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)
+{
+	assert(a <= static_cast<size_t>(std::numeric_limits<int>::max()));
+	return vint4(static_cast<int>(a));
+}
+
 /**
  * @brief Return the horizontal maximum of a vector.
  */

diff --git a/Source/astcenc_vecmathlib_sve_8.h b/Source/astcenc_vecmathlib_sve_8.h
@@ -485,6 +485,15 @@ ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
 	return svmaxv_s32(svptrue_b32(), a.m);
 }
 
+/**
+ * @brief Generate a vint8 from a size_t; asserts that @c a fits in an int.
+ */
+ASTCENC_SIMD_INLINE vint8 vint8_from_size(size_t a)
+{
+	assert(a <= static_cast<size_t>(std::numeric_limits<int>::max()));
+	return vint8(static_cast<int>(a));
+}
+
 /**
  * @brief Store a vector to a 16B aligned memory address.
  */

diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake
@@ -164,6 +164,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
 
             # MSVC compiler defines
             $<${is_msvc_fe}:/EHsc>
+            $<${is_msvc_fe}:/WX>
             $<${is_msvccl}:/wd4324>
 
             # G++ and Clang++ compiler defines