From 60fb57324ca946aea534b234fb82cf2dcf19804f Mon Sep 17 00:00:00 2001
From: Fabian Giesen <fabian.giesen@epicgames.com>
Date: Fri, 8 Nov 2024 16:24:09 -0800
Subject: [PATCH 1/2] Cheaper compute_lowest_and_highest_weight

The actual weights array is constant, so we can compute the
min/max weights in a single pass ahead of time. The remapping
we apply to the weights depending on rcp_stepsize and offset
is monotonic; therefore we can figure out minidx and maxidx in
advance from the min/max weights we determined previously.

This in turn means that we don't need to do the "reset" handling
in the loop where we zero the accumulator whenever the current
min/max changes. The amount of code in the inner loop nest is
considerably reduced as a result, and the extra preprocessing
outside that loop is generally less than the work it replaces.

The exact impact depends on the block size and max_angular_steps.
Testing with RGB textures and -thorough quality, I observe
approximately 0.7% coding time reduction with 4x4 blocks; 6x6
and larger see a 1.2% reduction.

At -fast quality, there is not much gain, but neither is it
any slower.

In my tests, this change does not alter encoding results (nor
should it); the output is bitwise identical to before.

Signed-off-by: Fabian Giesen <fabian.giesen@epicgames.com>
---
 Source/astcenc_weight_align.cpp | 52 +++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 15 deletions(-)
diff --git a/Source/astcenc_weight_align.cpp b/Source/astcenc_weight_align.cpp
index c0bed070..4f302388 100644
--- a/Source/astcenc_weight_align.cpp
+++ b/Source/astcenc_weight_align.cpp
@@ -43,6 +43,7 @@
 #include <stdio.h>
 #include <cassert>
 #include <cstring>
+#include <cfloat>
 
 static constexpr unsigned int ANGULAR_STEPS { 32 };
 
@@ -169,16 +170,47 @@ static void compute_lowest_and_highest_weight(
 
 	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
 
+	// Compute minimum/maximum weights in the weight array. Our remapping
+	// is monotonic, so the min/max rounded weights relate to the min/max
+	// unrounded weights in a straightforward way.
+	vfloat min_weight(FLT_MAX);
+	vfloat max_weight(-FLT_MAX);
+	unsigned int partial_weight_start = round_down_to_simd_multiple_vla(weight_count);
+	for (unsigned int i = 0; i < partial_weight_start; i += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat weights = loada(dec_weight_ideal_value + i);
+		min_weight = min(min_weight, weights);
+		max_weight = max(max_weight, weights);
+	}
+
+	if (partial_weight_start != weight_count)
+	{
+		vfloat partial_weights = loada(dec_weight_ideal_value + partial_weight_start);
+		vmask active = vint::lane_id() < vint(weight_count - partial_weight_start);
+
+		vmask smaller = active & (partial_weights < min_weight);
+		min_weight = select(min_weight, partial_weights, smaller);
+
+		vmask larger = active & (partial_weights > max_weight);
+		max_weight = select(max_weight, partial_weights, larger);
+	}
+
+	min_weight = hmin(min_weight);
+	max_weight = hmax(max_weight);
+
 	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
 	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
 	{
-		vfloat minidx(128.0f);
-		vfloat maxidx(-128.0f);
 		vfloat errval = vfloat::zero();
 		vfloat cut_low_weight_err = vfloat::zero();
 		vfloat cut_high_weight_err = vfloat::zero();
 		vfloat offset = loada(offsets + sp);
 
+		// We know the min and max weight values, so we can figure out
+		// the corresponding indices before we enter the loop.
+		vfloat minidx = round(min_weight * rcp_stepsize - offset);
+		vfloat maxidx = round(max_weight * rcp_stepsize - offset);
+
 		for (unsigned int j = 0; j < weight_count; j++)
 		{
 			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
@@ -186,22 +218,12 @@ static void compute_lowest_and_highest_weight(
 			vfloat diff = sval - svalrte;
 			errval += diff * diff;
 
-			// Reset tracker on min hit
-			vmask mask = svalrte < minidx;
-			minidx = select(minidx, svalrte, mask);
-			cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on min hit
-			mask = svalrte == minidx;
+			// Accumulate errors for minimum index
+			vmask mask = svalrte == minidx;
 			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
 			cut_low_weight_err = select(cut_low_weight_err, accum, mask);
 
-			// Reset tracker on max hit
-			mask = svalrte > maxidx;
-			maxidx = select(maxidx, svalrte, mask);
-			cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on max hit
+			// Accumulate errors for maximum index
 			mask = svalrte == maxidx;
 			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
 			cut_high_weight_err = select(cut_high_weight_err, accum, mask);

From 6b2de52d351a6bafd68c778cd5e5fff300b47417 Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Sat, 9 Nov 2024 14:09:30 +0000
Subject: [PATCH 2/2] Add to changelog

---
 Docs/ChangeLog-5x.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Docs/ChangeLog-5x.md b/Docs/ChangeLog-5x.md
index 166843fb..68707dac 100644
--- a/Docs/ChangeLog-5x.md
+++ b/Docs/ChangeLog-5x.md
@@ -22,6 +22,8 @@ The 5.1.0 release is a maintenance release.
   * **Optimization:** Added new `gather()` abstraction for gathers using byte
     indices, allowing implementations without gather hardware to skip the
     byte-to-int index conversion.
+  * **Optimization:** Optimized `compute_lowest_and_highest_weight()` to
+    pre-compute the min/max weights outside of the main loop.
   * **Optimization:** Added improved intrinsics sequence for SSE and AVX2
     `hmin()` and `hmax()`.
   * **Optimization:** Added improved intrinsics sequence for `vint4(uint8_t*)`