From 60fb57324ca946aea534b234fb82cf2dcf19804f Mon Sep 17 00:00:00 2001 From: Fabian Giesen Date: Fri, 8 Nov 2024 16:24:09 -0800 Subject: [PATCH 1/2] Cheaper compute_lowest_and_highest_weight The actual weights array is constant, so we can compute the min/max weights in a single pass ahead of time. The remapping we apply to the weights depending on rcp_stepsize and offset is monotonic; therefore we can figure out minidx and maxidx in advance from the min/max weights we determined previously. This in turn means that we don't need to do the "reset" handling in the loop where we zero the accumulator whenever the current min/max changes. The amount of code in the inner loop nest is considerably reduced as a result, and the extra preprocessing outside that loop is generally less than the work it replaces. The exact impact depends on the block size and max_angular_steps. Testing with RGB textures and -thorough quality, I observe approximately 0.7% coding time reduction with 4x4 blocks; 6x6 and larger see a 1.2% reduction. At -fast quality, there is not much gain, but neither is it any slower. In my tests, this change does not alter encoding results (nor should it); the output is bitwise identical to before. Signed-off-by: Fabian Giesen --- Source/astcenc_weight_align.cpp | 52 +++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/Source/astcenc_weight_align.cpp b/Source/astcenc_weight_align.cpp index c0bed070..4f302388 100644 --- a/Source/astcenc_weight_align.cpp +++ b/Source/astcenc_weight_align.cpp @@ -43,6 +43,7 @@ #include #include #include +#include <cfloat> static constexpr unsigned int ANGULAR_STEPS { 32 }; @@ -169,16 +170,47 @@ static void compute_lowest_and_highest_weight( vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f); + // Compute minimum/maximum weights in the weight array. Our remapping + // is monotonic, so the min/max rounded weights relate to the min/max + // unrounded weights in a straightforward way. 
+ vfloat min_weight(FLT_MAX); + vfloat max_weight(-FLT_MAX); + unsigned int partial_weight_start = round_down_to_simd_multiple_vla(weight_count); + for (unsigned int i = 0; i < partial_weight_start; i += ASTCENC_SIMD_WIDTH) + { + vfloat weights = loada(dec_weight_ideal_value + i); + min_weight = min(min_weight, weights); + max_weight = max(max_weight, weights); + } + + if (partial_weight_start != weight_count) + { + vfloat partial_weights = loada(dec_weight_ideal_value + partial_weight_start); + vmask active = vint::lane_id() < vint(weight_count - partial_weight_start); + + vmask smaller = active & (partial_weights < min_weight); + min_weight = select(min_weight, partial_weights, smaller); + + vmask larger = active & (partial_weights > max_weight); + max_weight = select(max_weight, partial_weights, larger); + } + + min_weight = hmin(min_weight); + max_weight = hmax(max_weight); + // Arrays are ANGULAR_STEPS long, so always safe to run full vectors for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH) { - vfloat minidx(128.0f); - vfloat maxidx(-128.0f); vfloat errval = vfloat::zero(); vfloat cut_low_weight_err = vfloat::zero(); vfloat cut_high_weight_err = vfloat::zero(); vfloat offset = loada(offsets + sp); + // We know the min and max weight values, so we can figure out + // the corresponding indices before we enter the loop. 
+ vfloat minidx = round(min_weight * rcp_stepsize - offset); + vfloat maxidx = round(max_weight * rcp_stepsize - offset); + for (unsigned int j = 0; j < weight_count; j++) { vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset; @@ -186,22 +218,12 @@ static void compute_lowest_and_highest_weight( vfloat diff = sval - svalrte; errval += diff * diff; - // Reset tracker on min hit - vmask mask = svalrte < minidx; - minidx = select(minidx, svalrte, mask); - cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask); - - // Accumulate on min hit - mask = svalrte == minidx; + // Accumulate errors for minimum index + vmask mask = svalrte == minidx; vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff; cut_low_weight_err = select(cut_low_weight_err, accum, mask); - // Reset tracker on max hit - mask = svalrte > maxidx; - maxidx = select(maxidx, svalrte, mask); - cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask); - - // Accumulate on max hit + // Accumulate errors for maximum index mask = svalrte == maxidx; accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff; cut_high_weight_err = select(cut_high_weight_err, accum, mask); From 6b2de52d351a6bafd68c778cd5e5fff300b47417 Mon Sep 17 00:00:00 2001 From: Peter Harris Date: Sat, 9 Nov 2024 14:09:30 +0000 Subject: [PATCH 2/2] Add to changelog --- Docs/ChangeLog-5x.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Docs/ChangeLog-5x.md b/Docs/ChangeLog-5x.md index 166843fb..68707dac 100644 --- a/Docs/ChangeLog-5x.md +++ b/Docs/ChangeLog-5x.md @@ -22,6 +22,8 @@ The 5.1.0 release is a maintenance release. * **Optimization:** Added new `gather()` abstraction for gathers using byte indices, allowing implementations without gather hardware to skip the byte-to-int index conversion. + * **Optimization:** Optimized `compute_lowest_and_highest_weight()` to + pre-compute min/max outside of the main loop. 
* **Optimization:** Added improved intrinsics sequence for SSE and AVX2 `hmin()` and `hmax()`. * **Optimization:** Added improved intrinsics sequence for `vint4(uint8_t*)`