From 9b4306d5704bebd35cab143dc4d5d10f70ff41fd Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Fri, 22 Mar 2024 16:14:19 +0800
Subject: [PATCH 01/16] rate-distortion optimization

---
 Source/astcenc.h                    |  11 +
 Source/astcenc_entry.cpp            |  10 +
 Source/astcenc_internal.h           |  16 +
 Source/astcenc_internal_entry.h     |   9 +-
 Source/astcenc_rate_distortion.cpp  | 378 +++++++++++++++
 Source/astcenccli_toplevel.cpp      |  28 ++
 Source/astcenccli_toplevel_help.cpp |  13 +
 Source/cmake_core.cmake             |   4 +
 Source/ert.cpp                      | 705 ++++++++++++++++++++++++++++
 Source/ert.h                        |  81 ++++
 10 files changed, 1254 insertions(+), 1 deletion(-)
 create mode 100644 Source/astcenc_rate_distortion.cpp
 create mode 100644 Source/ert.cpp
 create mode 100644 Source/ert.h

diff --git a/Source/astcenc.h b/Source/astcenc.h
index 3d04b4ea5..a54e27fe4 100644
--- a/Source/astcenc.h
+++ b/Source/astcenc.h
@@ -571,6 +571,17 @@ struct astcenc_config
 	 */
 	float tune_search_mode0_enable;
 
+	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
+	/**
+	 * @brief The rate-distortion optimization level, clamped to [0, 1]; 0 skips RDO entirely.
+	 */
+	float rdo_level;
+	/**
+	 * @brief The number of blocks in each entropy reduction batch; 0 processes all blocks as one batch.
+	 */
+	unsigned int rdo_lookback;
+	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
+
 	/**
 	 * @brief The progress callback, can be @c nullptr.
 	 *
diff --git a/Source/astcenc_entry.cpp b/Source/astcenc_entry.cpp
index 5dc380160..faf8e94b5 100644
--- a/Source/astcenc_entry.cpp
+++ b/Source/astcenc_entry.cpp
@@ -412,6 +412,9 @@ static astcenc_error validate_config(
 	config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
 	config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
 
+	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
+	config.rdo_level = astc::clamp(config.rdo_level, 0.0f, 1.0f);
+
 	// Specifying a zero weight color component is not allowed; force to small value
 	float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
 	                             astc::max(config.cw_b_weight, config.cw_a_weight));
@@ -650,6 +653,9 @@ astcenc_error astcenc_config_init(
 	}
 	config.flags = flags;
 
+	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
+	config.rdo_lookback = 64;
+
 	return ASTCENC_SUCCESS;
 }
 
@@ -1099,6 +1105,9 @@ astcenc_error astcenc_compress_image(
 	// Only the first thread to arrive actually runs the term
 	ctxo->manage_compress.term(term_compress);
 
+	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
+	rate_distortion_optimize(*ctxo, image, *swizzle, data_out);
+
 	return ASTCENC_SUCCESS;
 #endif
 }
@@ -1119,6 +1128,7 @@ astcenc_error astcenc_compress_reset(
 
 	ctxo->manage_avg.reset();
 	ctxo->manage_compress.reset();
+	ctxo->manage_rdo.reset(); //SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
 	return ASTCENC_SUCCESS;
 #endif
 }
diff --git a/Source/astcenc_internal.h b/Source/astcenc_internal.h
index df6e07f9e..28a88b564 100644
--- a/Source/astcenc_internal.h
+++ b/Source/astcenc_internal.h
@@ -1231,6 +1231,9 @@ struct astcenc_contexti
 #if !defined(ASTCENC_DECOMPRESS_ONLY)
 	/** @brief The pixel region and variance worker arguments. */
 	avg_args avg_preprocess_args;
+
+	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
+	struct astcenc_rdo_context* rdo_context;
 #endif
 
 #if defined(ASTCENC_DIAGNOSTICS)
@@ -2171,6 +2174,19 @@ void physical_to_symbolic(
 	const uint8_t pcb[16],
 	symbolic_compressed_block& scb);
 
+//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
+/**
+ * @brief Rate-distortion optimization main entry.
+ *
+ * @param[inout] buffer   The output compressed buffer.
+ */
+void rate_distortion_optimize(
+	astcenc_context& ctxo,
+	const astcenc_image& image,
+	const astcenc_swizzle& swizzle,
+	uint8_t* buffer);
+//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
+
 /* ============================================================================
 Platform-specific functions.
 ============================================================================ */
diff --git a/Source/astcenc_internal_entry.h b/Source/astcenc_internal_entry.h
index c283c5acc..6092a52a2 100644
--- a/Source/astcenc_internal_entry.h
+++ b/Source/astcenc_internal_entry.h
@@ -164,13 +164,15 @@ class ParallelManager
 	 * @param init_func   Callable which executes the stage initialization. It must return the
 	 *                    total number of tasks in the stage.
 	 */
-	void init(std::function<unsigned int(void)> init_func)
+	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
+	void init(std::function<unsigned int(void)> init_func, astcenc_progress_callback callback = nullptr)
 	{
 		std::lock_guard<std::mutex> lck(m_lock);
 		if (!m_init_done)
 		{
 			m_task_count = init_func();
 			m_init_done = true;
+			if (callback) m_callback = callback; //SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
 		}
 	}
 
@@ -322,6 +324,11 @@ struct astcenc_context
 
 	/** @brief The parallel manager for compression. */
 	ParallelManager manage_compress;
+
+	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
+	/** @brief The parallel manager for rate-distortion optimization. */
+	ParallelManager manage_rdo;
+	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
 #endif
 
 	/** @brief The parallel manager for decompression. */
diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
new file mode 100644
index 000000000..5facc9ea9
--- /dev/null
+++ b/Source/astcenc_rate_distortion.cpp
@@ -0,0 +1,378 @@
+//Copyright Tencent Timi-J1&F1 Studio, Inc. All Rights Reserved
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+#include "astcenc_internal_entry.h"
+#include "ert.h"
+
+/** @brief Shared state for one rate-distortion optimization pass over an image. */
+struct astcenc_rdo_context
+{
+	ert::reduce_entropy_params m_ert_params; // Parameters handed to ert::reduce_entropy
+	std::atomic<uint32_t> m_total_modified { 0 }; // Summed across worker threads, so it must start at zero
+	uint32_t m_xblocks = 0; // Block counts along each image axis
+	uint32_t m_yblocks = 0;
+	uint32_t m_zblocks = 0;
+	uint32_t m_total_blocks = 0;
+	uint32_t m_image_dimx = 0; // Image size in texels
+	uint32_t m_image_dimy = 0;
+	uint32_t m_image_dimz = 0;
+	uint32_t m_bytes_per_texel = 0;
+	uint32_t m_component_count = 0;
+	astcenc_swizzle m_swizzle; // Swizzle applied when decoded blocks are unpacked
+	// Source texels repacked block by block; texels outside the image hold ASTCENC_RDO_PADDING_PIXEL
+	std::vector<uint8_t> m_block_pixels;
+};
+
+static constexpr uint8_t ASTCENC_RDO_PADDING_PIXEL = 0xff; // Byte value written to texels that lie outside the image
+static constexpr uint32_t ASTCENC_BYTES_PER_BLOCK = 16; // Size of one compressed block in the output buffer
+
+template<typename T, typename U> T lerp(T a, T b, U s) { return (T)(a + (b - a) * s); } // Linear interpolation from a (s == 0) to b (s == 1)
+
+/**
+ * @brief Print a text progress bar for the RDO pass to stdout, with a banner before the first
+ *        update of each run (a run is treated as finished once it reports exactly 100%).
+ * @param value   The completion percentage; the bar fill is limited to the range [0, 100].
+ */
+extern "C" void rdo_progress_emitter(float value)
+{
+	static float previous_value = 100.0f;
+	const uint32_t bar_size = 25;
+
+	// Limit the fill count so an out-of-range (or NaN) percentage cannot write beyond the buffer
+	uint32_t parts = 0;
+	if (value >= 100.0f) parts = bar_size;
+	else if (value > 0.0f) parts = static_cast<uint32_t>(value / 4.0f);
+
+	char buffer[bar_size + 3];
+	buffer[0] = '[';
+	for (uint32_t i = 0; i < bar_size; i++)
+	{
+		buffer[i + 1] = (i < parts) ? '=' : ' ';
+	}
+	buffer[bar_size + 1] = ']';
+	buffer[bar_size + 2] = '\0';
+
+	if (previous_value == 100.0f)
+	{
+		printf("\n\n");
+		printf("Rate-distortion optimization\n");
+		printf("============================\n\n");
+	}
+	previous_value = value;
+	printf("    Progress: %s %03.1f%%\r", buffer, static_cast<double>(value));
+	fflush(stdout);
+}
+
+/**
+ * @brief Create the RDO context: repack the source texels block by block and set up the ERT parameters.
+ *
+ * Texels outside the image are filled with ASTCENC_RDO_PADDING_PIXEL. The block index restarts for
+ * every Z slice, so only single-slice images are laid out correctly; the caller skips all others.
+ * @param[inout] ctx       The context that receives the newly allocated rdo_context.
+ * @param        image     The uncompressed source image.
+ * @param        swizzle   The swizzle stored for use when decoded blocks are unpacked.
+ * @return The total number of blocks in the image.
+ */
+static uint32_t init_rdo_context(
+	astcenc_contexti& ctx,
+	const astcenc_image& image,
+	const astcenc_swizzle& swizzle
+) {
+	ctx.rdo_context = new astcenc_rdo_context;
+	astcenc_rdo_context& rdo_ctx = *ctx.rdo_context;
+	uint32_t block_dim_x = ctx.bsd->xdim;
+	uint32_t block_dim_y = ctx.bsd->ydim;
+	uint32_t block_dim_z = ctx.bsd->zdim;
+	uint32_t xblocks = (image.dim_x + block_dim_x - 1u) / block_dim_x;
+	uint32_t yblocks = (image.dim_y + block_dim_y - 1u) / block_dim_y;
+	uint32_t zblocks = (image.dim_z + block_dim_z - 1u) / block_dim_z;
+	uint32_t total_blocks = xblocks * yblocks * zblocks;
+
+	uint32_t bytes_per_texel = 4u;
+	if (image.data_type == ASTCENC_TYPE_F16) bytes_per_texel *= 2;
+	else if (image.data_type == ASTCENC_TYPE_F32) bytes_per_texel *= 4;
+	// Sizes and offsets are computed as size_t so that large images cannot wrap 32-bit arithmetic
+	const size_t block_row_bytes = static_cast<size_t>(block_dim_x) * bytes_per_texel;
+	const size_t image_row_bytes = static_cast<size_t>(image.dim_x) * bytes_per_texel;
+	const size_t block_bytes = block_row_bytes * block_dim_y * block_dim_z;
+	rdo_ctx.m_block_pixels.resize(static_cast<size_t>(total_blocks) * ctx.bsd->texel_count * bytes_per_texel);
+	for (uint32_t block_z = 0; block_z < zblocks; ++block_z)
+	{
+		const uint32_t valid_z = astc::min(image.dim_z - block_z * block_dim_z, block_dim_z);
+		const uint32_t padding_z = block_dim_z - valid_z;
+		for (uint32_t offset_z = 0; offset_z < valid_z; offset_z++)
+		{
+			const auto* src_data = static_cast<const uint8_t*>(image.data[block_z * block_dim_z + offset_z]);
+			for (uint32_t block_y = 0, block_idx = 0; block_y < yblocks; ++block_y)
+			{
+				const uint8_t* src_block_row = src_data + block_y * block_dim_y * image_row_bytes;
+				const uint32_t valid_y = astc::min(image.dim_y - block_y * block_dim_y, block_dim_y);
+				const uint32_t padding_y = block_dim_y - valid_y;
+				for (uint32_t block_x = 0; block_x < xblocks; ++block_x, ++block_idx)
+				{
+					const uint8_t* src_block = src_block_row + block_x * block_row_bytes;
+					const uint32_t valid_x = astc::min(image.dim_x - block_x * block_dim_x, block_dim_x);
+					const uint32_t padding_x = block_dim_x - valid_x;
+					uint8_t* dst_block = rdo_ctx.m_block_pixels.data() + block_idx * block_bytes;
+					for (uint32_t offset_y = 0; offset_y < valid_y; ++offset_y)
+					{
+						memcpy(dst_block, src_block, valid_x * bytes_per_texel);
+						if (padding_x) memset(dst_block + valid_x * bytes_per_texel, ASTCENC_RDO_PADDING_PIXEL, padding_x * bytes_per_texel);
+						src_block += image_row_bytes;
+						dst_block += block_row_bytes;
+					}
+					const size_t padding_y_bytes = padding_y * block_row_bytes;
+					if (padding_y_bytes) memset(dst_block, ASTCENC_RDO_PADDING_PIXEL, padding_y_bytes);
+					dst_block += padding_y_bytes;
+					if (padding_z) memset(dst_block, ASTCENC_RDO_PADDING_PIXEL, padding_z * block_dim_y * block_row_bytes);
+				}
+			}
+		}
+	}
+
+	// Generate quality parameters
+	ert::reduce_entropy_params& ert_params = rdo_ctx.m_ert_params;
+	ert_params.m_lambda = astc::clamp(ctx.config.rdo_level, 0.0f, 1.0f);
+	ert_params.m_color_weights[0] = static_cast<uint32_t>(ctx.config.cw_r_weight * 255.0f);
+	ert_params.m_color_weights[1] = static_cast<uint32_t>(ctx.config.cw_g_weight * 255.0f);
+	ert_params.m_color_weights[2] = static_cast<uint32_t>(ctx.config.cw_b_weight * 255.0f);
+	ert_params.m_color_weights[3] = static_cast<uint32_t>(ctx.config.cw_a_weight * 255.0f);
+	ert_params.m_lookback_window_size = astc::min(ctx.config.rdo_lookback, total_blocks) * ASTCENC_BYTES_PER_BLOCK;
+	ert_params.m_try_two_matches = true;
+
+	rdo_ctx.m_image_dimx = image.dim_x;
+	rdo_ctx.m_image_dimy = image.dim_y;
+	rdo_ctx.m_image_dimz = image.dim_z;
+	rdo_ctx.m_xblocks = xblocks;
+	rdo_ctx.m_yblocks = yblocks;
+	rdo_ctx.m_zblocks = zblocks;
+	rdo_ctx.m_total_blocks = total_blocks;
+	rdo_ctx.m_bytes_per_texel = bytes_per_texel;
+	rdo_ctx.m_component_count = 4;
+	rdo_ctx.m_swizzle = swizzle;
+	return rdo_ctx.m_total_blocks;
+}
+
+/**
+ * @brief Store a decoded block as RGBA8 texels in block order, applying the output swizzle.
+ *
+ * Texels beyond the image edge are filled with ASTCENC_RDO_PADDING_PIXEL.
+ *
+ * @param[out] output    The RGBA8 destination, sized for one whole block.
+ * @param      x_start   The block origin in the image (likewise y_start, z_start), in texels.
+ * @param      x_size    The image size (likewise y_size, z_size), in texels.
+ * @param[out] x_count   The number of block columns (y_count: rows) that lie inside the image.
+ *
+ * @return @c false if the block holds NaN-encoded error texels, @c true otherwise.
+ */
+static bool store_image_block_u8(
+	uint8_t* output,
+	const image_block& blk,
+	const block_size_descriptor& bsd,
+	uint32_t x_start,
+	uint32_t y_start,
+	uint32_t z_start,
+	uint32_t x_size,
+	uint32_t y_size,
+	uint32_t z_size,
+	const astcenc_swizzle& swz,
+	uint32_t& x_count,
+	uint32_t& y_count
+) {
+	uint32_t x_end = astc::min(x_size, x_start + bsd.xdim);
+	x_count = x_end - x_start;
+	uint32_t x_nudge = bsd.xdim - x_count;
+	uint32_t y_end = astc::min(y_size, y_start + bsd.ydim);
+	y_count = y_end - y_start;
+	uint32_t y_nudge = (bsd.ydim - y_count) * bsd.xdim;
+	uint32_t z_end = astc::min(z_size, z_start + bsd.zdim);
+
+	// True if any non-identity swizzle
+	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
+					 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
+
+	// True if any swizzle uses Z reconstruct
+	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
+				   (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
+	// idx counts texels across the whole block, slices included, so it alone addresses blk and output
+	int idx = 0;
+	for (uint32_t z = z_start; z < z_end; z++)
+	{
+		for (uint32_t y = y_start; y < y_end; ++y)
+		{
+			for (uint32_t x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
+			{
+				uint32_t max_texels = ASTCENC_SIMD_WIDTH;
+				uint32_t used_texels = astc::min(x_count - x, max_texels);
+				// Unaligned load as rows are not always SIMD_WIDTH long
+				vfloat data_r(blk.data_r + idx);
+				vfloat data_g(blk.data_g + idx);
+				vfloat data_b(blk.data_b + idx);
+				vfloat data_a(blk.data_a + idx);
+				// Errors are NaN encoded - abort this sample immediately
+				// Branch is OK here - it is almost never true so predicts well
+				vmask nan_mask = data_r != data_r;
+				if (any(nan_mask))
+				{
+					return false;
+				}
+				vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
+				vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
+				vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
+				vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
+				if (needs_swz)
+				{
+					vint swizzle_table[7];
+					swizzle_table[ASTCENC_SWZ_0] = vint(0);
+					swizzle_table[ASTCENC_SWZ_1] = vint(255);
+					swizzle_table[ASTCENC_SWZ_R] = data_ri;
+					swizzle_table[ASTCENC_SWZ_G] = data_gi;
+					swizzle_table[ASTCENC_SWZ_B] = data_bi;
+					swizzle_table[ASTCENC_SWZ_A] = data_ai;
+					if (needs_z)
+					{
+						vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
+						vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
+						vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
+						data_z = max(data_z, 0.0f);
+						data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
+						swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
+					}
+					data_ri = swizzle_table[swz.r];
+					data_gi = swizzle_table[swz.g];
+					data_bi = swizzle_table[swz.b];
+					data_ai = swizzle_table[swz.a];
+				}
+
+				vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
+				vmask store_mask = vint::lane_id() < vint(used_texels);
+				store_lanes_masked(output + idx * 4, data_rgbai, store_mask);
+				idx += used_texels;
+			}
+
+			if (x_nudge)
+			{
+				memset(output + idx * 4, ASTCENC_RDO_PADDING_PIXEL, x_nudge * 4);
+				idx += x_nudge;
+			}
+		}
+
+		if (y_nudge)
+		{
+			memset(output + idx * 4, ASTCENC_RDO_PADDING_PIXEL, y_nudge * 4);
+			idx += y_nudge;
+		}
+	}
+
+	return true;
+}
+
+struct local_rdo_context // Per-task user data handed to unpack_block through ert::reduce_entropy
+{
+	const astcenc_contexti* ctx; // Codec context providing the block size descriptor, config and rdo_context
+	uint32_t base_block_idx = 0; // Image-wide index of the first block in the current task
+};
+
+/**
+ * @brief ERT unpack callback: decode one physical block into RGBA8 texels in block order.
+ *
+ * @param      local_block_idx   The block index relative to the first block of the current task.
+ * @param[out] active_x          The number of block columns (active_y: rows) inside the image.
+ * @param      user_data         The local_rdo_context of the current task.
+ * @return @c false for an error block or NaN-encoded error texels, @c true otherwise.
+ */
+static bool unpack_block(
+	const void* block,
+	ert::color_rgba* pixels,
+	uint32_t local_block_idx,
+	uint32_t& active_x,
+	uint32_t& active_y,
+	void* user_data
+) {
+	const local_rdo_context& task = *static_cast<const local_rdo_context*>(user_data);
+	const astcenc_contexti& ctx = *task.ctx;
+	const astcenc_rdo_context& rdo_ctx = *ctx.rdo_context;
+
+	// Convert the task-relative index into block coordinates and then into a texel origin
+	const uint32_t block_idx = task.base_block_idx + local_block_idx;
+	assert(block_idx < rdo_ctx.m_xblocks * rdo_ctx.m_yblocks);
+	const uint32_t block_y = block_idx / rdo_ctx.m_xblocks;
+	const uint32_t block_x = block_idx - block_y * rdo_ctx.m_xblocks;
+	assert(block_y * rdo_ctx.m_xblocks + block_x == block_idx);
+	const uint32_t x_start = block_x * ctx.bsd->xdim;
+	const uint32_t y_start = block_y * ctx.bsd->ydim;
+
+	uint8_t pcb[ASTCENC_BYTES_PER_BLOCK];
+	memcpy(pcb, block, sizeof(pcb));
+	symbolic_compressed_block scb;
+	physical_to_symbolic(*ctx.bsd, pcb, scb);
+	if (scb.block_type == SYM_BTYPE_ERROR) return false;
+
+	image_block blk;
+	decompress_symbolic_block(ctx.config.profile, *ctx.bsd, x_start, y_start, 0, scb, blk);
+	return store_image_block_u8(reinterpret_cast<uint8_t*>(pixels), blk, *ctx.bsd, x_start, y_start, 0,
+		rdo_ctx.m_image_dimx, rdo_ctx.m_image_dimy, 1, rdo_ctx.m_swizzle, active_x, active_y);
+}
+
+/**
+ * @brief Run the rate-distortion optimization pass over the compressed blocks in @c buffer.
+ * Returns without doing anything unless rdo_level is non-zero, the image holds a single slice of
+ * 8-bit data, and the block footprint is 2D. Tasks are shared between all threads that call this.
+ */
+void rate_distortion_optimize(
+	astcenc_context& ctxo,
+	const astcenc_image& image,
+	const astcenc_swizzle& swizzle,
+	uint8_t* buffer
+) {
+	// The block texel store and the unpack callback only handle one slice of 8-bit texels, so 3D
+	// block footprints must be skipped as well as 3D or non-U8 images
+	const block_size_descriptor& bsd = *ctxo.context.bsd;
+	if (ctxo.context.config.rdo_level == 0.0f || image.data_type != ASTCENC_TYPE_U8 ||
+	    image.dim_z != 1 || bsd.zdim != 1)
+	{
+		return;
+	}
+
+	// Only the first thread actually runs the initializer
+	ctxo.manage_rdo.init([&ctxo, &image, &swizzle]
+		{
+			return init_rdo_context(ctxo.context, image, swizzle);
+		},
+		ctxo.context.config.progress_callback ? rdo_progress_emitter : nullptr);
+
+	astcenc_rdo_context& rdo_ctx = *ctxo.context.rdo_context;
+	const size_t block_pixel_bytes = static_cast<size_t>(bsd.xdim) * bsd.ydim * bsd.zdim * rdo_ctx.m_bytes_per_texel;
+	uint32_t blocks_per_task = ctxo.context.config.rdo_lookback;
+	if (!blocks_per_task) blocks_per_task = rdo_ctx.m_total_blocks; // Effectively single-threaded
+
+	uint32_t total_modified = 0;
+	while (true)
+	{
+		uint32_t count;
+		uint32_t base = ctxo.manage_rdo.get_task_assignment(blocks_per_task, count);
+		if (!count)
+		{
+			break;
+		}
+		local_rdo_context local_context{ &ctxo.context, base };
+		ert::reduce_entropy(buffer + static_cast<size_t>(base) * ASTCENC_BYTES_PER_BLOCK, count,
+			ASTCENC_BYTES_PER_BLOCK, ASTCENC_BYTES_PER_BLOCK, bsd.xdim, bsd.ydim, rdo_ctx.m_component_count,
+			reinterpret_cast<const ert::color_rgba*>(rdo_ctx.m_block_pixels.data() + base * block_pixel_bytes),
+			rdo_ctx.m_ert_params, total_modified, unpack_block, &local_context);
+		ctxo.manage_rdo.complete_task_assignment(count);
+	}
+
+	rdo_ctx.m_total_modified.fetch_add(total_modified, std::memory_order_relaxed);
+	// Wait for rdo to complete before freeing memory
+	ctxo.manage_rdo.wait();
+	// Only the first thread to arrive actually runs the term
+	ctxo.manage_rdo.term([&ctxo]
+		{
+			delete ctxo.context.rdo_context;
+			ctxo.context.rdo_context = nullptr;
+		});
+}
+
+#endif
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 39eb586af..e76cb8d18 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1175,6 +1175,30 @@ static int edit_astcenc_config(
 			argidx += 1;
 			cli_config.diagnostic_images = true;
 		}
+		//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
+		else if (!strcmp(argv[argidx], "-rdo-level"))
+		{
+			argidx += 2;
+			if (argidx > argc)
+			{
+				print_error("ERROR: -rdo-level switch with no argument\n");
+				return 1;
+			}
+
+			config.rdo_level = static_cast<float>(atof(argv[argidx - 1]));
+		}
+		else if (!strcmp(argv[argidx], "-rdo-lookback"))
+		{
+			argidx += 2;
+			if (argidx > argc)
+			{
+				print_error("ERROR: -rdo-lookback switch with no argument\n");
+				return 1;
+			}
+
+			config.rdo_lookback = atoi(argv[argidx - 1]);
+		}
+		//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
 		else // check others as well
 		{
 			print_error("ERROR: Argument '%s' not recognized\n", argv[argidx]);
@@ -1265,6 +1289,10 @@ static void print_astcenc_config(
 		printf("    Candidate cutoff:           %u candidates\n", config.tune_candidate_limit);
 		printf("    Refinement cutoff:          %u iterations\n", config.tune_refinement_limit);
 		printf("    Compressor thread count:    %d\n", cli_config.thread_count);
+		//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
+		printf("    Rate-distortion level:      %g\n", static_cast<double>(config.rdo_level));
+		printf("    Rate-distortion lookback:   %u blocks\n", config.rdo_lookback);
+		//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
 		printf("\n");
 	}
 }
diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp
index 71b9a42d0..c660cf2a9 100644
--- a/Source/astcenccli_toplevel_help.cpp
+++ b/Source/astcenccli_toplevel_help.cpp
@@ -386,6 +386,19 @@ ADVANCED COMPRESSION
                -verythorough : 0.98
                -exhaustive   : 0.99
 )"
+//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
+R"(
+       -rdo-level <factor>
+           Rate-distortion optimization level.
+           Larger values push the postprocessor towards optimizing more for lower rate,
+           and smaller values more for distortion. Default to 0, which skips RDO entirely.
+
+       -rdo-lookback <number>
+           Rate-distortion optimization lookback window size in blocks.
+           The larger this value, the slower the encoder but the higher the quality per LZ compressed bit.
+           Default to 64 blocks.
+)"
+//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
 // This split in the literals is needed for Visual Studio; the compiler
 // will concatenate these two strings together ...
 R"(
diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake
index e78eb70bb..c5c2ca9b3 100644
--- a/Source/cmake_core.cmake
+++ b/Source/cmake_core.cmake
@@ -42,6 +42,10 @@ set(is_clang "$<AND:${is_gnu_fe},$<CXX_COMPILER_ID:Clang,AppleClang>>")
 
 add_library(${ASTCENC_TARGET}-static
     STATIC
+        #SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
+        ert.cpp
+        astcenc_rate_distortion.cpp
+        #SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
         astcenc_averages_and_directions.cpp
         astcenc_block_sizes.cpp
         astcenc_color_quantize.cpp
diff --git a/Source/ert.cpp b/Source/ert.cpp
new file mode 100644
index 000000000..21e0c5f13
--- /dev/null
+++ b/Source/ert.cpp
@@ -0,0 +1,705 @@
+#include "ert.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#define ERT_FAVOR_CONT_AND_REP0_MATCHES (1)
+#define ERT_FAVOR_REP0_MATCHES (0)
+
+namespace ert
+{
+	const uint32_t MAX_BLOCK_PIXELS = 12 * 12; // Largest block_width * block_height accepted by reduce_entropy
+	const uint32_t MAX_BLOCK_SIZE_IN_BYTES = 256; // Largest block_size_to_optimize_in_bytes accepted by reduce_entropy
+	const uint32_t MIN_MATCH_LEN = 3; // Smallest block_size_to_optimize_in_bytes accepted by reduce_entropy
+	const float LITERAL_BITS = 13.0f; // NOTE(review): presumably the estimated cost of one literal byte - confirm against its users
+	const float MATCH_CONTINUE_BITS = 1.0f; // NOTE(review): presumably the estimated cost of continuing a match - confirm
+	const float MATCH_REP0_BITS = 4.0f; // NOTE(review): presumably the estimated cost of reusing the last match distance - confirm
+
+	static inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high;	return value; } // Clamp value to [low, high]
+	template<typename F> inline F lerp(F a, F b, F s) { return a + (b - a) * s; } // Linear interpolation from a (s == 0) to b (s == 1)
+
+	static const uint8_t g_tdefl_small_dist_extra[512] =
+	{
+		0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+		6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+		6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+		7, 7, 7, 7, 7, 7, 7, 7
+	};
+
+	static const uint8_t g_tdefl_large_dist_extra[128] =
+	{
+		0, 0, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+		12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+		13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13
+	};
+
+	// Estimate the cost of coding an LZ match (presumably in bits, like the *_BITS constants).
+	// The length cost is 6 to 9 depending on the match length; the distance cost is 5 plus a table
+	// lookup, plus 1 for every halving needed to bring the distance below 32768.
+	static inline uint32_t compute_match_cost_estimate(uint32_t dist, uint32_t match_len_in_bytes)
+	{
+		assert(match_len_in_bytes <= 258);
+
+		uint32_t length_cost;
+		if (match_len_in_bytes < 6)
+			length_cost = 6;
+		else if (match_len_in_bytes < 8)
+			length_cost = 7;
+		else if (match_len_in_bytes < 12)
+			length_cost = 8;
+		else
+			length_cost = 9;
+
+		if (dist < 512)
+			return length_cost + 5 + g_tdefl_small_dist_extra[dist];
+
+		uint32_t distance_cost = 5 + g_tdefl_large_dist_extra[std::min<uint32_t>(dist, 32767) >> 8];
+		for (; dist >= 32768; dist >>= 1)
+			distance_cost++;
+
+		return length_cost + distance_cost;
+	}
+
+	// Running count, sum and sum of squares of a series of values, from which mean and spread are derived.
+	class tracked_stat
+	{
+	public:
+		tracked_stat() { clear(); }
+		void clear() { m_num = 0; m_total = 0; m_total2 = 0; }
+		// Square in 64 bits: val * val wraps in 32-bit arithmetic once val exceeds 65535
+		void update(uint32_t val) { m_num++; m_total += val; m_total2 += static_cast<uint64_t>(val) * val; }
+		tracked_stat& operator += (uint32_t val) { update(val); return *this; }
+
+		uint32_t get_number_of_values() const { return m_num; }
+		uint64_t get_total() const { return m_total; }
+		uint64_t get_total2() const { return m_total2; }
+
+		float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; }
+		// Population standard deviation: sqrt(N * sum(x^2) - sum(x)^2) / N
+		float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; }
+		float get_variance() const { float s = get_std_dev(); return s * s; }
+
+	private:
+		uint32_t m_num;
+		uint64_t m_total;
+		uint64_t m_total2;
+	};
+
+	// Return the largest per-component standard deviation over the top-left active_x by active_y
+	// pixels of a block whose rows are block_width pixels apart. Only the first num_comps
+	// components of each pixel are examined.
+	static inline float compute_block_max_std_dev(const color_rgba* pPixels, uint32_t block_width, uint32_t active_x, uint32_t active_y, uint32_t num_comps)
+	{
+		tracked_stat channel_stats[4];
+
+		for (uint32_t row = 0; row < active_y; row++)
+		{
+			const color_rgba* pRow = pPixels + row * block_width;
+			for (uint32_t col = 0; col < active_x; col++)
+				for (uint32_t c = 0; c < num_comps; c++)
+					channel_stats[c] += pRow[col].m_c[c];
+		}
+
+		float largest = 0.0f;
+		for (uint32_t c = 0; c < num_comps; c++)
+			largest = std::max(largest, channel_stats[c].get_std_dev());
+		return largest;
+	}
+
+	static inline float compute_block_mse(const color_rgba* pPixelsA, const color_rgba* pPixelsB, uint32_t block_width, uint32_t block_height, uint32_t total_block_pixels, uint32_t num_comps, const uint32_t weights[4], float one_over_total_color_weight)
+	{
+		uint64_t total_err = 0;
+		// Sum weights[c] * (a - b)^2 over every pixel of the block; the 4x4 branches are unrolled special cases of the generic loop at the end
+		if ((block_width == 4) && (block_height == 4) && (num_comps == 4))
+		{
+			if ((weights[0] == 1) && (weights[1] == 1) && (weights[2] == 1) && (weights[3] == 1)) // Unit weights: the multiplies can be skipped
+			{
+				for (uint32_t i = 0; i < 16; i++)
+				{
+					const color_rgba* pA = pPixelsA + i;
+					const color_rgba* pB = pPixelsB + i;
+
+					const int dr = pA->m_c[0] - pB->m_c[0];
+					const int dg = pA->m_c[1] - pB->m_c[1];
+					const int db = pA->m_c[2] - pB->m_c[2];
+					const int da = pA->m_c[3] - pB->m_c[3];
+
+					total_err += dr * dr + dg * dg + db * db + da * da;
+				}
+			}
+			else
+			{
+				for (uint32_t i = 0; i < 16; i++)
+				{
+					const color_rgba* pA = pPixelsA + i;
+					const color_rgba* pB = pPixelsB + i;
+
+					const int dr = pA->m_c[0] - pB->m_c[0];
+					const int dg = pA->m_c[1] - pB->m_c[1];
+					const int db = pA->m_c[2] - pB->m_c[2];
+					const int da = pA->m_c[3] - pB->m_c[3];
+
+					total_err += weights[0] * dr * dr + weights[1] * dg * dg + weights[2] * db * db + weights[3] * da * da;
+				}
+			}
+		}
+		else if ((block_width == 4) && (block_height == 4) && (num_comps == 3)) // 4x4 block, first three components only
+		{
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				const uint32_t y_ofs = y * 4;
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const color_rgba* pA = pPixelsA + x + y_ofs;
+					const color_rgba* pB = pPixelsB + x + y_ofs;
+
+					const int dr = pA->m_c[0] - pB->m_c[0];
+					const int dg = pA->m_c[1] - pB->m_c[1];
+					const int db = pA->m_c[2] - pB->m_c[2];
+
+					total_err += weights[0] * dr * dr + weights[1] * dg * dg + weights[2] * db * db;
+				}
+			}
+		}
+		else if ((block_width == 4) && (block_height == 4) && (num_comps == 2)) // 4x4 block, first two components only
+		{
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				const uint32_t y_ofs = y * 4;
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const color_rgba* pA = pPixelsA + x + y_ofs;
+					const color_rgba* pB = pPixelsB + x + y_ofs;
+
+					const int dr = pA->m_c[0] - pB->m_c[0];
+					const int dg = pA->m_c[1] - pB->m_c[1];
+
+					total_err += weights[0] * dr * dr + weights[1] * dg * dg;
+				}
+			}
+		}
+		else if ((block_width == 4) && (block_height == 4) && (num_comps == 1)) // 4x4 block, first component only
+		{
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				const uint32_t y_ofs = y * 4;
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const color_rgba* pA = pPixelsA + x + y_ofs;
+					const color_rgba* pB = pPixelsB + x + y_ofs;
+
+					const int dr = pA->m_c[0] - pB->m_c[0];
+
+					total_err += weights[0] * dr * dr;
+				}
+			}
+		}
+		else // Generic path: any block size and component count
+		{
+			for (uint32_t y = 0; y < block_height; y++)
+			{
+				const uint32_t y_ofs = y * block_width;
+				for (uint32_t x = 0; x < block_width; x++)
+				{
+					const color_rgba* pA = pPixelsA + x + y_ofs;
+					const color_rgba* pB = pPixelsB + x + y_ofs;
+
+					for (uint32_t c = 0; c < num_comps; c++)
+					{
+						const int d = pA->m_c[c] - pB->m_c[c];
+						total_err += weights[c] * d * d;
+					}
+				}
+			}
+		}
+
+		return total_err * (one_over_total_color_weight / total_block_pixels); // Normalize by the total color weight and by total_block_pixels
+	}	
+
+	// Hash len bytes at pBuf in the manner of Paul Hsieh's SuperFastHash, with salt mixed into the
+	// seed; returns 0 for a null or empty buffer. 16-bit words are loaded in host byte order with
+	// memcpy, so pBuf needs no particular alignment.
+	uint32_t hash_hsieh(const uint8_t* pBuf, size_t len, uint32_t salt)
+	{
+		if (!pBuf || !len)
+			return 0;
+
+		uint32_t h = static_cast<uint32_t>(len + (salt << 16));
+		const uint32_t bytes_left = len & 3;
+		len >>= 2;
+		uint16_t words[2] = { 0, 0 };
+		while (len--)
+		{
+			memcpy(words, pBuf, sizeof(words));
+			h += words[0];
+			const uint32_t t = (words[1] << 11) ^ h;
+			h = (h << 16) ^ t;
+			pBuf += sizeof(uint32_t);
+			h += h >> 11;
+		}
+
+		switch (bytes_left)
+		{
+		case 1:
+			h += *reinterpret_cast<const signed char*>(pBuf);
+			h ^= h << 10;
+			h += h >> 1;
+			break;
+		case 2:
+			memcpy(words, pBuf, sizeof(uint16_t));
+			h += words[0];
+			h ^= h << 11;
+			h += h >> 17;
+			break;
+		case 3:
+			memcpy(words, pBuf, sizeof(uint16_t));
+			h += words[0];
+			h ^= h << 16;
+			// Shift as unsigned: left-shifting a negative value is undefined before C++20
+			h ^= static_cast<uint32_t>(static_cast<signed char>(pBuf[sizeof(uint16_t)])) << 18;
+			h += h >> 11;
+			break;
+		default:
+			break;
+		}
+
+		h ^= h << 3;
+		h += h >> 5;
+		h ^= h << 4;
+		h += h >> 17;
+		h ^= h << 25;
+		h += h >> 6;
+		return h;
+	}
+
+	// BC7 entropy reduction transform with Deflate/LZMA/LZHAM optimizations. For each block, tries overwriting byte runs with bytes copied from earlier blocks and keeps the substitution with the lowest mse * scale + estimated_bits * lambda; bumps total_modified once per rewritten block. Returns false on rejected parameters or when a block's initial unpack fails.
+	bool reduce_entropy(void* pBlocks, uint32_t num_blocks,
+		uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes, uint32_t block_width, uint32_t block_height, uint32_t num_comps,
+		const color_rgba* pBlock_pixels, const reduce_entropy_params& params, uint32_t& total_modified,
+		pUnpack_block_func pUnpack_block_func, void* pUnpack_block_func_user_data,
+		std::vector<float>* pBlock_mse_scales)
+	{
+		assert(total_block_stride_in_bytes && block_size_to_optimize_in_bytes);
+		assert(total_block_stride_in_bytes >= block_size_to_optimize_in_bytes);
+		
+		assert(num_comps >= 1 && num_comps <= 4);
+		for (uint32_t i = num_comps; i < 4; i++) // Components past num_comps must carry zero weight.
+		{
+			assert(!params.m_color_weights[i]);
+			if (params.m_color_weights[i])
+				return false;
+		}
+
+		const uint32_t total_color_weight = params.m_color_weights[0] + params.m_color_weights[1] + params.m_color_weights[2] + params.m_color_weights[3];
+		assert(total_color_weight);
+		const float one_over_total_color_weight = 1.0f / total_color_weight;
+
+		assert((block_size_to_optimize_in_bytes >= MIN_MATCH_LEN) && (block_size_to_optimize_in_bytes <= MAX_BLOCK_SIZE_IN_BYTES));
+		if ((block_size_to_optimize_in_bytes < MIN_MATCH_LEN) || (block_size_to_optimize_in_bytes > MAX_BLOCK_SIZE_IN_BYTES))
+			return false;
+
+		uint8_t* pBlock_bytes = (uint8_t*)pBlocks;
+
+		const uint32_t total_block_pixels = block_width * block_height;
+		if (total_block_pixels > MAX_BLOCK_PIXELS)
+			return false;
+
+		const int total_blocks_to_check = std::max<uint32_t>(1U, params.m_lookback_window_size / total_block_stride_in_bytes); // Lookback window in whole blocks, at least one.
+
+		std::vector<uint32_t> len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1);
+		std::vector<uint32_t> second_len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1);
+		uint32_t total_second_matches = 0;
+
+		int prev_match_window_ofs_to_favor_cont = -1, prev_match_dist_to_favor = -1; // -1 means there is nothing to favor.
+				
+		uint32_t total_smooth_blocks = 0;
+
+		const uint32_t HASH_SIZE = 8192;
+		uint32_t hash[HASH_SIZE]; // Candidate de-duplication table, consulted by the hash_check tests below.
+				
+		for (uint32_t block_index = 0; block_index < num_blocks; block_index++)
+		{
+			if ((block_index & 0xFF) == 0) // Entries are tagged with block_index & 0xFF, so the table is cleared every 256 blocks.
+				memset(hash, 0, sizeof(hash));
+
+			uint8_t* pOrig_block = &pBlock_bytes[block_index * total_block_stride_in_bytes];
+			const color_rgba* pPixels = &pBlock_pixels[block_index * total_block_pixels];
+			uint32_t active_x = 0, active_y = 0;
+
+			color_rgba decoded_block[MAX_BLOCK_PIXELS];
+			if (!(*pUnpack_block_func)(pOrig_block, decoded_block, block_index, active_x, active_y, pUnpack_block_func_user_data)) // The callback also fills in active_x/active_y.
+				return false;
+
+			uint32_t current_block_pixels = active_x * active_y;
+			float cur_mse = compute_block_mse(pPixels, decoded_block, block_width, block_height, current_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight);
+
+			if ((params.m_skip_zero_mse_blocks) && (cur_mse == 0.0f)) // Optionally leave blocks with zero current error untouched.
+				continue;
+
+			const float max_std_dev = compute_block_max_std_dev(pPixels, block_width, active_x, active_y, num_comps);
+			
+			float yl = clampf(max_std_dev / params.m_max_smooth_block_std_dev, 0.0f, 1.0f);
+			yl = yl * yl;
+			float smooth_block_mse_scale = lerp(params.m_smooth_block_max_mse_scale, 1.0f, yl); // NOTE(review): presumably lerp(a, b, t) blends from a at t=0 to b at t=1 - confirm.
+
+			if (pBlock_mse_scales)
+			{
+				if ((*pBlock_mse_scales)[block_index] > 0.0f) // A positive caller-supplied scale overrides the computed one.
+				{
+					smooth_block_mse_scale = (*pBlock_mse_scales)[block_index];
+				}
+			}
+			
+			if (smooth_block_mse_scale > 1.0f)
+				total_smooth_blocks++;
+						
+			float cur_bits = (LITERAL_BITS * block_size_to_optimize_in_bytes); // Baseline rate: every byte coded as a literal.
+			float cur_t = cur_mse * smooth_block_mse_scale + cur_bits * params.m_lambda;
+
+			int first_block_to_check = std::max<int>(0, block_index - total_blocks_to_check);
+			int last_block_to_check = block_index - 1;
+
+			uint8_t best_block[MAX_BLOCK_SIZE_IN_BYTES];
+			memcpy(best_block, pOrig_block, block_size_to_optimize_in_bytes);
+
+			float best_t = cur_t;
+			uint32_t best_match_len = 0, best_match_src_window_ofs = 0, best_match_dst_window_ofs = 0, best_match_src_block_ofs = 0, best_match_dst_block_ofs = 0;
+			float best_match_bits = 0;
+
+			// Don't let thresh_ms_err be 0 to let zero error blocks have slightly increased distortion
+			const float thresh_ms_err = params.m_max_allowed_rms_increase_ratio * params.m_max_allowed_rms_increase_ratio * std::max(cur_mse, 1.0f);
+			
+			for (int prev_block_index = last_block_to_check; prev_block_index >= first_block_to_check; --prev_block_index) // Nearest previous block first.
+			{
+				const uint8_t* pPrev_blk = &pBlock_bytes[prev_block_index * total_block_stride_in_bytes];
+
+				for (uint32_t len = block_size_to_optimize_in_bytes; len >= MIN_MATCH_LEN; len--) // Longest candidate matches first.
+				{
+					if (params.m_allow_relative_movement) // Source and destination offsets within the block may differ.
+					{
+						for (uint32_t src_ofs = 0; src_ofs <= (block_size_to_optimize_in_bytes - len); src_ofs++)
+						{
+							assert(len + src_ofs <= block_size_to_optimize_in_bytes);
+							
+							const uint32_t src_match_window_ofs = prev_block_index * total_block_stride_in_bytes + src_ofs;
+
+							for (uint32_t dst_ofs = 0; dst_ofs <= (block_size_to_optimize_in_bytes - len); dst_ofs++)
+							{
+								assert(len + dst_ofs <= block_size_to_optimize_in_bytes);
+								
+								const uint32_t dst_match_window_ofs = block_index * total_block_stride_in_bytes + dst_ofs;
+
+								const uint32_t match_dist = dst_match_window_ofs - src_match_window_ofs;
+																
+								float trial_match_bits, trial_total_bits;
+
+								uint32_t hs = hash_hsieh(pPrev_blk + src_ofs, len, dst_ofs); // Hash of the source bytes, salted with the destination offset.
+
+#if ERT_FAVOR_CONT_AND_REP0_MATCHES
+								// Continue a previous match (which would cross block boundaries)
+								if (((int)src_match_window_ofs == prev_match_window_ofs_to_favor_cont) && (dst_ofs == 0))
+								{
+									trial_match_bits = MATCH_CONTINUE_BITS;
+									trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_CONTINUE_BITS;
+								}
+								// Exploit REP0 matches
+								else if ((prev_match_dist_to_favor != -1) && (src_match_window_ofs == (dst_match_window_ofs - prev_match_dist_to_favor)))
+								{
+									trial_match_bits = MATCH_REP0_BITS;
+									trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_REP0_BITS;
+								}
+								else
+								{
+									trial_match_bits = (float)compute_match_cost_estimate(match_dist, len);
+									trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + trial_match_bits;
+										
+									uint32_t hash_check = hash[hs & (HASH_SIZE - 1)];
+									if ((hash_check & 0xFF) == (block_index & 0xFF)) // Entry was written while processing this block...
+									{
+										if ((hash_check >> 8) == (hs >> 8)) // ...with the same upper hash bits: treat as already tried.
+											continue;
+									}
+								}
+#else
+								uint32_t hash_check = hash[hs & (HASH_SIZE - 1)];
+								if ((hash_check & 0xFF) == (block_index & 0xFF))
+								{
+									if ((hash_check >> 8) == (hs >> 8))
+										continue;
+								}
+#endif
+
+								hash[hs & (HASH_SIZE - 1)] = (hs & 0xFFFFFF00) | (block_index & 0xFF); // Record this candidate for the current block.
+
+								const float trial_total_bits_times_lambda = trial_total_bits * params.m_lambda;
+								
+								uint8_t trial_block[MAX_BLOCK_SIZE_IN_BYTES];
+								memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes);
+								memcpy(trial_block + dst_ofs, pPrev_blk + src_ofs, len); // Splice the candidate bytes into a copy of the block.
+
+								color_rgba decoded_trial_block[MAX_BLOCK_PIXELS];
+								if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, active_x, active_y, pUnpack_block_func_user_data))
+									continue; // The unpack callback rejected the trial block.
+
+								float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, current_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight);
+
+								if (trial_mse < thresh_ms_err) // Only trials under the distortion ceiling are considered.
+								{
+									float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda;
+
+									if (t < best_t)
+									{
+										best_t = t;
+										memcpy(best_block, trial_block, block_size_to_optimize_in_bytes);
+										best_match_len = len;
+										best_match_src_window_ofs = src_match_window_ofs;
+										best_match_dst_window_ofs = dst_match_window_ofs;
+										best_match_src_block_ofs = src_ofs;
+										best_match_dst_block_ofs = dst_ofs;
+										best_match_bits = trial_match_bits;
+									}
+								}
+
+							} // dst_ofs
+						} // src_ofs
+					}
+					else // Source and destination use the same offset within their blocks.
+					{
+						const uint32_t match_dist = (block_index - prev_block_index) * total_block_stride_in_bytes;
+
+						// Assume the block has 1 match and block_size_to_optimize_in_bytes-match_len literals.
+						const float trial_match_bits = (float)compute_match_cost_estimate(match_dist, len);
+						const float trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + trial_match_bits;
+						const float trial_total_bits_times_lambda = trial_total_bits * params.m_lambda;
+
+						for (uint32_t ofs = 0; ofs <= (block_size_to_optimize_in_bytes - len); ofs++)
+						{
+							assert(len + ofs <= block_size_to_optimize_in_bytes);
+							
+							const uint32_t dst_match_window_ofs = block_index * total_block_stride_in_bytes + ofs;
+							const uint32_t src_match_window_ofs = prev_block_index * total_block_stride_in_bytes + ofs;
+
+							float trial_match_bits_to_use = trial_match_bits;
+							float trial_total_bits_times_lambda_to_use = trial_total_bits_times_lambda;
+														
+							uint32_t hs = hash_hsieh(pPrev_blk + ofs, len, ofs);
+
+#if ERT_FAVOR_CONT_AND_REP0_MATCHES
+							// Continue a previous match (which would cross block boundaries)
+							if (((int)src_match_window_ofs == prev_match_window_ofs_to_favor_cont) && (ofs == 0))
+							{
+								float continue_match_trial_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_CONTINUE_BITS;
+								trial_match_bits_to_use = MATCH_CONTINUE_BITS;
+								trial_total_bits_times_lambda_to_use = continue_match_trial_bits * params.m_lambda;
+							}
+							// Exploit REP0 matches
+							else if ((prev_match_dist_to_favor != -1) && (src_match_window_ofs == (dst_match_window_ofs - prev_match_dist_to_favor)))
+							{
+								float continue_match_trial_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_REP0_BITS;
+								trial_match_bits_to_use = MATCH_REP0_BITS;
+								trial_total_bits_times_lambda_to_use = continue_match_trial_bits * params.m_lambda;
+							}
+							else
+							{
+								uint32_t hash_check = hash[hs & (HASH_SIZE - 1)];
+								if ((hash_check & 0xFF) == (block_index & 0xFF))
+								{
+									if ((hash_check >> 8) == (hs >> 8))
+										continue;
+								}
+							}
+#else
+							uint32_t hash_check = hash[hs & (HASH_SIZE - 1)];
+							if ((hash_check & 0xFF) == (block_index & 0xFF))
+							{
+								if ((hash_check >> 8) == (hs >> 8))
+									continue;
+							}
+#endif
+
+							hash[hs & (HASH_SIZE - 1)] = (hs & 0xFFFFFF00) | (block_index & 0xFF);
+
+							uint8_t trial_block[MAX_BLOCK_SIZE_IN_BYTES];
+							memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes);
+							memcpy(trial_block + ofs, pPrev_blk + ofs, len);
+
+							color_rgba decoded_trial_block[MAX_BLOCK_PIXELS];
+							if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, active_x, active_y, pUnpack_block_func_user_data))
+								continue;
+
+							float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, current_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight);
+
+							if (trial_mse < thresh_ms_err)
+							{
+								float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda_to_use;
+								
+								if (t < best_t)
+								{
+									best_t = t;
+									memcpy(best_block, trial_block, block_size_to_optimize_in_bytes);
+									best_match_len = len;
+									best_match_src_window_ofs = src_match_window_ofs;
+									best_match_dst_window_ofs = dst_match_window_ofs;
+									best_match_src_block_ofs = ofs;
+									best_match_dst_block_ofs = ofs;
+									best_match_bits = trial_match_bits_to_use;
+								}
+							}
+						} // ofs
+					}
+
+				} // len
+
+			} // prev_block_index
+
+			if (best_t < cur_t) // Some trial beat the all-literals baseline.
+			{
+				uint32_t best_second_match_len = 0, best_second_match_src_window_ofs = 0, best_second_match_dst_window_ofs = 0, best_second_match_src_block_ofs = 0, best_second_match_dst_block_ofs = 0;
+								
+				// Try injecting a second match, being sure it doesn't overlap with the first.
+				if ((params.m_try_two_matches) && (best_match_len <= (block_size_to_optimize_in_bytes - 3)))
+				{
+					uint8_t matched_flags[MAX_BLOCK_SIZE_IN_BYTES];
+					memset(matched_flags, 0, sizeof(matched_flags));
+					memset(matched_flags + best_match_dst_block_ofs, 1, best_match_len); // Mark the bytes already covered by the first match.
+
+					uint8_t orig_best_block[MAX_BLOCK_SIZE_IN_BYTES];
+					memcpy(orig_best_block, best_block, block_size_to_optimize_in_bytes);
+										
+					for (int prev_block_index = last_block_to_check; prev_block_index >= first_block_to_check; --prev_block_index)
+					{
+						const uint8_t* pPrev_blk = &pBlock_bytes[prev_block_index * total_block_stride_in_bytes];
+
+						const uint32_t match_dist = (block_index - prev_block_index) * total_block_stride_in_bytes;
+
+						for (uint32_t len = 3; len <= (block_size_to_optimize_in_bytes - best_match_len); len++)
+						{
+							const float trial_total_bits = (block_size_to_optimize_in_bytes - len - best_match_len) * LITERAL_BITS + compute_match_cost_estimate(match_dist, len) + best_match_bits;
+
+							const float trial_total_bits_times_lambda = trial_total_bits * params.m_lambda;
+
+							for (uint32_t ofs = 0; ofs <= (block_size_to_optimize_in_bytes - len); ofs++)
+							{
+								int i;
+								for (i = 0; i < (int)len; i++) // Skip placements that touch any byte of the first match.
+									if (matched_flags[ofs + i])
+										break;
+								if (i != (int)len)
+									continue;
+
+								assert(len + ofs <= block_size_to_optimize_in_bytes);
+
+								const uint32_t dst_match_window_ofs = block_index * total_block_stride_in_bytes + ofs;
+								const uint32_t src_match_window_ofs = prev_block_index * total_block_stride_in_bytes + ofs;
+
+								uint8_t trial_block[MAX_BLOCK_SIZE_IN_BYTES];
+								memcpy(trial_block, orig_best_block, block_size_to_optimize_in_bytes);
+								memcpy(trial_block + ofs, pPrev_blk + ofs, len);
+
+								color_rgba decoded_trial_block[MAX_BLOCK_PIXELS];
+								if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, active_x, active_y, pUnpack_block_func_user_data))
+									continue;
+
+								float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, current_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight);
+
+								if (trial_mse < thresh_ms_err)
+								{
+									float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda;
+
+									if (t < best_t)
+									{
+										best_t = t;
+										memcpy(best_block, trial_block, block_size_to_optimize_in_bytes);
+										best_second_match_len = len;
+										best_second_match_src_window_ofs = src_match_window_ofs;
+										best_second_match_dst_window_ofs = dst_match_window_ofs;
+										best_second_match_src_block_ofs = ofs;
+										best_second_match_dst_block_ofs = ofs;
+									}
+								}
+							}
+						}
+					}
+				}
+
+				memcpy(pOrig_block, best_block, block_size_to_optimize_in_bytes); // Commit the winning bytes into the caller's buffer.
+				total_modified++;
+
+				if ((best_second_match_len == 0) || (best_match_dst_window_ofs > best_second_match_dst_window_ofs)) // The first match is the one placed later in the block.
+				{
+					int best_match_dist = best_match_dst_window_ofs - best_match_src_window_ofs;
+					assert(best_match_dist >= 1);
+					(void)best_match_dist;
+
+					if (block_size_to_optimize_in_bytes == total_block_stride_in_bytes)
+					{
+						// If the match goes all the way to the end of a block, we can try to continue it on the next encoded block.
+						if ((best_match_dst_block_ofs + best_match_len) == total_block_stride_in_bytes)
+							prev_match_window_ofs_to_favor_cont = best_match_src_window_ofs + best_match_len;
+						else
+							prev_match_window_ofs_to_favor_cont = -1;
+					}
+
+#if ERT_FAVOR_REP0_MATCHES
+					// Compute the window offset where a cheaper REP0 match would be available
+					prev_match_dist_to_favor = best_match_dist;
+#endif
+				}
+				else // The second match is the one placed later in the block.
+				{
+					int best_match_dist = best_second_match_dst_window_ofs - best_second_match_src_window_ofs;
+					assert(best_match_dist >= 1);
+					(void)best_match_dist;
+
+					if (block_size_to_optimize_in_bytes == total_block_stride_in_bytes)
+					{
+						// If the match goes all the way to the end of a block, we can try to continue it on the next encoded block.
+						if ((best_second_match_dst_block_ofs + best_second_match_len) == total_block_stride_in_bytes)
+							prev_match_window_ofs_to_favor_cont = best_second_match_src_window_ofs + best_second_match_len;
+						else
+							prev_match_window_ofs_to_favor_cont = -1;
+					}
+
+#if ERT_FAVOR_REP0_MATCHES
+					// Compute the window offset where a cheaper REP0 match would be available
+					prev_match_dist_to_favor = best_match_dist;
+#endif
+				}
+
+				len_hist[best_match_len]++;
+
+				if (best_second_match_len)
+				{
+					second_len_hist[best_second_match_len]++;
+					total_second_matches++;
+				}
+			}
+			else
+			{
+				prev_match_window_ofs_to_favor_cont = -1; // Block left unchanged, so there is no match to continue.
+			}
+						
+		} // block_index
+				
+		if (params.m_debug_output) // Optional statistics dump to stdout.
+		{
+			printf("Total smooth blocks: %3.2f%%\n", total_smooth_blocks * 100.0f / num_blocks);
+
+			printf("Match length histogram:\n");
+			for (uint32_t i = MIN_MATCH_LEN; i <= block_size_to_optimize_in_bytes; i++)
+				printf("%u%c", len_hist[i], (i < block_size_to_optimize_in_bytes) ? ',' : '\n');
+
+			printf("Total second matches: %u %3.2f%%\n", total_second_matches, total_second_matches * 100.0f / num_blocks);
+			printf("Secod match length histogram:\n");
+			for (uint32_t i = MIN_MATCH_LEN; i <= block_size_to_optimize_in_bytes; i++)
+				printf("%u%c", second_len_hist[i], (i < block_size_to_optimize_in_bytes) ? ',' : '\n');
+		}
+		
+		return true;
+	}
+
+} // namespace ert
\ No newline at end of file
diff --git a/Source/ert.h b/Source/ert.h
new file mode 100644
index 000000000..4c5e84144
--- /dev/null
+++ b/Source/ert.h
@@ -0,0 +1,82 @@
+#pragma once
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+
+namespace ert
+{
+	struct color_rgba { uint8_t m_c[4]; }; // One pixel: four 8-bit components.
+
+	struct reduce_entropy_params
+	{
+		// m_lambda: The post-processor tries to reduce distortion*smooth_block_scale + rate*lambda (rate is approximate LZ bits and distortion is scaled MS error multiplied against the smooth block MSE weighting factor).
+		// Larger values push the postprocessor towards optimizing more for lower rate, and smaller values more for distortion. 0=minimal distortion.
+		float m_lambda;
+
+		// m_lookback_window_size: The number of bytes the encoder can look back from each block to find matches. The larger this value, the slower the encoder but the higher the quality per LZ compressed bit.
+		uint32_t m_lookback_window_size;
+
+		// m_max_allowed_rms_increase_ratio: How much the RMS error of a block is allowed to increase before a trial is rejected. 1.0=no increase allowed, 1.05=5% increase allowed, etc.
+		float m_max_allowed_rms_increase_ratio;
+
+		float m_max_smooth_block_std_dev; // Divides a block's max std dev; the ratio (clamped to [0,1], squared) drives the smooth-block MSE scale.
+		float m_smooth_block_max_mse_scale; // MSE scale used when that ratio is 0; presumably blended toward 1.0 as it rises (see lerp in reduce_entropy).
+
+		uint32_t m_color_weights[4]; // Per-component error weights; entries at index >= num_comps must be 0.
+
+		bool m_try_two_matches; // Also try a second match that does not overlap the first.
+		bool m_allow_relative_movement; // Let source and destination offsets within a block differ.
+		bool m_skip_zero_mse_blocks; // Leave blocks whose current MSE is exactly 0 untouched.
+		bool m_debug_output; // Print match statistics to stdout at the end of reduce_entropy.
+
+		reduce_entropy_params() { clear(); }
+
+		// Resets every field to its default value.
+		void clear()
+		{
+			m_lookback_window_size = 256;
+			m_lambda = 1.0f;
+			m_max_allowed_rms_increase_ratio = 10.0f;
+			m_max_smooth_block_std_dev = 18.0f;
+			m_smooth_block_max_mse_scale = 10.0f;
+			m_color_weights[0] = 1;
+			m_color_weights[1] = 1;
+			m_color_weights[2] = 1;
+			m_color_weights[3] = 1;
+			m_try_two_matches = false;
+			m_allow_relative_movement = false;
+			m_skip_zero_mse_blocks = false;
+			m_debug_output = false;
+		}
+		// Dumps the parameters to stdout; const so it can be called through the const reference reduce_entropy receives.
+		void print() const
+		{
+			printf("lambda: %f\n", m_lambda);
+			printf("Lookback window size: %u\n", m_lookback_window_size);
+			printf("Max allowed RMS increase ratio: %f\n", m_max_allowed_rms_increase_ratio);
+			printf("Max smooth block std dev: %f\n", m_max_smooth_block_std_dev);
+			printf("Smooth block max MSE scale: %f\n", m_smooth_block_max_mse_scale);
+			printf("Color weights: %u %u %u %u\n", m_color_weights[0], m_color_weights[1], m_color_weights[2], m_color_weights[3]);
+			printf("Try two matches: %u\n", m_try_two_matches);
+			printf("Allow relative movement: %u\n", m_allow_relative_movement);
+			printf("Skip zero MSE blocks: %u\n", m_skip_zero_mse_blocks);
+		}
+	};
+
+	// Callback that unpacks one encoded block into pPixels and sets active_x/active_y; returning false rejects the block.
+	typedef bool (*pUnpack_block_func)(const void* pBlock, color_rgba* pPixels, uint32_t block_index, uint32_t& active_x, uint32_t& active_y, void* pUser_data);
+
+	// BC7 entropy reduction transform with Deflate/LZMA/LZHAM optimizations
+	bool reduce_entropy(void* pBlocks, uint32_t num_blocks,
+		uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes, uint32_t block_width, uint32_t block_height, uint32_t num_comps,
+		const color_rgba* pBlock_pixels, const reduce_entropy_params& params, uint32_t& total_modified,
+		pUnpack_block_func pUnpack_block_func, void* pUnpack_block_func_user_data,
+		std::vector<float>* pBlock_mse_scales = nullptr);
+} // namespace ert

From 110810f18aabff40cefa442cc6ad2384a6ace0e1 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Mon, 25 Mar 2024 11:12:15 +0800
Subject: [PATCH 02/16] Fix debug asserts

---
 Source/astcenccli_toplevel.cpp | 6 ++++++
 Source/cmake_core.cmake        | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index e76cb8d18..3bf9787c6 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1186,6 +1186,12 @@ static int edit_astcenc_config(
 			}
 
 			config.rdo_level = static_cast<float>(atof(argv[argidx - 1]));
+
+			// RDO trials blocks will require full initializations
+			if (static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY))
+			{
+				config.flags &= ~ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
+			}
 		}
 		else if (!strcmp(argv[argidx], "-rdo-lookback"))
 		{
diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake
index c5c2ca9b3..0fd4fc35f 100644
--- a/Source/cmake_core.cmake
+++ b/Source/cmake_core.cmake
@@ -168,7 +168,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
             $<${is_gnu_fe}:-Wall>
             $<${is_gnu_fe}:-Wextra>
             $<${is_gnu_fe}:-Wpedantic>
-            $<${is_gnu_fe}:-Werror>
+            #$<${is_gnu_fe}:-Werror>
             $<${is_gnu_fe}:-Wshadow>
             $<${is_gnu_fe}:-Wdouble-promotion>
             $<${is_clang}:-Wdocumentation>

From a523f769c597e6a4fd5bf12cc6c987400173aa77 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Mon, 25 Mar 2024 16:24:51 +0800
Subject: [PATCH 03/16] proper swizzle handling

---
 .gitignore                         |   2 +
 Source/astcenc_entry.cpp           |   3 -
 Source/astcenc_rate_distortion.cpp | 117 +++++++++++++++++++++--------
 Source/astcenccli_toplevel.cpp     |   4 +-
 4 files changed, 88 insertions(+), 38 deletions(-)

diff --git a/.gitignore b/.gitignore
index ee2ba210a..3aee11ae4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@
 __pycache__
 Scratch
 Proto
+.idea
 
 # Precompiled reference binaries for comparison tests
 bin
@@ -19,6 +20,7 @@ Binaries
 # Build artifacts
 astcenc
 build*
+cmake-build*
 
 # General build artifacts
 Test/DocOut
diff --git a/Source/astcenc_entry.cpp b/Source/astcenc_entry.cpp
index faf8e94b5..4f62e92fb 100644
--- a/Source/astcenc_entry.cpp
+++ b/Source/astcenc_entry.cpp
@@ -412,9 +412,6 @@ static astcenc_error validate_config(
 	config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
 	config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
 
-	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
-	config.rdo_level = astc::clamp(config.rdo_level, 0.0f, 1.0f);
-
 	// Specifying a zero weight color component is not allowed; force to small value
 	float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
 	                             astc::max(config.cw_b_weight, config.cw_a_weight));
diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index 5facc9ea9..3d127fc24 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -19,7 +19,7 @@ struct astcenc_rdo_context
 	uint32_t m_image_dimz = 0;
 	uint32_t m_bytes_per_texel = 0;
 	uint32_t m_component_count = 0;
-	astcenc_swizzle m_swizzle;
+	astcenc_swizzle m_swizzle{ ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A };
 
 	std::vector<uint8_t> m_block_pixels;
 };
@@ -34,7 +34,7 @@ extern "C" void rdo_progress_emitter(
 ) {
 	static float previous_value = 100.0f;
 	const uint32_t bar_size = 25;
-	uint32_t parts = static_cast<uint32_t>(value / 4.0f);
+	auto parts = static_cast<uint32_t>(value / 4.0f);
 
 	char buffer[bar_size + 3];
 	buffer[0] = '[';
@@ -64,15 +64,15 @@ extern "C" void rdo_progress_emitter(
 	fflush(stdout);
 }
 
+template<typename T>
 static uint32_t init_rdo_context(
 	astcenc_contexti& ctx,
 	const astcenc_image& image,
-	const astcenc_swizzle& swizzle
+	const astcenc_swizzle& swz
 ) {
 	ctx.rdo_context = new astcenc_rdo_context;
 	astcenc_rdo_context& rdo_ctx = *ctx.rdo_context;
 
-	// Perform memory allocations for intermediary buffers
 	uint32_t block_dim_x = ctx.bsd->xdim;
 	uint32_t block_dim_y = ctx.bsd->ydim;
 	uint32_t block_dim_z = ctx.bsd->zdim;
@@ -81,12 +81,53 @@ static uint32_t init_rdo_context(
 	uint32_t zblocks = (image.dim_z + block_dim_z - 1u) / block_dim_z;
 	uint32_t total_blocks = xblocks * yblocks * zblocks;
 
+	// Generate quality parameters
+	ert::reduce_entropy_params& ert_params = rdo_ctx.m_ert_params;
+	ert_params.m_lambda = astc::clamp(ctx.config.rdo_level, 0.0f, 1.0f);
+	ert_params.m_color_weights[0] = static_cast<uint32_t>(ctx.config.cw_r_weight * 255.0f);
+	ert_params.m_color_weights[1] = static_cast<uint32_t>(ctx.config.cw_g_weight * 255.0f);
+	ert_params.m_color_weights[2] = static_cast<uint32_t>(ctx.config.cw_b_weight * 255.0f);
+	ert_params.m_color_weights[3] = static_cast<uint32_t>(ctx.config.cw_a_weight * 255.0f);
+	ert_params.m_lookback_window_size = (ctx.config.rdo_lookback ? astc::min(ctx.config.rdo_lookback, total_blocks) : total_blocks) * ASTCENC_BYTES_PER_BLOCK;
+
+	ert_params.m_try_two_matches = true;
+	// ert_params.m_allow_relative_movement = true;
+	// ert_params.m_debug_output = true;
+
+	// Deduce conversion swizzles, ERT requires sample components to be contiguous
+	uint32_t num_components = 0;
+    astcenc_swz encoding_swzes[4];
+    astcenc_swz decoding_swzes[4];
+    uint32_t* w = ert_params.m_color_weights;
+
+	if (w[0]) w[num_components] = w[0], decoding_swzes[num_components] = ASTCENC_SWZ_R, encoding_swzes[num_components++] = swz.r;
+	if (w[1]) w[num_components] = w[1], decoding_swzes[num_components] = ASTCENC_SWZ_G, encoding_swzes[num_components++] = swz.g;
+    if (w[2]) w[num_components] = w[2], decoding_swzes[num_components] = ASTCENC_SWZ_B, encoding_swzes[num_components++] = swz.b;
+	if (w[3]) w[num_components] = w[3], decoding_swzes[num_components] = ASTCENC_SWZ_A, encoding_swzes[num_components++] = swz.a;
+
+    for (uint32_t idx = num_components; idx < 4; ++idx)
+    {
+        w[idx] = 0;
+        decoding_swzes[idx] = encoding_swzes[idx] = ASTCENC_SWZ_1;
+    }
+
+	bool needs_swz = // As long as weights are the same it actually doesn't matter
+		ert_params.m_color_weights[0] != ert_params.m_color_weights[1] ||
+		ert_params.m_color_weights[1] != ert_params.m_color_weights[2] ||
+		ert_params.m_color_weights[2] != ert_params.m_color_weights[3];
+
+	T scratch[6];
+	scratch[ASTCENC_SWZ_0] = 0u;
+	scratch[ASTCENC_SWZ_1] = ASTCENC_RDO_PADDING_PIXEL;
+
+	// Perform memory allocations for intermediary buffers
 	uint32_t bytes_per_texel = 4u;
 	if (image.data_type == ASTCENC_TYPE_F16) bytes_per_texel *= 2;
 	else if (image.data_type == ASTCENC_TYPE_F32) bytes_per_texel *= 4;
-	uint32_t texels_per_block = block_dim_x * block_dim_y * block_dim_z;
 
 	rdo_ctx.m_block_pixels.resize(total_blocks * ctx.bsd->texel_count * bytes_per_texel);
+    auto* block_pixels = static_cast<T*>(rdo_ctx.m_block_pixels.data());
+
 	for (uint32_t block_z = 0; block_z < zblocks; ++block_z)
 	{
 		const uint32_t valid_z = astc::min(image.dim_z - block_z * block_dim_z, block_dim_z);
@@ -94,32 +135,47 @@ static uint32_t init_rdo_context(
 	
 		for (uint32_t offset_z = 0; offset_z < valid_z; offset_z++)
 		{
-			const auto* src_data = static_cast<const uint8_t*>(image.data[block_z * block_dim_z + offset_z]);
+			const auto* src_data = static_cast<const T*>(image.data[block_z * block_dim_z + offset_z]);
 	
 			for (uint32_t block_y = 0, block_idx = 0; block_y < yblocks; ++block_y)
 			{
-				const uint8_t* src_block_row = src_data + block_y * block_dim_y * image.dim_x * bytes_per_texel;
+				const T* src_block_row = src_data + block_y * block_dim_y * image.dim_x * 4;
 				const uint32_t valid_y = astc::min(image.dim_y - block_y * block_dim_y, block_dim_y);
 				const uint32_t padding_y = block_dim_y - valid_y;
 	
 				for (uint32_t block_x = 0; block_x < xblocks; ++block_x, ++block_idx)
 				{
-					const uint8_t* src_block = src_block_row + block_x * block_dim_x * bytes_per_texel;
+					const T* src_block = src_block_row + block_x * block_dim_x * 4;
 					const uint32_t valid_x = astc::min(image.dim_x - block_x * block_dim_x, block_dim_x);
 					const uint32_t padding_x = block_dim_x - valid_x;
 	
-					uint8_t* dst_block = rdo_ctx.m_block_pixels.data() + block_idx * texels_per_block * bytes_per_texel;
+					T* dst_block = block_pixels + (block_idx * block_dim_z + offset_z) * block_dim_x * block_dim_y * 4;
 					for (uint32_t offset_y = 0; offset_y < valid_y; ++offset_y)
 					{
-						memcpy(dst_block, src_block, valid_x * bytes_per_texel);
-						if (padding_x) memset(dst_block + valid_x * bytes_per_texel, ASTCENC_RDO_PADDING_PIXEL, padding_x * bytes_per_texel);
-						src_block += image.dim_x * bytes_per_texel;
-						dst_block += block_dim_x * bytes_per_texel;
+						if (needs_swz)
+						{
+							for (uint32_t offset_x = 0; offset_x < valid_x; ++offset_x)
+							{
+								memcpy(scratch, src_block + offset_x * 4, bytes_per_texel);
+
+								for (uint32_t idx = 0; idx < 4; ++idx)
+								{
+									dst_block[offset_x * 4 + idx] = scratch[encoding_swzes[idx]];
+								}
+							}
+						}
+						else
+						{
+							memcpy(dst_block, src_block, valid_x * bytes_per_texel);
+						}
+						if (padding_x) memset(dst_block + valid_x * 4, ASTCENC_RDO_PADDING_PIXEL, padding_x * bytes_per_texel);
+						src_block += image.dim_x * 4;
+						dst_block += block_dim_x * 4;
 					}
 					if (padding_y)
 					{
 						memset(dst_block, ASTCENC_RDO_PADDING_PIXEL, padding_y * block_dim_x * bytes_per_texel);
-						dst_block += padding_y * block_dim_x * bytes_per_texel;
+						dst_block += padding_y * block_dim_x * 4;
 					}
 					if (padding_z) memset(dst_block, ASTCENC_RDO_PADDING_PIXEL, padding_z * block_dim_y * block_dim_x * bytes_per_texel);
 				}
@@ -127,19 +183,6 @@ static uint32_t init_rdo_context(
 		}
 	}
 
-	// Generate quality parameters
-	ert::reduce_entropy_params& ert_params = rdo_ctx.m_ert_params;
-	ert_params.m_lambda = astc::clamp(ctx.config.rdo_level, 0.0f, 1.0f);
-	ert_params.m_color_weights[0] = static_cast<uint32_t>(ctx.config.cw_r_weight * 255.0f);
-	ert_params.m_color_weights[1] = static_cast<uint32_t>(ctx.config.cw_g_weight * 255.0f);
-	ert_params.m_color_weights[2] = static_cast<uint32_t>(ctx.config.cw_b_weight * 255.0f);
-	ert_params.m_color_weights[3] = static_cast<uint32_t>(ctx.config.cw_a_weight * 255.0f);
-	ert_params.m_lookback_window_size = astc::min(ctx.config.rdo_lookback, total_blocks) * ASTCENC_BYTES_PER_BLOCK;
-
-	ert_params.m_try_two_matches = true;
-	// ert_params.m_allow_relative_movement = true;
-	// ert_params.m_debug_output = true;
-
 	rdo_ctx.m_image_dimx = image.dim_x;
 	rdo_ctx.m_image_dimy = image.dim_y;
 	rdo_ctx.m_image_dimz = image.dim_z;
@@ -148,8 +191,15 @@ static uint32_t init_rdo_context(
 	rdo_ctx.m_zblocks = zblocks;
 	rdo_ctx.m_total_blocks = total_blocks;
 	rdo_ctx.m_bytes_per_texel = bytes_per_texel;
-	rdo_ctx.m_component_count = 4;
-	rdo_ctx.m_swizzle = swizzle;
+	rdo_ctx.m_component_count = num_components;
+
+    if (needs_swz)
+    {
+        rdo_ctx.m_swizzle.r = decoding_swzes[0];
+        rdo_ctx.m_swizzle.g = decoding_swzes[1];
+        rdo_ctx.m_swizzle.b = decoding_swzes[2];
+        rdo_ctx.m_swizzle.a = decoding_swzes[3];
+    }
 
 	return rdo_ctx.m_total_blocks;
 }
@@ -270,7 +320,7 @@ static bool store_image_block_u8(
 
 struct local_rdo_context
 {
-	const astcenc_contexti* ctx;
+	const astcenc_contexti* ctx = nullptr;
 	uint32_t base_block_idx = 0;
 };
 
@@ -329,7 +379,9 @@ void rate_distortion_optimize(
 	// Only the first thread actually runs the initializer
 	ctxo.manage_rdo.init([&ctxo, &image, &swizzle]
 		{
-			return init_rdo_context(ctxo.context, image, swizzle);
+			// if (image.data_type == ASTCENC_TYPE_F16) return init_rdo_context<uint16_t>(ctxo.context, image, swizzle);
+			// if (image.data_type == ASTCENC_TYPE_F32) return init_rdo_context<uint32_t>(ctxo.context, image, swizzle);
+			return init_rdo_context<uint8_t>(ctxo.context, image, swizzle);
 		},
 		ctxo.context.config.progress_callback ? rdo_progress_emitter : nullptr);
 
@@ -338,8 +390,7 @@ void rate_distortion_optimize(
 	uint32_t block_dim_z = ctxo.context.bsd->zdim;
 	uint32_t texels_per_block = block_dim_x * block_dim_y * block_dim_z;
 	astcenc_rdo_context& rdo_ctx = *ctxo.context.rdo_context;
-	uint32_t blocks_per_task = ctxo.context.config.rdo_lookback;
-	if (!blocks_per_task) blocks_per_task = rdo_ctx.m_total_blocks; // Effectively single-threaded
+	uint32_t blocks_per_task = rdo_ctx.m_ert_params.m_lookback_window_size / ASTCENC_BYTES_PER_BLOCK;
 
 	uint32_t total_modified = 0;
 	while (true)
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 3bf9787c6..128109ba6 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1187,8 +1187,8 @@ static int edit_astcenc_config(
 
 			config.rdo_level = static_cast<float>(atof(argv[argidx - 1]));
 
-			// RDO trials blocks will require full initializations
-			if (static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY))
+			// Unpacking RDO trials blocks requires full initialization
+			if (config.rdo_level > 0.0f && static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY))
 			{
 				config.flags &= ~ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
 			}

From 79cd6acb7fb0c5247956581878881cfd5500215b Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Tue, 26 Mar 2024 12:47:49 +0800
Subject: [PATCH 04/16] support multiple slices

---
 Source/astcenc_rate_distortion.cpp | 31 ++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index 3d127fc24..a63ea689d 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -126,29 +126,29 @@ static uint32_t init_rdo_context(
 	else if (image.data_type == ASTCENC_TYPE_F32) bytes_per_texel *= 4;
 
 	rdo_ctx.m_block_pixels.resize(total_blocks * ctx.bsd->texel_count * bytes_per_texel);
-    auto* block_pixels = static_cast<T*>(rdo_ctx.m_block_pixels.data());
 
 	for (uint32_t block_z = 0; block_z < zblocks; ++block_z)
 	{
 		const uint32_t valid_z = astc::min(image.dim_z - block_z * block_dim_z, block_dim_z);
 		const uint32_t padding_z = block_dim_z - valid_z;
-	
+        auto* block_pixels = static_cast<T*>(rdo_ctx.m_block_pixels.data()) + block_z * yblocks * xblocks * ctx.bsd->texel_count * 4;
+
 		for (uint32_t offset_z = 0; offset_z < valid_z; offset_z++)
 		{
 			const auto* src_data = static_cast<const T*>(image.data[block_z * block_dim_z + offset_z]);
-	
+
 			for (uint32_t block_y = 0, block_idx = 0; block_y < yblocks; ++block_y)
 			{
 				const T* src_block_row = src_data + block_y * block_dim_y * image.dim_x * 4;
 				const uint32_t valid_y = astc::min(image.dim_y - block_y * block_dim_y, block_dim_y);
 				const uint32_t padding_y = block_dim_y - valid_y;
-	
+
 				for (uint32_t block_x = 0; block_x < xblocks; ++block_x, ++block_idx)
 				{
 					const T* src_block = src_block_row + block_x * block_dim_x * 4;
 					const uint32_t valid_x = astc::min(image.dim_x - block_x * block_dim_x, block_dim_x);
 					const uint32_t padding_x = block_dim_x - valid_x;
-	
+
 					T* dst_block = block_pixels + (block_idx * block_dim_z + offset_z) * block_dim_x * block_dim_y * 4;
 					for (uint32_t offset_y = 0; offset_y < valid_y; ++offset_y)
 					{
@@ -338,12 +338,15 @@ static bool unpack_block(
 	uint32_t block_idx = local_ctx.base_block_idx + local_block_idx;
 
 	uint32_t block_dim_x = ctx.bsd->xdim;
-	uint32_t block_dim_y = ctx.bsd->ydim;
-	assert(block_idx < rdo_ctx.m_xblocks * rdo_ctx.m_yblocks);
+    uint32_t block_dim_y = ctx.bsd->ydim;
+    uint32_t block_dim_z = ctx.bsd->zdim;
+	assert(block_idx < rdo_ctx.m_xblocks * rdo_ctx.m_yblocks * rdo_ctx.m_zblocks);
 
-	uint32_t block_y = block_idx / rdo_ctx.m_xblocks;
-	uint32_t block_x = block_idx - block_y * rdo_ctx.m_xblocks;
-	assert(block_y * rdo_ctx.m_xblocks + block_x == block_idx);
+    uint32_t block_z = block_idx / (rdo_ctx.m_xblocks * rdo_ctx.m_yblocks);
+    uint32_t slice_idx = block_idx - block_z * rdo_ctx.m_xblocks * rdo_ctx.m_yblocks;
+    uint32_t block_y = slice_idx / rdo_ctx.m_xblocks;
+	uint32_t block_x = slice_idx - block_y * rdo_ctx.m_xblocks;
+	assert(block_y * rdo_ctx.m_xblocks + block_x == slice_idx);
 
 	uint8_t pcb[ASTCENC_BYTES_PER_BLOCK];
 	memcpy(pcb, block, sizeof(pcb));
@@ -357,12 +360,12 @@ static bool unpack_block(
 
 	image_block blk;
 	decompress_symbolic_block(ctx.config.profile, *ctx.bsd,
-		block_x * block_dim_x, block_y * block_dim_y, 0,
+		block_x * block_dim_x, block_y * block_dim_y, block_z * block_dim_z,
 		scb, blk);
 
 	return store_image_block_u8(reinterpret_cast<uint8_t*>(pixels), blk, *ctx.bsd,
-		block_x * block_dim_x, block_y * block_dim_y, 0,
-		rdo_ctx.m_image_dimx, rdo_ctx.m_image_dimy, 1, rdo_ctx.m_swizzle, active_x, active_y);
+		block_x * block_dim_x, block_y * block_dim_y, block_z * block_dim_z,
+		rdo_ctx.m_image_dimx, rdo_ctx.m_image_dimy, rdo_ctx.m_image_dimz, rdo_ctx.m_swizzle, active_x, active_y);
 }
 
 void rate_distortion_optimize(
@@ -371,7 +374,7 @@ void rate_distortion_optimize(
 	const astcenc_swizzle& swizzle,
 	uint8_t* buffer
 ) {
-	if (ctxo.context.config.rdo_level == 0.0f || image.data_type != ASTCENC_TYPE_U8 || image.dim_z != 1)
+	if (ctxo.context.config.rdo_level == 0.0f || image.data_type != ASTCENC_TYPE_U8 || ctxo.context.bsd->zdim > 1)
 	{
 		return;
 	}

From 00e4d02311356eac9f77884871e30cab1e0606a0 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Tue, 26 Mar 2024 15:05:12 +0800
Subject: [PATCH 05/16] license

---
 Source/astcenc.h                    |  2 -
 Source/astcenc_entry.cpp            |  5 +--
 Source/astcenc_internal.h           |  3 --
 Source/astcenc_internal_entry.h     |  5 +--
 Source/astcenc_rate_distortion.cpp  | 61 ++++++++++++++++++-----------
 Source/astcenccli_toplevel.cpp      |  4 --
 Source/astcenccli_toplevel_help.cpp |  2 -
 Source/cmake_core.cmake             |  2 -
 Source/ert.cpp                      |  2 +-
 9 files changed, 42 insertions(+), 44 deletions(-)

diff --git a/Source/astcenc.h b/Source/astcenc.h
index a54e27fe4..b718c6786 100644
--- a/Source/astcenc.h
+++ b/Source/astcenc.h
@@ -571,7 +571,6 @@ struct astcenc_config
 	 */
 	float tune_search_mode0_enable;
 
-	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
 	/**
 	 * @brief The compression level for rate-distortion optimization.
 	 */
@@ -580,7 +579,6 @@ struct astcenc_config
 	 * @brief Number of blocks for each entropy reduction batch.
 	 */
 	unsigned int rdo_lookback;
-	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
 
 	/**
 	 * @brief The progress callback, can be @c nullptr.
diff --git a/Source/astcenc_entry.cpp b/Source/astcenc_entry.cpp
index 4f62e92fb..ba92bea88 100644
--- a/Source/astcenc_entry.cpp
+++ b/Source/astcenc_entry.cpp
@@ -650,7 +650,7 @@ astcenc_error astcenc_config_init(
 	}
 	config.flags = flags;
 
-	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
+	// This is a reasonable default that leverages performance & compression rate
 	config.rdo_lookback = 64;
 
 	return ASTCENC_SUCCESS;
@@ -1102,7 +1102,6 @@ astcenc_error astcenc_compress_image(
 	// Only the first thread to arrive actually runs the term
 	ctxo->manage_compress.term(term_compress);
 
-	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
 	rate_distortion_optimize(*ctxo, image, *swizzle, data_out);
 
 	return ASTCENC_SUCCESS;
@@ -1125,7 +1124,7 @@ astcenc_error astcenc_compress_reset(
 
 	ctxo->manage_avg.reset();
 	ctxo->manage_compress.reset();
-	ctxo->manage_rdo.reset(); //SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
+	ctxo->manage_rdo.reset();
 	return ASTCENC_SUCCESS;
 #endif
 }
diff --git a/Source/astcenc_internal.h b/Source/astcenc_internal.h
index 28a88b564..172de8404 100644
--- a/Source/astcenc_internal.h
+++ b/Source/astcenc_internal.h
@@ -1232,7 +1232,6 @@ struct astcenc_contexti
 	/** @brief The pixel region and variance worker arguments. */
 	avg_args avg_preprocess_args;
 
-	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
 	struct astcenc_rdo_context* rdo_context;
 #endif
 
@@ -2174,7 +2173,6 @@ void physical_to_symbolic(
 	const uint8_t pcb[16],
 	symbolic_compressed_block& scb);
 
-//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
 /**
  * @brief Rate-distortion optimization main entry.
  *
@@ -2185,7 +2183,6 @@ void rate_distortion_optimize(
 	const astcenc_image& image,
 	const astcenc_swizzle& swizzle,
 	uint8_t* buffer);
-//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
 
 /* ============================================================================
 Platform-specific functions.
diff --git a/Source/astcenc_internal_entry.h b/Source/astcenc_internal_entry.h
index 6092a52a2..d60278e37 100644
--- a/Source/astcenc_internal_entry.h
+++ b/Source/astcenc_internal_entry.h
@@ -164,7 +164,6 @@ class ParallelManager
 	 * @param init_func   Callable which executes the stage initialization. It must return the
 	 *                    total number of tasks in the stage.
 	 */
-	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
 	void init(std::function<unsigned int(void)> init_func, astcenc_progress_callback callback = nullptr)
 	{
 		std::lock_guard<std::mutex> lck(m_lock);
@@ -172,7 +171,7 @@ class ParallelManager
 		{
 			m_task_count = init_func();
 			m_init_done = true;
-			if (callback) m_callback = callback; //SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]
+			if (callback) m_callback = callback;
 		}
 	}
 
@@ -325,10 +324,8 @@ struct astcenc_context
 	/** @brief The parallel manager for compression. */
 	ParallelManager manage_compress;
 
-	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
 	/** @brief The parallel manager for rate-distortion optimization. */
 	ParallelManager manage_rdo;
-	//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
 #endif
 
 	/** @brief The parallel manager for decompression. */
diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index a63ea689d..46169f1bd 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -1,4 +1,19 @@
-//Copyright Tencent Timi-J1&F1 Studio, Inc. All Rights Reserved
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
 
 #if !defined(ASTCENC_DECOMPRESS_ONLY)
 
@@ -96,20 +111,20 @@ static uint32_t init_rdo_context(
 
 	// Deduce conversion swizzles, ERT requires sample components to be contiguous
 	uint32_t num_components = 0;
-    astcenc_swz encoding_swzes[4];
-    astcenc_swz decoding_swzes[4];
-    uint32_t* w = ert_params.m_color_weights;
+	astcenc_swz encoding_swzes[4];
+	astcenc_swz decoding_swzes[4];
+	uint32_t* w = ert_params.m_color_weights;
 
 	if (w[0]) w[num_components] = w[0], decoding_swzes[num_components] = ASTCENC_SWZ_R, encoding_swzes[num_components++] = swz.r;
 	if (w[1]) w[num_components] = w[1], decoding_swzes[num_components] = ASTCENC_SWZ_G, encoding_swzes[num_components++] = swz.g;
-    if (w[2]) w[num_components] = w[2], decoding_swzes[num_components] = ASTCENC_SWZ_B, encoding_swzes[num_components++] = swz.b;
+	if (w[2]) w[num_components] = w[2], decoding_swzes[num_components] = ASTCENC_SWZ_B, encoding_swzes[num_components++] = swz.b;
 	if (w[3]) w[num_components] = w[3], decoding_swzes[num_components] = ASTCENC_SWZ_A, encoding_swzes[num_components++] = swz.a;
 
-    for (uint32_t idx = num_components; idx < 4; ++idx)
-    {
-        w[idx] = 0;
-        decoding_swzes[idx] = encoding_swzes[idx] = ASTCENC_SWZ_1;
-    }
+	for (uint32_t idx = num_components; idx < 4; ++idx)
+	{
+		w[idx] = 0;
+		decoding_swzes[idx] = encoding_swzes[idx] = ASTCENC_SWZ_1;
+	}
 
 	bool needs_swz = // As long as weights are the same it actually doesn't matter
 		ert_params.m_color_weights[0] != ert_params.m_color_weights[1] ||
@@ -131,7 +146,7 @@ static uint32_t init_rdo_context(
 	{
 		const uint32_t valid_z = astc::min(image.dim_z - block_z * block_dim_z, block_dim_z);
 		const uint32_t padding_z = block_dim_z - valid_z;
-        auto* block_pixels = static_cast<T*>(rdo_ctx.m_block_pixels.data()) + block_z * yblocks * xblocks * ctx.bsd->texel_count * 4;
+		auto* block_pixels = static_cast<T*>(rdo_ctx.m_block_pixels.data()) + block_z * yblocks * xblocks * ctx.bsd->texel_count * 4;
 
 		for (uint32_t offset_z = 0; offset_z < valid_z; offset_z++)
 		{
@@ -193,13 +208,13 @@ static uint32_t init_rdo_context(
 	rdo_ctx.m_bytes_per_texel = bytes_per_texel;
 	rdo_ctx.m_component_count = num_components;
 
-    if (needs_swz)
-    {
-        rdo_ctx.m_swizzle.r = decoding_swzes[0];
-        rdo_ctx.m_swizzle.g = decoding_swzes[1];
-        rdo_ctx.m_swizzle.b = decoding_swzes[2];
-        rdo_ctx.m_swizzle.a = decoding_swzes[3];
-    }
+	if (needs_swz)
+	{
+		rdo_ctx.m_swizzle.r = decoding_swzes[0];
+		rdo_ctx.m_swizzle.g = decoding_swzes[1];
+		rdo_ctx.m_swizzle.b = decoding_swzes[2];
+		rdo_ctx.m_swizzle.a = decoding_swzes[3];
+	}
 
 	return rdo_ctx.m_total_blocks;
 }
@@ -338,13 +353,13 @@ static bool unpack_block(
 	uint32_t block_idx = local_ctx.base_block_idx + local_block_idx;
 
 	uint32_t block_dim_x = ctx.bsd->xdim;
-    uint32_t block_dim_y = ctx.bsd->ydim;
-    uint32_t block_dim_z = ctx.bsd->zdim;
+	uint32_t block_dim_y = ctx.bsd->ydim;
+	uint32_t block_dim_z = ctx.bsd->zdim;
 	assert(block_idx < rdo_ctx.m_xblocks * rdo_ctx.m_yblocks * rdo_ctx.m_zblocks);
 
-    uint32_t block_z = block_idx / (rdo_ctx.m_xblocks * rdo_ctx.m_yblocks);
-    uint32_t slice_idx = block_idx - block_z * rdo_ctx.m_xblocks * rdo_ctx.m_yblocks;
-    uint32_t block_y = slice_idx / rdo_ctx.m_xblocks;
+	uint32_t block_z = block_idx / (rdo_ctx.m_xblocks * rdo_ctx.m_yblocks);
+	uint32_t slice_idx = block_idx - block_z * rdo_ctx.m_xblocks * rdo_ctx.m_yblocks;
+	uint32_t block_y = slice_idx / rdo_ctx.m_xblocks;
 	uint32_t block_x = slice_idx - block_y * rdo_ctx.m_xblocks;
 	assert(block_y * rdo_ctx.m_xblocks + block_x == slice_idx);
 
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 128109ba6..ed27c0492 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1175,7 +1175,6 @@ static int edit_astcenc_config(
 			argidx += 1;
 			cli_config.diagnostic_images = true;
 		}
-		//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
 		else if (!strcmp(argv[argidx], "-rdo-level"))
 		{
 			argidx += 2;
@@ -1204,7 +1203,6 @@ static int edit_astcenc_config(
 
 			config.rdo_lookback = atoi(argv[argidx - 1]);
 		}
-		//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
 		else // check others as well
 		{
 			print_error("ERROR: Argument '%s' not recognized\n", argv[argidx]);
@@ -1295,10 +1293,8 @@ static void print_astcenc_config(
 		printf("    Candidate cutoff:           %u candidates\n", config.tune_candidate_limit);
 		printf("    Refinement cutoff:          %u iterations\n", config.tune_refinement_limit);
 		printf("    Compressor thread count:    %d\n", cli_config.thread_count);
-		//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
 		printf("    Rate-distortion level:      %g\n", static_cast<double>(config.rdo_level));
 		printf("    Rate-distortion lookback:   %u blocks\n", config.rdo_lookback);
-		//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
 		printf("\n");
 	}
 }
diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp
index c660cf2a9..e2b52dd01 100644
--- a/Source/astcenccli_toplevel_help.cpp
+++ b/Source/astcenccli_toplevel_help.cpp
@@ -386,7 +386,6 @@ ADVANCED COMPRESSION
                -verythorough : 0.98
                -exhaustive   : 0.99
 )"
-//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
 R"(
        -rdo-level <factor>
            Rate-distortion optimization level.
@@ -398,7 +397,6 @@ R"(
            The larger this value, the slower the encoder but the higher the quality per LZ compressed bit.
            Default to 64 blocks.
 )"
-//SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
 // This split in the literals is needed for Visual Studio; the compiler
 // will concatenate these two strings together ...
 R"(
diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake
index 0fd4fc35f..4063931ec 100644
--- a/Source/cmake_core.cmake
+++ b/Source/cmake_core.cmake
@@ -42,10 +42,8 @@ set(is_clang "$<AND:${is_gnu_fe},$<CXX_COMPILER_ID:Clang,AppleClang>>")
 
 add_library(${ASTCENC_TARGET}-static
     STATIC
-        #SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[BEGIN]
         ert.cpp
         astcenc_rate_distortion.cpp
-        #SpeedEngine: AstcEnc:RateDistortion:[yunhsiaowu]:[END]
         astcenc_averages_and_directions.cpp
         astcenc_block_sizes.cpp
         astcenc_color_quantize.cpp
diff --git a/Source/ert.cpp b/Source/ert.cpp
index 21e0c5f13..2cf6c85ad 100644
--- a/Source/ert.cpp
+++ b/Source/ert.cpp
@@ -702,4 +702,4 @@ namespace ert
 		return true;
 	}
 
-} // namespace ert
\ No newline at end of file
+} // namespace ert

From 923dcfbd7d53a47b8e716a90d7f70502e441d5f2 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Wed, 27 Mar 2024 23:17:58 +0800
Subject: [PATCH 06/16] ert refactor

---
 Source/astcenc.h                    |  29 +-
 Source/astcenc_entry.cpp            |  12 +-
 Source/astcenc_internal.h           |  11 +-
 Source/astcenc_rate_distortion.cpp  | 525 ++++++++++++++--------------
 Source/astcenccli_toplevel.cpp      |  58 ++-
 Source/astcenccli_toplevel_help.cpp |  37 +-
 Source/cmake_core.cmake             |   2 +-
 Source/ert.cpp                      | 271 +++-----------
 Source/ert.h                        |  72 ++--
 9 files changed, 481 insertions(+), 536 deletions(-)

diff --git a/Source/astcenc.h b/Source/astcenc.h
index b718c6786..dc99bc64a 100644
--- a/Source/astcenc.h
+++ b/Source/astcenc.h
@@ -572,13 +572,34 @@ struct astcenc_config
 	float tune_search_mode0_enable;
 
 	/**
-	 * @brief The compression level for rate-distortion optimization.
+	 * @brief Enable Rate Distortion Optimization (RDO) post-processing.
 	 */
-	float rdo_level;
+	bool rdo_enabled;
+
+	/**
+	 * @brief Disable RDO multithreading (slightly higher compression, deterministic).
+	 */
+	bool rdo_no_multithreading;
+
+	/**
+	 * @brief RDO quality scalar (lambda).
+	 */
+	float rdo_quality;
+
+	/**
+	 * @brief RDO dictionary size in bytes.
+	 */
+	unsigned int rdo_dict_size;
+
+	/**
+	 * @brief RDO max smooth block error scale.
+	 */
+	float rdo_max_smooth_block_error_scale;
+
 	/**
-	 * @brief Number of blocks for each entropy reduction batch.
+	 * @brief RDO max smooth block standard deviation.
 	 */
-	unsigned int rdo_lookback;
+	float rdo_max_smooth_block_std_dev;
 
 	/**
 	 * @brief The progress callback, can be @c nullptr.
diff --git a/Source/astcenc_entry.cpp b/Source/astcenc_entry.cpp
index ba92bea88..5901a5467 100644
--- a/Source/astcenc_entry.cpp
+++ b/Source/astcenc_entry.cpp
@@ -412,6 +412,11 @@ static astcenc_error validate_config(
 	config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
 	config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
 
+	config.rdo_quality = astc::clamp(config.rdo_quality, 0.001f, 50.0f);
+	config.rdo_dict_size = astc::clamp(config.rdo_dict_size, 64u, 65536u);
+	config.rdo_max_smooth_block_error_scale = astc::clamp(config.rdo_max_smooth_block_error_scale, 1.0f, 300.0f);
+	config.rdo_max_smooth_block_std_dev = astc::clamp(config.rdo_max_smooth_block_std_dev, 0.01f, 65536.0f);
+
 	// Specifying a zero weight color component is not allowed; force to small value
 	float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
 	                             astc::max(config.cw_b_weight, config.cw_a_weight));
@@ -650,8 +655,11 @@ astcenc_error astcenc_config_init(
 	}
 	config.flags = flags;
 
-	// This is a reasonable default that leverages performance & compression rate
-	config.rdo_lookback = 64;
+	// These are the reasonable defaults that leverages performance & compression rate
+	config.rdo_quality = 1.0f;
+	config.rdo_dict_size = 4096;
+	config.rdo_max_smooth_block_error_scale = 10.0f;
+	config.rdo_max_smooth_block_std_dev = 18.0f;
 
 	return ASTCENC_SUCCESS;
 }
diff --git a/Source/astcenc_internal.h b/Source/astcenc_internal.h
index 172de8404..741876258 100644
--- a/Source/astcenc_internal.h
+++ b/Source/astcenc_internal.h
@@ -1968,7 +1968,7 @@ unsigned int compute_ideal_endpoint_formats(
  * @param         pi                   The partition info for the current trial.
  * @param         di                   The weight grid decimation table.
  * @param         dec_weights_uquant   The quantized weight set.
- * @param[in,out] ep                   The color endpoints (modifed in place).
+ * @param[in,out] ep                   The color endpoints (modified in place).
  * @param[out]    rgbs_vectors         The RGB+scale vectors for LDR blocks.
  * @param[out]    rgbo_vectors         The RGB+offset vectors for HDR blocks.
  */
@@ -1992,7 +1992,7 @@ void recompute_ideal_colors_1plane(
  * @param         di                          The weight grid decimation table.
  * @param         dec_weights_uquant_plane1   The quantized weight set for plane 1.
  * @param         dec_weights_uquant_plane2   The quantized weight set for plane 2.
- * @param[in,out] ep                          The color endpoints (modifed in place).
+ * @param[in,out] ep                          The color endpoints (modified in place).
  * @param[out]    rgbs_vector                 The RGB+scale color for LDR blocks.
  * @param[out]    rgbo_vector                 The RGB+offset color for HDR blocks.
  * @param         plane2_component            The component assigned to plane 2.
@@ -2165,7 +2165,7 @@ void symbolic_to_physical(
  * flagged as an error block if the encoding is invalid.
  *
  * @param      bsd   The block size information.
- * @param      pcb   The physical compresesd block input.
+ * @param      pcb   The physical compressed block input.
  * @param[out] scb   The output symbolic representation.
  */
 void physical_to_symbolic(
@@ -2176,7 +2176,10 @@ void physical_to_symbolic(
 /**
  * @brief Rate-distortion optimization main entry.
  *
- * @param[inout] buffer   The output compressed buffer.
+ * @param         ctxo     The compressor context and configuration.
+ * @param         image    The input image data.
+ * @param         swizzle  The swizzle applied on store.
+ * @param[in,out] buffer   The compressed buffer to be optimized (modified in place)
  */
 void rate_distortion_optimize(
 	astcenc_context& ctxo,
diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index 46169f1bd..4ed869a0c 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -20,29 +20,25 @@
 #include "astcenc_internal_entry.h"
 #include "ert.h"
 
+#include <vector>
+
 struct astcenc_rdo_context
 {
 	ert::reduce_entropy_params m_ert_params;
 	std::atomic<uint32_t> m_total_modified;
+	std::vector<image_block> m_blocks;
 
+	uint32_t m_total_blocks = 0;
 	uint32_t m_xblocks = 0;
 	uint32_t m_yblocks = 0;
 	uint32_t m_zblocks = 0;
-	uint32_t m_total_blocks = 0;
-	uint32_t m_image_dimx = 0;
-	uint32_t m_image_dimy = 0;
-	uint32_t m_image_dimz = 0;
-	uint32_t m_bytes_per_texel = 0;
-	uint32_t m_component_count = 0;
-	astcenc_swizzle m_swizzle{ ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A };
-
-	std::vector<uint8_t> m_block_pixels;
 };
 
-static constexpr uint8_t ASTCENC_RDO_PADDING_PIXEL = 0xff;
+#define ASTCENC_RDO_SPECIALIZE_DIFF 1
+
 static constexpr uint32_t ASTCENC_BYTES_PER_BLOCK = 16;
 
-template<typename T, typename U> T lerp(T a, T b, U s) { return (T)(a + (b - a) * s); }
+template<typename T> T sqr(T v) { return v * v; }
 
 extern "C" void rdo_progress_emitter(
 	float value
@@ -79,11 +75,10 @@ extern "C" void rdo_progress_emitter(
 	fflush(stdout);
 }
 
-template<typename T>
 static uint32_t init_rdo_context(
-	astcenc_contexti& ctx,
-	const astcenc_image& image,
-	const astcenc_swizzle& swz
+		astcenc_contexti& ctx,
+		const astcenc_image& image,
+		const astcenc_swizzle& swz
 ) {
 	ctx.rdo_context = new astcenc_rdo_context;
 	astcenc_rdo_context& rdo_ctx = *ctx.rdo_context;
@@ -97,290 +92,307 @@ static uint32_t init_rdo_context(
 	uint32_t total_blocks = xblocks * yblocks * zblocks;
 
 	// Generate quality parameters
-	ert::reduce_entropy_params& ert_params = rdo_ctx.m_ert_params;
-	ert_params.m_lambda = astc::clamp(ctx.config.rdo_level, 0.0f, 1.0f);
-	ert_params.m_color_weights[0] = static_cast<uint32_t>(ctx.config.cw_r_weight * 255.0f);
-	ert_params.m_color_weights[1] = static_cast<uint32_t>(ctx.config.cw_g_weight * 255.0f);
-	ert_params.m_color_weights[2] = static_cast<uint32_t>(ctx.config.cw_b_weight * 255.0f);
-	ert_params.m_color_weights[3] = static_cast<uint32_t>(ctx.config.cw_a_weight * 255.0f);
-	ert_params.m_lookback_window_size = (ctx.config.rdo_lookback ? astc::min(ctx.config.rdo_lookback, total_blocks) : total_blocks) * ASTCENC_BYTES_PER_BLOCK;
+	auto& ert_params = rdo_ctx.m_ert_params;
+	ert_params.m_lambda = ctx.config.rdo_quality;
+	ert_params.m_lookback_window_size = ctx.config.rdo_dict_size;
+	ert_params.m_smooth_block_max_mse_scale = ctx.config.rdo_max_smooth_block_error_scale;
+	ert_params.m_max_smooth_block_std_dev = ctx.config.rdo_max_smooth_block_std_dev;
 
 	ert_params.m_try_two_matches = true;
-	// ert_params.m_allow_relative_movement = true;
-	// ert_params.m_debug_output = true;
-
-	// Deduce conversion swizzles, ERT requires sample components to be contiguous
-	uint32_t num_components = 0;
-	astcenc_swz encoding_swzes[4];
-	astcenc_swz decoding_swzes[4];
-	uint32_t* w = ert_params.m_color_weights;
-
-	if (w[0]) w[num_components] = w[0], decoding_swzes[num_components] = ASTCENC_SWZ_R, encoding_swzes[num_components++] = swz.r;
-	if (w[1]) w[num_components] = w[1], decoding_swzes[num_components] = ASTCENC_SWZ_G, encoding_swzes[num_components++] = swz.g;
-	if (w[2]) w[num_components] = w[2], decoding_swzes[num_components] = ASTCENC_SWZ_B, encoding_swzes[num_components++] = swz.b;
-	if (w[3]) w[num_components] = w[3], decoding_swzes[num_components] = ASTCENC_SWZ_A, encoding_swzes[num_components++] = swz.a;
-
-	for (uint32_t idx = num_components; idx < 4; ++idx)
-	{
-		w[idx] = 0;
-		decoding_swzes[idx] = encoding_swzes[idx] = ASTCENC_SWZ_1;
-	}
 
-	bool needs_swz = // As long as weights are the same it actually doesn't matter
-		ert_params.m_color_weights[0] != ert_params.m_color_weights[1] ||
-		ert_params.m_color_weights[1] != ert_params.m_color_weights[2] ||
-		ert_params.m_color_weights[2] != ert_params.m_color_weights[3];
-
-	T scratch[6];
-	scratch[ASTCENC_SWZ_0] = 0u;
-	scratch[ASTCENC_SWZ_1] = ASTCENC_RDO_PADDING_PIXEL;
-
-	// Perform memory allocations for intermediary buffers
-	uint32_t bytes_per_texel = 4u;
-	if (image.data_type == ASTCENC_TYPE_F16) bytes_per_texel *= 2;
-	else if (image.data_type == ASTCENC_TYPE_F32) bytes_per_texel *= 4;
+	rdo_ctx.m_blocks.resize(total_blocks);
+	rdo_ctx.m_total_blocks = total_blocks;
+	rdo_ctx.m_xblocks = xblocks;
+	rdo_ctx.m_yblocks = yblocks;
+	rdo_ctx.m_zblocks = zblocks;
 
-	rdo_ctx.m_block_pixels.resize(total_blocks * ctx.bsd->texel_count * bytes_per_texel);
+	vfloat4 channel_weight = vfloat4(ctx.config.cw_r_weight,
+									 ctx.config.cw_g_weight,
+									 ctx.config.cw_b_weight,
+									 ctx.config.cw_a_weight);
+	channel_weight = channel_weight / hadd_s(channel_weight);
 
-	for (uint32_t block_z = 0; block_z < zblocks; ++block_z)
+	for (uint32_t block_z = 0, block_idx = 0; block_z < zblocks; ++block_z)
 	{
-		const uint32_t valid_z = astc::min(image.dim_z - block_z * block_dim_z, block_dim_z);
-		const uint32_t padding_z = block_dim_z - valid_z;
-		auto* block_pixels = static_cast<T*>(rdo_ctx.m_block_pixels.data()) + block_z * yblocks * xblocks * ctx.bsd->texel_count * 4;
-
-		for (uint32_t offset_z = 0; offset_z < valid_z; offset_z++)
+		for (uint32_t block_y = 0; block_y < yblocks; ++block_y)
 		{
-			const auto* src_data = static_cast<const T*>(image.data[block_z * block_dim_z + offset_z]);
-
-			for (uint32_t block_y = 0, block_idx = 0; block_y < yblocks; ++block_y)
+			for (uint32_t block_x = 0; block_x < xblocks; ++block_x, ++block_idx)
 			{
-				const T* src_block_row = src_data + block_y * block_dim_y * image.dim_x * 4;
-				const uint32_t valid_y = astc::min(image.dim_y - block_y * block_dim_y, block_dim_y);
-				const uint32_t padding_y = block_dim_y - valid_y;
+				image_block& blk = rdo_ctx.m_blocks[block_idx];
+				blk.decode_unorm8 = ctx.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
+				blk.texel_count = ctx.bsd->texel_count;
 
-				for (uint32_t block_x = 0; block_x < xblocks; ++block_x, ++block_idx)
+				if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
+				{
+					float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
+					blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
+												 ctx.config.cw_g_weight * alpha_scale,
+												 ctx.config.cw_b_weight * alpha_scale,
+												 ctx.config.cw_a_weight);
+					blk.channel_weight = blk.channel_weight / hadd_s(blk.channel_weight);
+				}
+				else
 				{
-					const T* src_block = src_block_row + block_x * block_dim_x * 4;
-					const uint32_t valid_x = astc::min(image.dim_x - block_x * block_dim_x, block_dim_x);
-					const uint32_t padding_x = block_dim_x - valid_x;
-
-					T* dst_block = block_pixels + (block_idx * block_dim_z + offset_z) * block_dim_x * block_dim_y * 4;
-					for (uint32_t offset_y = 0; offset_y < valid_y; ++offset_y)
-					{
-						if (needs_swz)
-						{
-							for (uint32_t offset_x = 0; offset_x < valid_x; ++offset_x)
-							{
-								memcpy(scratch, src_block + offset_x * 4, bytes_per_texel);
-
-								for (uint32_t idx = 0; idx < 4; ++idx)
-								{
-									dst_block[offset_x * 4 + idx] = scratch[encoding_swzes[idx]];
-								}
-							}
-						}
-						else
-						{
-							memcpy(dst_block, src_block, valid_x * bytes_per_texel);
-						}
-						if (padding_x) memset(dst_block + valid_x * 4, ASTCENC_RDO_PADDING_PIXEL, padding_x * bytes_per_texel);
-						src_block += image.dim_x * 4;
-						dst_block += block_dim_x * 4;
-					}
-					if (padding_y)
-					{
-						memset(dst_block, ASTCENC_RDO_PADDING_PIXEL, padding_y * block_dim_x * bytes_per_texel);
-						dst_block += padding_y * block_dim_x * 4;
-					}
-					if (padding_z) memset(dst_block, ASTCENC_RDO_PADDING_PIXEL, padding_z * block_dim_y * block_dim_x * bytes_per_texel);
+					blk.channel_weight = channel_weight;
 				}
+
+				load_image_block(ctx.config.profile, image, blk, *ctx.bsd,
+								 block_dim_x * block_x, block_dim_y * block_y, block_dim_z * block_z, swz);
 			}
 		}
 	}
 
-	rdo_ctx.m_image_dimx = image.dim_x;
-	rdo_ctx.m_image_dimy = image.dim_y;
-	rdo_ctx.m_image_dimz = image.dim_z;
-	rdo_ctx.m_xblocks = xblocks;
-	rdo_ctx.m_yblocks = yblocks;
-	rdo_ctx.m_zblocks = zblocks;
-	rdo_ctx.m_total_blocks = total_blocks;
-	rdo_ctx.m_bytes_per_texel = bytes_per_texel;
-	rdo_ctx.m_component_count = num_components;
+	return total_blocks;
+}
+
+static float compute_block_std_dev(
+	const image_block& blk,
+	float scale
+) {
+	if (all(blk.data_min == blk.data_max)) return 0.0f;
+
+	vfloatacc summav = vfloatacc::zero();
+	vint lane_id = vint::lane_id();
+	uint32_t texel_count = blk.texel_count;
+	vfloat color_mean_r(blk.data_mean.lane<0>() * scale);
+	vfloat color_mean_g(blk.data_mean.lane<1>() * scale);
+	vfloat color_mean_b(blk.data_mean.lane<2>() * scale);
+	vfloat color_mean_a(blk.data_mean.lane<3>() * scale);
 
-	if (needs_swz)
+	for (uint32_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
 	{
-		rdo_ctx.m_swizzle.r = decoding_swzes[0];
-		rdo_ctx.m_swizzle.g = decoding_swzes[1];
-		rdo_ctx.m_swizzle.b = decoding_swzes[2];
-		rdo_ctx.m_swizzle.a = decoding_swzes[3];
+		vfloat color_orig_r = loada(blk.data_r + i) * scale;
+		vfloat color_orig_g = loada(blk.data_g + i) * scale;
+		vfloat color_orig_b = loada(blk.data_b + i) * scale;
+		vfloat color_orig_a = loada(blk.data_a + i) * scale;
+
+		vfloat color_error_r = min(abs(color_orig_r - color_mean_r), vfloat(1e15f));
+		vfloat color_error_g = min(abs(color_orig_g - color_mean_g), vfloat(1e15f));
+		vfloat color_error_b = min(abs(color_orig_b - color_mean_b), vfloat(1e15f));
+		vfloat color_error_a = min(abs(color_orig_a - color_mean_a), vfloat(1e15f));
+
+		// Compute squared error metric
+		color_error_r = color_error_r * color_error_r;
+		color_error_g = color_error_g * color_error_g;
+		color_error_b = color_error_b * color_error_b;
+		color_error_a = color_error_a * color_error_a;
+
+		vfloat metric = astc::max(color_error_r * blk.channel_weight.lane<0>(),
+								  color_error_g * blk.channel_weight.lane<1>(),
+								  color_error_b * blk.channel_weight.lane<2>(),
+								  color_error_a * blk.channel_weight.lane<3>());
+
+		// Mask off bad lanes
+		vmask mask = lane_id < vint(texel_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+		haccumulate(summav, metric, mask);
 	}
 
-	return rdo_ctx.m_total_blocks;
+	return sqrtf(hadd_s(summav) / texel_count);
 }
 
-static bool store_image_block_u8(
-	uint8_t* output,
-	const image_block& blk,
-	const block_size_descriptor& bsd,
-	uint32_t x_start,
-	uint32_t y_start,
-	uint32_t z_start,
-	uint32_t x_size,
-	uint32_t y_size,
-	uint32_t z_size,
-	const astcenc_swizzle& swz,
-	uint32_t& x_count,
-	uint32_t& y_count
+#if ASTCENC_RDO_SPECIALIZE_DIFF
+static float compute_symbolic_block_difference_constant(
+		const astcenc_config& config,
+		const block_size_descriptor& bsd,
+		symbolic_compressed_block scb,
+		const image_block& blk
 ) {
-	uint32_t x_end = astc::min(x_size, x_start + bsd.xdim);
-	x_count = x_end - x_start;
-	uint32_t x_nudge = bsd.xdim - x_count;
-
-	uint32_t y_end = astc::min(y_size, y_start + bsd.ydim);
-	y_count = y_end - y_start;
-	uint32_t y_nudge = (bsd.ydim - y_count) * bsd.xdim;
+	vfloat4 color(0.0f);
 
-	uint32_t z_end = astc::min(z_size, z_start + bsd.zdim);
+	// UNORM16 constant color block
+	if (scb.block_type == SYM_BTYPE_CONST_U16)
+	{
+		vint4 colori(scb.constant_color);
 
-	// True if any non-identity swizzle
-	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
-					 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
+		// Determine the UNORM8 rounding on the decode
+		vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
 
-	// True if any swizzle uses Z reconstruct
-	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
-				   (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
+		// The real decoder would just use the top 8 bits, but we rescale
+		// into a 16-bit value that rounds correctly.
+		vint4 colori_u8 = asr<8>(colori) * 257;
+		colori = select(colori, colori_u8, u8_mask);
 
-	int idx = 0;
-	for (uint32_t z = z_start; z < z_end; z++)
+		vint4 colorf16 = unorm16_to_sf16(colori);
+		color = float16_to_float(colorf16);
+	}
+	// FLOAT16 constant color block
+	else
 	{
-		uint8_t* data_slice = output + z * bsd.xdim * bsd.ydim * 4;
-
-		for (uint32_t y = y_start; y < y_end; ++y)
+		switch (config.profile)
 		{
-			for (uint32_t x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
-			{
-				uint32_t max_texels = ASTCENC_SIMD_WIDTH;
-				uint32_t used_texels = astc::min(x_count - x, max_texels);
-
-				// Unaligned load as rows are not always SIMD_WIDTH long
-				vfloat data_r(blk.data_r + idx);
-				vfloat data_g(blk.data_g + idx);
-				vfloat data_b(blk.data_b + idx);
-				vfloat data_a(blk.data_a + idx);
-
-				// Errors are NaN encoded - abort this sample immediately
-				// Branch is OK here - it is almost never true so predicts well
-				vmask nan_mask = data_r != data_r;
-				if (any(nan_mask))
-				{
-					return false;
-				}
+			case ASTCENC_PRF_LDR_SRGB:
+			case ASTCENC_PRF_LDR:
+				return -ERROR_CALC_DEFAULT;
+			case ASTCENC_PRF_HDR_RGB_LDR_A:
+			case ASTCENC_PRF_HDR:
+				// Constant-color block; unpack from FP16 to FP32.
+				color = float16_to_float(vint4(scb.constant_color));
+				break;
+		}
+	}
 
-				vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
-				vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
-				vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
-				vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
+	if (all(blk.data_min == blk.data_max)) // Original block is also constant
+	{
+		vfloat4 color_error = min(abs(blk.origin_texel - color) * 65535.0f, vfloat4(1e15f));
+		return dot_s(color_error * color_error, blk.channel_weight) * bsd.texel_count;
+	}
 
-				if (needs_swz)
-				{
-					vint swizzle_table[7];
-					swizzle_table[ASTCENC_SWZ_0] = vint(0);
-					swizzle_table[ASTCENC_SWZ_1] = vint(255);
-					swizzle_table[ASTCENC_SWZ_R] = data_ri;
-					swizzle_table[ASTCENC_SWZ_G] = data_gi;
-					swizzle_table[ASTCENC_SWZ_B] = data_bi;
-					swizzle_table[ASTCENC_SWZ_A] = data_ai;
-
-					if (needs_z)
-					{
-						vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
-						vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
-						vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
-						data_z = max(data_z, 0.0f);
-						data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
-
-						swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
-					}
-
-					data_ri = swizzle_table[swz.r];
-					data_gi = swizzle_table[swz.g];
-					data_bi = swizzle_table[swz.b];
-					data_ai = swizzle_table[swz.a];
-				}
+	vfloatacc summav = vfloatacc::zero();
+	vint lane_id = vint::lane_id();
+	uint32_t texel_count = bsd.texel_count;
 
-				vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
-				vmask store_mask = vint::lane_id() < vint(used_texels);
-				store_lanes_masked(data_slice + idx * 4, data_rgbai, store_mask);
+	vfloat color_r(color.lane<0>() * 65535.0f);
+	vfloat color_g(color.lane<1>() * 65535.0f);
+	vfloat color_b(color.lane<2>() * 65535.0f);
+	vfloat color_a(color.lane<3>() * 65535.0f);
 
-				idx += used_texels;
-			}
+	for (uint32_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat color_orig_r = loada(blk.data_r + i);
+		vfloat color_orig_g = loada(blk.data_g + i);
+		vfloat color_orig_b = loada(blk.data_b + i);
+		vfloat color_orig_a = loada(blk.data_a + i);
+
+		vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
+		vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
+		vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
+		vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
+
+		// Compute squared error metric
+		color_error_r = color_error_r * color_error_r;
+		color_error_g = color_error_g * color_error_g;
+		color_error_b = color_error_b * color_error_b;
+		color_error_a = color_error_a * color_error_a;
+
+		vfloat metric = color_error_r * blk.channel_weight.lane<0>()
+					  + color_error_g * blk.channel_weight.lane<1>()
+					  + color_error_b * blk.channel_weight.lane<2>()
+					  + color_error_a * blk.channel_weight.lane<3>();
+
+		// Mask off bad lanes
+		vmask mask = lane_id < vint(texel_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+		haccumulate(summav, metric, mask);
+	}
 
-			if (x_nudge)
-			{
-				memset(data_slice + idx * 4, ASTCENC_RDO_PADDING_PIXEL, x_nudge * 4);
-				idx += x_nudge;
-			}
-		}
+	return hadd_s(summav);
+}
+#else
+static float compute_block_mse(
+	const image_block& orig,
+	const image_block& cmp,
+	float orig_scale,
+	float cmp_scale
+) {
+	vfloatacc summav = vfloatacc::zero();
+	vint lane_id = vint::lane_id();
+	uint32_t texel_count = orig.texel_count;
 
-		if (y_nudge)
-		{
-			memset(data_slice + idx * 4, ASTCENC_RDO_PADDING_PIXEL, y_nudge * 4);
-			idx += y_nudge;
-		}
+	for (uint32_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat color_orig_r = loada(orig.data_r + i) * orig_scale;
+		vfloat color_orig_g = loada(orig.data_g + i) * orig_scale;
+		vfloat color_orig_b = loada(orig.data_b + i) * orig_scale;
+		vfloat color_orig_a = loada(orig.data_a + i) * orig_scale;
+
+		vfloat color_cmp_r = loada(cmp.data_r + i) * cmp_scale;
+		vfloat color_cmp_g = loada(cmp.data_g + i) * cmp_scale;
+		vfloat color_cmp_b = loada(cmp.data_b + i) * cmp_scale;
+		vfloat color_cmp_a = loada(cmp.data_a + i) * cmp_scale;
+
+		vfloat color_error_r = min(abs(color_orig_r - color_cmp_r), vfloat(1e15f));
+		vfloat color_error_g = min(abs(color_orig_g - color_cmp_g), vfloat(1e15f));
+		vfloat color_error_b = min(abs(color_orig_b - color_cmp_b), vfloat(1e15f));
+		vfloat color_error_a = min(abs(color_orig_a - color_cmp_a), vfloat(1e15f));
+
+		// Compute squared error metric
+		color_error_r = color_error_r * color_error_r;
+		color_error_g = color_error_g * color_error_g;
+		color_error_b = color_error_b * color_error_b;
+		color_error_a = color_error_a * color_error_a;
+
+		vfloat metric = color_error_r * orig.channel_weight.lane<0>()
+					  + color_error_g * orig.channel_weight.lane<1>()
+					  + color_error_b * orig.channel_weight.lane<2>()
+					  + color_error_a * orig.channel_weight.lane<3>();
+
+		// Mask off bad lanes
+		vmask mask = lane_id < vint(texel_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+		haccumulate(summav, metric, mask);
 	}
 
-	return true;
+	return hadd_s(summav) / texel_count;
 }
+#endif
 
 struct local_rdo_context
 {
-	const astcenc_contexti* ctx = nullptr;
-	uint32_t base_block_idx = 0;
+	const astcenc_contexti* ctx;
+	uint32_t base_offset;
 };
 
-static bool unpack_block(
-	const void* block,
-	ert::color_rgba* pixels,
+static float compute_block_difference(
+	void* user_data,
+	const uint8_t* pcb,
 	uint32_t local_block_idx,
-	uint32_t& active_x,
-	uint32_t& active_y,
-	void* user_data
+	float* out_max_std_dev
 ) {
 	const local_rdo_context& local_ctx = *(local_rdo_context*)user_data;
 	const astcenc_contexti& ctx = *local_ctx.ctx;
+
+	symbolic_compressed_block scb;
+	physical_to_symbolic(*ctx.bsd, pcb, scb);
+
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		// Trial blocks may not be valid at all
+		return -ERROR_CALC_DEFAULT;
+	}
+
 	const astcenc_rdo_context& rdo_ctx = *ctx.rdo_context;
-	uint32_t block_idx = local_ctx.base_block_idx + local_block_idx;
+	uint32_t block_idx = local_block_idx + local_ctx.base_offset;
+	const image_block& blk = rdo_ctx.m_blocks[block_idx];
 
-	uint32_t block_dim_x = ctx.bsd->xdim;
-	uint32_t block_dim_y = ctx.bsd->ydim;
-	uint32_t block_dim_z = ctx.bsd->zdim;
-	assert(block_idx < rdo_ctx.m_xblocks * rdo_ctx.m_yblocks * rdo_ctx.m_zblocks);
+	if (out_max_std_dev)
+	{
+		// ERT expects texel values to be in [0, 255]
+		*out_max_std_dev = compute_block_std_dev(blk, 255.0f / 65535.0f);
+	}
 
+#if ASTCENC_RDO_SPECIALIZE_DIFF
+	float squared_error = 0.0f;
+
+	if (scb.block_type != SYM_BTYPE_NONCONST)
+		squared_error = compute_symbolic_block_difference_constant(ctx.config, *ctx.bsd, scb, blk);
+	else if (ctx.bsd->get_block_mode(scb.block_mode).is_dual_plane)
+		squared_error = compute_symbolic_block_difference_2plane(ctx.config, *ctx.bsd, scb, blk);
+	else if (ctx.bsd->get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1)
+		squared_error = compute_symbolic_block_difference_1plane_1partition(ctx.config, *ctx.bsd, scb, blk);
+	else
+		squared_error = compute_symbolic_block_difference_1plane(ctx.config, *ctx.bsd, scb, blk);
+
+	// ERT expects texel values to be in [0, 255]
+	return squared_error / blk.texel_count * sqr(255.0f / 65535.0f);
+#else
 	uint32_t block_z = block_idx / (rdo_ctx.m_xblocks * rdo_ctx.m_yblocks);
 	uint32_t slice_idx = block_idx - block_z * rdo_ctx.m_xblocks * rdo_ctx.m_yblocks;
 	uint32_t block_y = slice_idx / rdo_ctx.m_xblocks;
 	uint32_t block_x = slice_idx - block_y * rdo_ctx.m_xblocks;
-	assert(block_y * rdo_ctx.m_xblocks + block_x == slice_idx);
 
-	uint8_t pcb[ASTCENC_BYTES_PER_BLOCK];
-	memcpy(pcb, block, sizeof(pcb));
-	symbolic_compressed_block scb;
-	physical_to_symbolic(*ctx.bsd, pcb, scb);
+	image_block decoded_blk;
+	decoded_blk.decode_unorm8 = blk.decode_unorm8;
+	decoded_blk.texel_count = blk.texel_count;
+	decoded_blk.channel_weight = blk.channel_weight;
 
-	if (scb.block_type == SYM_BTYPE_ERROR)
-	{
-		return false;
-	}
-
-	image_block blk;
 	decompress_symbolic_block(ctx.config.profile, *ctx.bsd,
-		block_x * block_dim_x, block_y * block_dim_y, block_z * block_dim_z,
-		scb, blk);
+							  block_x * ctx.bsd->xdim, block_y * ctx.bsd->ydim, block_z * ctx.bsd->zdim,
+							  scb, decoded_blk);
 
-	return store_image_block_u8(reinterpret_cast<uint8_t*>(pixels), blk, *ctx.bsd,
-		block_x * block_dim_x, block_y * block_dim_y, block_z * block_dim_z,
-		rdo_ctx.m_image_dimx, rdo_ctx.m_image_dimy, rdo_ctx.m_image_dimz, rdo_ctx.m_swizzle, active_x, active_y);
+	// ERT expects texel values to be in [0, 255]
+	return compute_block_mse(blk, decoded_blk, 255.0f / 65535.0f, 255.0f);
+#endif
 }
 
 void rate_distortion_optimize(
@@ -389,7 +401,7 @@ void rate_distortion_optimize(
 	const astcenc_swizzle& swizzle,
 	uint8_t* buffer
 ) {
-	if (ctxo.context.config.rdo_level == 0.0f || image.data_type != ASTCENC_TYPE_U8 || ctxo.context.bsd->zdim > 1)
+	if (!ctxo.context.config.rdo_enabled)
 	{
 		return;
 	}
@@ -397,18 +409,19 @@ void rate_distortion_optimize(
 	// Only the first thread actually runs the initializer
 	ctxo.manage_rdo.init([&ctxo, &image, &swizzle]
 		{
-			// if (image.data_type == ASTCENC_TYPE_F16) return init_rdo_context<uint16_t>(ctxo.context, image, swizzle);
-			// if (image.data_type == ASTCENC_TYPE_F32) return init_rdo_context<uint32_t>(ctxo.context, image, swizzle);
-			return init_rdo_context<uint8_t>(ctxo.context, image, swizzle);
+			return init_rdo_context(ctxo.context, image, swizzle);
 		},
 		ctxo.context.config.progress_callback ? rdo_progress_emitter : nullptr);
 
-	uint32_t block_dim_x = ctxo.context.bsd->xdim;
-	uint32_t block_dim_y = ctxo.context.bsd->ydim;
-	uint32_t block_dim_z = ctxo.context.bsd->zdim;
-	uint32_t texels_per_block = block_dim_x * block_dim_y * block_dim_z;
 	astcenc_rdo_context& rdo_ctx = *ctxo.context.rdo_context;
-	uint32_t blocks_per_task = rdo_ctx.m_ert_params.m_lookback_window_size / ASTCENC_BYTES_PER_BLOCK;
+	uint32_t blocks_per_task = rdo_ctx.m_total_blocks;
+	if (!ctxo.context.config.rdo_no_multithreading)
+	{
+		blocks_per_task = astc::min(rdo_ctx.m_ert_params.m_lookback_window_size / ASTCENC_BYTES_PER_BLOCK, rdo_ctx.m_total_blocks);
+		// There is no way to losslessly partition the job (sequential dependency on previous output)
+		// So we reserve only one task for each thread to minimize the quality impact.
+		blocks_per_task = astc::max(blocks_per_task, (rdo_ctx.m_total_blocks - 1) / ctxo.context.thread_count + 1);
+	}
 
 	uint32_t total_modified = 0;
 	while (true)
@@ -420,13 +433,11 @@ void rate_distortion_optimize(
 			break;
 		}
 
-		local_rdo_context local_context{ &ctxo.context, base };
+		local_rdo_context local_ctx{ &ctxo.context, base };
 
-		ert::reduce_entropy(buffer + base * ASTCENC_BYTES_PER_BLOCK, count, ASTCENC_BYTES_PER_BLOCK, ASTCENC_BYTES_PER_BLOCK,
-			block_dim_x, block_dim_y, rdo_ctx.m_component_count, 
-			reinterpret_cast<const ert::color_rgba*>(rdo_ctx.m_block_pixels.data() + base * texels_per_block * rdo_ctx.m_bytes_per_texel),
-			rdo_ctx.m_ert_params, total_modified, unpack_block, &local_context
-		);
+		ert::reduce_entropy(buffer + base * ASTCENC_BYTES_PER_BLOCK, count,
+							ASTCENC_BYTES_PER_BLOCK, ASTCENC_BYTES_PER_BLOCK,
+							rdo_ctx.m_ert_params, total_modified, compute_block_difference, &local_ctx);
 
 		ctxo.manage_rdo.complete_task_assignment(count);
 	}
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index ed27c0492..67c905493 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1175,33 +1175,65 @@ static int edit_astcenc_config(
 			argidx += 1;
 			cli_config.diagnostic_images = true;
 		}
-		else if (!strcmp(argv[argidx], "-rdo-level"))
+		else if (!strcmp(argv[argidx], "-rdo"))
+		{
+			argidx += 1;
+			config.rdo_enabled = true;
+
+			// Unpacking RDO trial blocks requires full initialization
+			if (config.rdo_quality > 0.0f && static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY))
+			{
+				config.flags &= ~ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
+			}
+		}
+		else if (!strcmp(argv[argidx], "-rdo-no-multithreading"))
+		{
+			argidx += 1;
+			config.rdo_no_multithreading = true;
+		}
+		else if (!strcmp(argv[argidx], "-rdo-quality"))
 		{
 			argidx += 2;
 			if (argidx > argc)
 			{
-				print_error("ERROR: -rdo-level switch with no argument\n");
+				print_error("ERROR: -rdo-quality switch with no argument\n");
 				return 1;
 			}
 
-			config.rdo_level = static_cast<float>(atof(argv[argidx - 1]));
+			config.rdo_quality = static_cast<float>(atof(argv[argidx - 1]));
+		}
+		else if (!strcmp(argv[argidx], "-rdo-dict-size"))
+		{
+			argidx += 2;
+			if (argidx > argc)
+			{
+				print_error("ERROR: -rdo-dict-size switch with no argument\n");
+				return 1;
+			}
 
-			// Unpacking RDO trials blocks requires full initialization
-			if (config.rdo_level > 0.0f && static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY))
+			config.rdo_dict_size = atoi(argv[argidx - 1]);
+		}
+		else if (!strcmp(argv[argidx], "-rdo-max-smooth-block-error-scale"))
+		{
+			argidx += 2;
+			if (argidx > argc)
 			{
-				config.flags &= ~ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
+				print_error("ERROR: -rdo-max-smooth-block-error-scale switch with no argument\n");
+				return 1;
 			}
+
+			config.rdo_max_smooth_block_error_scale = static_cast<float>(atof(argv[argidx - 1]));
 		}
-		else if (!strcmp(argv[argidx], "-rdo-lookback"))
+		else if (!strcmp(argv[argidx], "-rdo-max-smooth-block-std-dev"))
 		{
 			argidx += 2;
 			if (argidx > argc)
 			{
-				print_error("ERROR: -rdo-lookback switch with no argument\n");
+				print_error("ERROR: -rdo-max-smooth-block-std-dev switch with no argument\n");
 				return 1;
 			}
 
-			config.rdo_lookback = atoi(argv[argidx - 1]);
+			config.rdo_max_smooth_block_std_dev = static_cast<float>(atof(argv[argidx - 1]));
 		}
 		else // check others as well
 		{
@@ -1293,8 +1325,12 @@ static void print_astcenc_config(
 		printf("    Candidate cutoff:           %u candidates\n", config.tune_candidate_limit);
 		printf("    Refinement cutoff:          %u iterations\n", config.tune_refinement_limit);
 		printf("    Compressor thread count:    %d\n", cli_config.thread_count);
-		printf("    Rate-distortion level:      %g\n", static_cast<double>(config.rdo_level));
-		printf("    Rate-distortion lookback:   %u blocks\n", config.rdo_lookback);
+		printf("    RDO:                        %s\n", config.rdo_enabled ? "Enabled" : "Disabled");
+		printf("    RDO multithreading:         %s\n", config.rdo_no_multithreading ? "Disabled" : "Enabled");
+		printf("    RDO quality:                %g\n", static_cast<double>(config.rdo_quality));
+		printf("    RDO dictionary size:        %u bytes\n", config.rdo_dict_size);
+		printf("    RDO max error scale:        %g\n", static_cast<double>(config.rdo_max_smooth_block_error_scale));
+		printf("    RDO max standard deviation: %g\n", static_cast<double>(config.rdo_max_smooth_block_std_dev));
 		printf("\n");
 	}
 }
diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp
index e2b52dd01..c7266d3cf 100644
--- a/Source/astcenccli_toplevel_help.cpp
+++ b/Source/astcenccli_toplevel_help.cpp
@@ -385,17 +385,32 @@ ADVANCED COMPRESSION
                -thorough     : 0.95
                -verythorough : 0.98
                -exhaustive   : 0.99
-)"
-R"(
-       -rdo-level <factor>
-           Rate-distortion optimization level.
-           Larger values push the postprocessor towards optimizing more for lower rate,
-           and smaller values more for distortion. Default to 0, which skips RDO entirely.
-
-       -rdo-lookback <number>
-           Rate-distortion optimization lookback window size in blocks.
-           The larger this value, the slower the encoder but the higher the quality per LZ compressed bit.
-           Default to 64 blocks.
+
+       -rdo
+           Enable Rate Distortion Optimization (RDO) post-processing.
+
+       -rdo-no-multithreading
+           Disable RDO multithreading (slightly higher compression, deterministic).
+
+       -rdo-quality <factor>
+           RDO quality scalar (lambda). Lower values yield higher
+           quality/larger LZ compressed files, higher values yield lower
+           quality/smaller LZ compressed files. A good range to try is [.2,4].
+           Full range is [.001,50.0]. Default is 1.0.
+
+       -rdo-dict-size <number>
+           RDO dictionary size in bytes. Default is 4096. Lower
+           values=faster, but give less compression. Range is [64,65536].
+
+       -rdo-max-smooth-block-error-scale <factor>
+           RDO max smooth block error scale. Range is [1,300].
+           Default is 10.0, 1.0 is disabled. Larger values suppress more
+           artifacts (and allocate more bits) on smooth blocks.
+
+       -rdo-max-smooth-block-std-dev <factor>
+           RDO max smooth block standard deviation. Range is
+           [.01,65536.0]. Default is 18.0. Larger values expand the range of
+           blocks considered smooth.
 )"
 // This split in the literals is needed for Visual Studio; the compiler
 // will concatenate these two strings together ...
diff --git a/Source/cmake_core.cmake b/Source/cmake_core.cmake
index 4063931ec..12d5c4cce 100644
--- a/Source/cmake_core.cmake
+++ b/Source/cmake_core.cmake
@@ -166,7 +166,7 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_IS_VENEER)
             $<${is_gnu_fe}:-Wall>
             $<${is_gnu_fe}:-Wextra>
             $<${is_gnu_fe}:-Wpedantic>
-            #$<${is_gnu_fe}:-Werror>
+            $<${is_gnu_fe}:-Werror>
             $<${is_gnu_fe}:-Wshadow>
             $<${is_gnu_fe}:-Wdouble-promotion>
             $<${is_clang}:-Wdocumentation>
diff --git a/Source/ert.cpp b/Source/ert.cpp
index 2cf6c85ad..de48ba8a6 100644
--- a/Source/ert.cpp
+++ b/Source/ert.cpp
@@ -1,14 +1,33 @@
+// SPDX-License-Identifier: Unlicense
+// ----------------------------------------------------------------------------
+// This is free and unencumbered software released into the public domain.
+// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+// software, either in source code form or as a compiled binary, for any purpose,
+// commercial or non-commercial, and by any means.
+// In jurisdictions that recognize copyright laws, the author or authors of this
+// software dedicate any and all copyright interest in the software to the public
+// domain. We make this dedication for the benefit of the public at large and to
+// the detriment of our heirs and successors. We intend this dedication to be an
+// overt act of relinquishment in perpetuity of all present and future rights to
+// this software under copyright law.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+// ----------------------------------------------------------------------------
+
 #include "ert.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
+
+#include <cassert>
 
 #define ERT_FAVOR_CONT_AND_REP0_MATCHES (1)
 #define ERT_FAVOR_REP0_MATCHES (0)
+#define ERT_ENABLE_DEBUG (0)
 
 namespace ert
 {
-	const uint32_t MAX_BLOCK_PIXELS = 12 * 12;
 	const uint32_t MAX_BLOCK_SIZE_IN_BYTES = 256;
 	const uint32_t MIN_MATCH_LEN = 3;
 	const float LITERAL_BITS = 13.0f;
@@ -64,162 +83,6 @@ namespace ert
 		return len_cost + dist_cost;
 	}
 
-	class tracked_stat
-	{
-	public:
-		tracked_stat() { clear(); }
-
-		void clear() { m_num = 0; m_total = 0; m_total2 = 0; }
-
-		void update(uint32_t val) { m_num++; m_total += val; m_total2 += val * val; }
-
-		tracked_stat& operator += (uint32_t val) { update(val); return *this; }
-
-		uint32_t get_number_of_values() { return m_num; }
-		uint64_t get_total() const { return m_total; }
-		uint64_t get_total2() const { return m_total2; }
-
-		float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; };
-		float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; }
-		float get_variance() const { float s = get_std_dev(); return s * s; }
-
-	private:
-		uint32_t m_num;
-		uint64_t m_total;
-		uint64_t m_total2;
-	};
-
-	static inline float compute_block_max_std_dev(const color_rgba* pPixels, uint32_t block_width, uint32_t active_x, uint32_t active_y, uint32_t num_comps)
-	{
-		tracked_stat comp_stats[4];
-
-		for (uint32_t y = 0; y < active_y; y++)
-		{
-			for (uint32_t x = 0; x < active_x; x++)
-			{
-				const color_rgba* pPixel = pPixels + x + y * block_width;
-
-				for (uint32_t c = 0; c < num_comps; c++)
-					comp_stats[c].update(pPixel->m_c[c]);
-			}
-		}
-
-		float max_std_dev = 0.0f;
-		for (uint32_t i = 0; i < num_comps; i++)
-			max_std_dev = std::max(max_std_dev, comp_stats[i].get_std_dev());
-		return max_std_dev;
-	}
-
-	static inline float compute_block_mse(const color_rgba* pPixelsA, const color_rgba* pPixelsB, uint32_t block_width, uint32_t block_height, uint32_t total_block_pixels, uint32_t num_comps, const uint32_t weights[4], float one_over_total_color_weight)
-	{
-		uint64_t total_err = 0;
-
-		if ((block_width == 4) && (block_height == 4) && (num_comps == 4))
-		{
-			if ((weights[0] == 1) && (weights[1] == 1) && (weights[2] == 1) && (weights[3] == 1))
-			{
-				for (uint32_t i = 0; i < 16; i++)
-				{
-					const color_rgba* pA = pPixelsA + i;
-					const color_rgba* pB = pPixelsB + i;
-
-					const int dr = pA->m_c[0] - pB->m_c[0];
-					const int dg = pA->m_c[1] - pB->m_c[1];
-					const int db = pA->m_c[2] - pB->m_c[2];
-					const int da = pA->m_c[3] - pB->m_c[3];
-
-					total_err += dr * dr + dg * dg + db * db + da * da;
-				}
-			}
-			else
-			{
-				for (uint32_t i = 0; i < 16; i++)
-				{
-					const color_rgba* pA = pPixelsA + i;
-					const color_rgba* pB = pPixelsB + i;
-
-					const int dr = pA->m_c[0] - pB->m_c[0];
-					const int dg = pA->m_c[1] - pB->m_c[1];
-					const int db = pA->m_c[2] - pB->m_c[2];
-					const int da = pA->m_c[3] - pB->m_c[3];
-
-					total_err += weights[0] * dr * dr + weights[1] * dg * dg + weights[2] * db * db + weights[3] * da * da;
-				}
-			}
-		}
-		else if ((block_width == 4) && (block_height == 4) && (num_comps == 3))
-		{
-			for (uint32_t y = 0; y < 4; y++)
-			{
-				const uint32_t y_ofs = y * 4;
-				for (uint32_t x = 0; x < 4; x++)
-				{
-					const color_rgba* pA = pPixelsA + x + y_ofs;
-					const color_rgba* pB = pPixelsB + x + y_ofs;
-
-					const int dr = pA->m_c[0] - pB->m_c[0];
-					const int dg = pA->m_c[1] - pB->m_c[1];
-					const int db = pA->m_c[2] - pB->m_c[2];
-
-					total_err += weights[0] * dr * dr + weights[1] * dg * dg + weights[2] * db * db;
-				}
-			}
-		}
-		else if ((block_width == 4) && (block_height == 4) && (num_comps == 2))
-		{
-			for (uint32_t y = 0; y < 4; y++)
-			{
-				const uint32_t y_ofs = y * 4;
-				for (uint32_t x = 0; x < 4; x++)
-				{
-					const color_rgba* pA = pPixelsA + x + y_ofs;
-					const color_rgba* pB = pPixelsB + x + y_ofs;
-
-					const int dr = pA->m_c[0] - pB->m_c[0];
-					const int dg = pA->m_c[1] - pB->m_c[1];
-
-					total_err += weights[0] * dr * dr + weights[1] * dg * dg;
-				}
-			}
-		}
-		else if ((block_width == 4) && (block_height == 4) && (num_comps == 1))
-		{
-			for (uint32_t y = 0; y < 4; y++)
-			{
-				const uint32_t y_ofs = y * 4;
-				for (uint32_t x = 0; x < 4; x++)
-				{
-					const color_rgba* pA = pPixelsA + x + y_ofs;
-					const color_rgba* pB = pPixelsB + x + y_ofs;
-
-					const int dr = pA->m_c[0] - pB->m_c[0];
-
-					total_err += weights[0] * dr * dr;
-				}
-			}
-		}
-		else
-		{
-			for (uint32_t y = 0; y < block_height; y++)
-			{
-				const uint32_t y_ofs = y * block_width;
-				for (uint32_t x = 0; x < block_width; x++)
-				{
-					const color_rgba* pA = pPixelsA + x + y_ofs;
-					const color_rgba* pB = pPixelsB + x + y_ofs;
-
-					for (uint32_t c = 0; c < num_comps; c++)
-					{
-						const int d = pA->m_c[c] - pB->m_c[c];
-						total_err += weights[c] * d * d;
-					}
-				}
-			}
-		}
-
-		return total_err * (one_over_total_color_weight / total_block_pixels);
-	}	
-
 	uint32_t hash_hsieh(const uint8_t* pBuf, size_t len, uint32_t salt)
 	{
 		if (!pBuf || !len)
@@ -277,46 +140,29 @@ namespace ert
 	}
 
 	// BC7 entropy reduction transform with Deflate/LZMA/LZHAM optimizations
-	bool reduce_entropy(void* pBlocks, uint32_t num_blocks,
-		uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes, uint32_t block_width, uint32_t block_height, uint32_t num_comps,
-		const color_rgba* pBlock_pixels, const reduce_entropy_params& params, uint32_t& total_modified,
-		pUnpack_block_func pUnpack_block_func, void* pUnpack_block_func_user_data,
-		std::vector<float>* pBlock_mse_scales)
+	bool reduce_entropy(uint8_t* pBlock_bytes, uint32_t num_blocks,
+		uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes,
+		const reduce_entropy_params& params, uint32_t& total_modified,
+		pDiff_block_func pDiff_block_func, void* pDiff_block_func_user_data,
+		const float* pBlock_mse_scales)
 	{
 		assert(total_block_stride_in_bytes && block_size_to_optimize_in_bytes);
 		assert(total_block_stride_in_bytes >= block_size_to_optimize_in_bytes);
-		
-		assert(num_comps >= 1 && num_comps <= 4);
-		for (uint32_t i = num_comps; i < 4; i++)
-		{
-			assert(!params.m_color_weights[i]);
-			if (params.m_color_weights[i])
-				return false;
-		}
-
-		const uint32_t total_color_weight = params.m_color_weights[0] + params.m_color_weights[1] + params.m_color_weights[2] + params.m_color_weights[3];
-		assert(total_color_weight);
-		const float one_over_total_color_weight = 1.0f / total_color_weight;
 
 		assert((block_size_to_optimize_in_bytes >= MIN_MATCH_LEN) && (block_size_to_optimize_in_bytes <= MAX_BLOCK_SIZE_IN_BYTES));
 		if ((block_size_to_optimize_in_bytes < MIN_MATCH_LEN) || (block_size_to_optimize_in_bytes > MAX_BLOCK_SIZE_IN_BYTES))
 			return false;
 
-		uint8_t* pBlock_bytes = (uint8_t*)pBlocks;
-
-		const uint32_t total_block_pixels = block_width * block_height;
-		if (total_block_pixels > MAX_BLOCK_PIXELS)
-			return false;
-
 		const int total_blocks_to_check = std::max<uint32_t>(1U, params.m_lookback_window_size / total_block_stride_in_bytes);
 
-		std::vector<uint32_t> len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1);
-		std::vector<uint32_t> second_len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1);
+#if ERT_ENABLE_DEBUG
+		uint32_t len_hist[MAX_BLOCK_SIZE_IN_BYTES + 1] = {}; // zero-init: bins are incremented per block and printed; the std::vector this replaces was zero-filled
+		uint32_t second_len_hist[MAX_BLOCK_SIZE_IN_BYTES + 1] = {}; // zero-init for the same reason (second-match length histogram)
 		uint32_t total_second_matches = 0;
+		uint32_t total_smooth_blocks = 0;
+#endif
 
 		int prev_match_window_ofs_to_favor_cont = -1, prev_match_dist_to_favor = -1;
-				
-		uint32_t total_smooth_blocks = 0;
 
 		const uint32_t HASH_SIZE = 8192;
 		uint32_t hash[HASH_SIZE];
@@ -327,36 +173,32 @@ namespace ert
 				memset(hash, 0, sizeof(hash));
 
 			uint8_t* pOrig_block = &pBlock_bytes[block_index * total_block_stride_in_bytes];
-			const color_rgba* pPixels = &pBlock_pixels[block_index * total_block_pixels];
-			uint32_t active_x = 0, active_y = 0;
 
-			color_rgba decoded_block[MAX_BLOCK_PIXELS];
-			if (!(*pUnpack_block_func)(pOrig_block, decoded_block, block_index, active_x, active_y, pUnpack_block_func_user_data))
+			float max_std_dev = 0.0f;
+			float cur_mse = (*pDiff_block_func)(pDiff_block_func_user_data, pOrig_block, block_index, &max_std_dev);
+			if (cur_mse < 0.0f)
 				return false;
 
-			uint32_t current_block_pixels = active_x * active_y;
-			float cur_mse = compute_block_mse(pPixels, decoded_block, block_width, block_height, current_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight);
-
 			if ((params.m_skip_zero_mse_blocks) && (cur_mse == 0.0f))
 				continue;
 
-			const float max_std_dev = compute_block_max_std_dev(pPixels, block_width, active_x, active_y, num_comps);
-			
 			float yl = clampf(max_std_dev / params.m_max_smooth_block_std_dev, 0.0f, 1.0f);
 			yl = yl * yl;
 			float smooth_block_mse_scale = lerp(params.m_smooth_block_max_mse_scale, 1.0f, yl);
 
 			if (pBlock_mse_scales)
 			{
-				if ((*pBlock_mse_scales)[block_index] > 0.0f)
+				if (pBlock_mse_scales[block_index] > 0.0f)
 				{
-					smooth_block_mse_scale = (*pBlock_mse_scales)[block_index];
+					smooth_block_mse_scale = pBlock_mse_scales[block_index];
 				}
 			}
 			
+#if ERT_ENABLE_DEBUG
 			if (smooth_block_mse_scale > 1.0f)
 				total_smooth_blocks++;
-						
+#endif
+
 			float cur_bits = (LITERAL_BITS * block_size_to_optimize_in_bytes);
 			float cur_t = cur_mse * smooth_block_mse_scale + cur_bits * params.m_lambda;
 
@@ -367,7 +209,7 @@ namespace ert
 			memcpy(best_block, pOrig_block, block_size_to_optimize_in_bytes);
 
 			float best_t = cur_t;
-			uint32_t best_match_len = 0, best_match_src_window_ofs = 0, best_match_dst_window_ofs = 0, best_match_src_block_ofs = 0, best_match_dst_block_ofs = 0;
+			uint32_t best_match_len = 0, best_match_src_window_ofs = 0, best_match_dst_window_ofs = 0, best_match_dst_block_ofs = 0;
 			float best_match_bits = 0;
 
 			// Don't let thresh_ms_err be 0 to let zero error blocks have slightly increased distortion
@@ -441,12 +283,10 @@ namespace ert
 								memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes);
 								memcpy(trial_block + dst_ofs, pPrev_blk + src_ofs, len);
 
-								color_rgba decoded_trial_block[MAX_BLOCK_PIXELS];
-								if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, active_x, active_y, pUnpack_block_func_user_data))
+								float trial_mse = (*pDiff_block_func)(pDiff_block_func_user_data, trial_block, block_index, nullptr);
+								if (trial_mse < 0.0f)
 									continue;
 
-								float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, current_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight);
-
 								if (trial_mse < thresh_ms_err)
 								{
 									float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda;
@@ -458,7 +298,6 @@ namespace ert
 										best_match_len = len;
 										best_match_src_window_ofs = src_match_window_ofs;
 										best_match_dst_window_ofs = dst_match_window_ofs;
-										best_match_src_block_ofs = src_ofs;
 										best_match_dst_block_ofs = dst_ofs;
 										best_match_bits = trial_match_bits;
 									}
@@ -527,12 +366,10 @@ namespace ert
 							memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes);
 							memcpy(trial_block + ofs, pPrev_blk + ofs, len);
 
-							color_rgba decoded_trial_block[MAX_BLOCK_PIXELS];
-							if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, active_x, active_y, pUnpack_block_func_user_data))
+							float trial_mse = (*pDiff_block_func)(pDiff_block_func_user_data, trial_block, block_index, nullptr);
+							if (trial_mse < 0.0f)
 								continue;
 
-							float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, current_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight);
-
 							if (trial_mse < thresh_ms_err)
 							{
 								float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda_to_use;
@@ -544,7 +381,6 @@ namespace ert
 									best_match_len = len;
 									best_match_src_window_ofs = src_match_window_ofs;
 									best_match_dst_window_ofs = dst_match_window_ofs;
-									best_match_src_block_ofs = ofs;
 									best_match_dst_block_ofs = ofs;
 									best_match_bits = trial_match_bits_to_use;
 								}
@@ -558,7 +394,7 @@ namespace ert
 
 			if (best_t < cur_t)
 			{
-				uint32_t best_second_match_len = 0, best_second_match_src_window_ofs = 0, best_second_match_dst_window_ofs = 0, best_second_match_src_block_ofs = 0, best_second_match_dst_block_ofs = 0;
+				uint32_t best_second_match_len = 0, best_second_match_src_window_ofs = 0, best_second_match_dst_window_ofs = 0, best_second_match_dst_block_ofs = 0;
 								
 				// Try injecting a second match, being sure it does't overlap with the first.
 				if ((params.m_try_two_matches) && (best_match_len <= (block_size_to_optimize_in_bytes - 3)))
@@ -600,12 +436,10 @@ namespace ert
 								memcpy(trial_block, orig_best_block, block_size_to_optimize_in_bytes);
 								memcpy(trial_block + ofs, pPrev_blk + ofs, len);
 
-								color_rgba decoded_trial_block[MAX_BLOCK_PIXELS];
-								if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, active_x, active_y, pUnpack_block_func_user_data))
+								float trial_mse = (*pDiff_block_func)(pDiff_block_func_user_data, trial_block, block_index, nullptr);
+								if (trial_mse < 0.0f)
 									continue;
 
-								float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, current_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight);
-
 								if (trial_mse < thresh_ms_err)
 								{
 									float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda;
@@ -617,7 +451,6 @@ namespace ert
 										best_second_match_len = len;
 										best_second_match_src_window_ofs = src_match_window_ofs;
 										best_second_match_dst_window_ofs = dst_match_window_ofs;
-										best_second_match_src_block_ofs = ofs;
 										best_second_match_dst_block_ofs = ofs;
 									}
 								}
@@ -670,6 +503,7 @@ namespace ert
 #endif
 				}
 
+#if ERT_ENABLE_DEBUG
 				len_hist[best_match_len]++;
 
 				if (best_second_match_len)
@@ -677,6 +511,7 @@ namespace ert
 					second_len_hist[best_second_match_len]++;
 					total_second_matches++;
 				}
+#endif
 			}
 			else
 			{
@@ -684,7 +519,7 @@ namespace ert
 			}
 						
 		} // block_index
-				
+#if ERT_ENABLE_DEBUG
 		if (params.m_debug_output)
 		{
 			printf("Total smooth blocks: %3.2f%%\n", total_smooth_blocks * 100.0f / num_blocks);
@@ -698,7 +533,7 @@ namespace ert
 			for (uint32_t i = MIN_MATCH_LEN; i <= block_size_to_optimize_in_bytes; i++)
 				printf("%u%c", second_len_hist[i], (i < block_size_to_optimize_in_bytes) ? ',' : '\n');
 		}
-		
+#endif
 		return true;
 	}
 
diff --git a/Source/ert.h b/Source/ert.h
index 4c5e84144..865514440 100644
--- a/Source/ert.h
+++ b/Source/ert.h
@@ -1,19 +1,32 @@
+// SPDX-License-Identifier: Unlicense
+// ----------------------------------------------------------------------------
+// This is free and unencumbered software released into the public domain.
+// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+// software, either in source code form or as a compiled binary, for any purpose,
+// commercial or non-commercial, and by any means.
+// In jurisdictions that recognize copyright laws, the author or authors of this
+// software dedicate any and all copyright interest in the software to the public
+// domain. We make this dedication for the benefit of the public at large and to
+// the detriment of our heirs and successors. We intend this dedication to be an
+// overt act of relinquishment in perpetuity of all present and future rights to
+// this software under copyright law.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+// ----------------------------------------------------------------------------
+
 #pragma once
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
 #include <algorithm>
-#include <assert.h>
-#include <time.h>
-#include <vector>
-#include <string>
+
+// Based on https://github.com/richgel999/bc7enc_rdo/blob/master/ert.h
+// With interface tweaks that can make format integration more general & performant.
 
 namespace ert
 {
-	struct color_rgba { uint8_t m_c[4]; };
-
 	struct reduce_entropy_params
 	{
 		// m_lambda: The post-processor tries to reduce distortion*smooth_block_scale + rate*lambda (rate is approximate LZ bits and distortion is scaled MS error multiplied against the smooth block MSE weighting factor).
@@ -29,8 +42,6 @@ namespace ert
 		float m_max_smooth_block_std_dev;
 		float m_smooth_block_max_mse_scale;
 
-		uint32_t m_color_weights[4];
-				
 		bool m_try_two_matches;
 		bool m_allow_relative_movement;
 		bool m_skip_zero_mse_blocks;
@@ -45,37 +56,42 @@ namespace ert
 			m_max_allowed_rms_increase_ratio = 10.0f;
 			m_max_smooth_block_std_dev = 18.0f;
 			m_smooth_block_max_mse_scale = 10.0f;
-			m_color_weights[0] = 1;
-			m_color_weights[1] = 1;
-			m_color_weights[2] = 1;
-			m_color_weights[3] = 1;
 			m_try_two_matches = false;
 			m_allow_relative_movement = false;
 			m_skip_zero_mse_blocks = false;
 			m_debug_output = false;
 		}
 
-		void print()
+		void print() const
 		{
-			printf("lambda: %f\n", m_lambda);
+			printf("lambda: %f\n", (double)m_lambda);
 			printf("Lookback window size: %u\n", m_lookback_window_size);
-			printf("Max allowed RMS increase ratio: %f\n", m_max_allowed_rms_increase_ratio);
-			printf("Max smooth block std dev: %f\n", m_max_smooth_block_std_dev);
-			printf("Smooth block max MSE scale: %f\n", m_smooth_block_max_mse_scale);
-			printf("Color weights: %u %u %u %u\n", m_color_weights[0], m_color_weights[1], m_color_weights[2], m_color_weights[3]);
+			printf("Max allowed RMS increase ratio: %f\n", (double)m_max_allowed_rms_increase_ratio);
+			printf("Max smooth block std dev: %f\n", (double)m_max_smooth_block_std_dev);
+			printf("Smooth block max MSE scale: %f\n", (double)m_smooth_block_max_mse_scale);
 			printf("Try two matches: %u\n", m_try_two_matches);
 			printf("Allow relative movement: %u\n", m_allow_relative_movement);
 			printf("Skip zero MSE blocks: %u\n", m_skip_zero_mse_blocks);
 		}
 	};
 
-	typedef bool (*pUnpack_block_func)(const void* pBlock, color_rgba* pPixels, uint32_t block_index, uint32_t& active_x, uint32_t& active_y, void* pUser_data);
+	/**
+	 * @brief Callback to compute trial block differences.
+	 *
+	 * All texel values being compared should be in the range [0, 255].
+	 *
+	 * @param[out] out_max_std_dev The channel-wise maximum standard deviation for the original block,
+	 * 	can be @c nullptr if not requested.
+	 *
+	 * @return Should return the mean squared error for current trial block, or any negative value to indicate errors.
+	 */
+	typedef float (*pDiff_block_func)(void* pUser_data, const uint8_t* pBlock, uint32_t block_index, float* out_max_std_dev);
 
 	// BC7 entropy reduction transform with Deflate/LZMA/LZHAM optimizations
-	bool reduce_entropy(void* pBlocks, uint32_t num_blocks,
-		uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes, uint32_t block_width, uint32_t block_height, uint32_t num_comps,
-		const color_rgba* pBlock_pixels, const reduce_entropy_params& params, uint32_t& total_modified,
-		pUnpack_block_func pUnpack_block_func, void* pUnpack_block_func_user_data,
-		std::vector<float>* pBlock_mse_scales = nullptr);
+	bool reduce_entropy(uint8_t* pBlock_bytes, uint32_t num_blocks,
+		uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes,
+		const reduce_entropy_params& params, uint32_t& total_modified,
+		pDiff_block_func pDiff_block_func, void* pDiff_block_func_user_data,
+		const float* pBlock_mse_scales = nullptr);
 
 } // namespace ert

From 0999cf11ba89b396746a4382ee996bc91be0f20a Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Fri, 29 Mar 2024 10:29:20 +0800
Subject: [PATCH 07/16] fix linux build

---
 Source/ert.cpp | 11 ++++++-----
 Source/ert.h   |  6 ++++--
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/Source/ert.cpp b/Source/ert.cpp
index de48ba8a6..75a4f9b66 100644
--- a/Source/ert.cpp
+++ b/Source/ert.cpp
@@ -21,6 +21,7 @@
 #include "ert.h"
 
 #include <cassert>
+#include <cstring>
 
 #define ERT_FAVOR_CONT_AND_REP0_MATCHES (1)
 #define ERT_FAVOR_REP0_MATCHES (0)
@@ -143,7 +144,7 @@ namespace ert
 	bool reduce_entropy(uint8_t* pBlock_bytes, uint32_t num_blocks,
 		uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes,
 		const reduce_entropy_params& params, uint32_t& total_modified,
-		pDiff_block_func pDiff_block_func, void* pDiff_block_func_user_data,
+		diff_block_func_type pDiff_block_func, void* pDiff_block_func_user_data,
 		const float* pBlock_mse_scales)
 	{
 		assert(total_block_stride_in_bytes && block_size_to_optimize_in_bytes);
@@ -175,7 +176,7 @@ namespace ert
 			uint8_t* pOrig_block = &pBlock_bytes[block_index * total_block_stride_in_bytes];
 
 			float max_std_dev = 0.0f;
-			float cur_mse = (*pDiff_block_func)(pDiff_block_func_user_data, pOrig_block, block_index, &max_std_dev);
+			float cur_mse = pDiff_block_func(pDiff_block_func_user_data, pOrig_block, block_index, &max_std_dev);
 			if (cur_mse < 0.0f)
 				return false;
 
@@ -283,7 +284,7 @@ namespace ert
 								memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes);
 								memcpy(trial_block + dst_ofs, pPrev_blk + src_ofs, len);
 
-								float trial_mse = (*pDiff_block_func)(pDiff_block_func_user_data, trial_block, block_index, nullptr);
+								float trial_mse = pDiff_block_func(pDiff_block_func_user_data, trial_block, block_index, nullptr);
 								if (trial_mse < 0.0f)
 									continue;
 
@@ -366,7 +367,7 @@ namespace ert
 							memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes);
 							memcpy(trial_block + ofs, pPrev_blk + ofs, len);
 
-							float trial_mse = (*pDiff_block_func)(pDiff_block_func_user_data, trial_block, block_index, nullptr);
+							float trial_mse = pDiff_block_func(pDiff_block_func_user_data, trial_block, block_index, nullptr);
 							if (trial_mse < 0.0f)
 								continue;
 
@@ -436,7 +437,7 @@ namespace ert
 								memcpy(trial_block, orig_best_block, block_size_to_optimize_in_bytes);
 								memcpy(trial_block + ofs, pPrev_blk + ofs, len);
 
-								float trial_mse = (*pDiff_block_func)(pDiff_block_func_user_data, trial_block, block_index, nullptr);
+								float trial_mse = pDiff_block_func(pDiff_block_func_user_data, trial_block, block_index, nullptr);
 								if (trial_mse < 0.0f)
 									continue;
 
diff --git a/Source/ert.h b/Source/ert.h
index 865514440..ca8ecf378 100644
--- a/Source/ert.h
+++ b/Source/ert.h
@@ -21,6 +21,8 @@
 #pragma once
 
 #include <algorithm>
+#include <cstdint>
+#include <cstdio>
 
 // Based on https://github.com/richgel999/bc7enc_rdo/blob/master/ert.h
 // With interface tweaks that can make format integration more general & performant.
@@ -85,13 +87,13 @@ namespace ert
 	 *
 	 * @return Should return the mean squared error for current trial block, or any negative value to indicate errors.
 	 */
-	typedef float (*pDiff_block_func)(void* pUser_data, const uint8_t* pBlock, uint32_t block_index, float* out_max_std_dev);
+	typedef float diff_block_func_type(void* pUser_data, const uint8_t* pBlock, uint32_t block_index, float* out_max_std_dev);
 
 	// BC7 entropy reduction transform with Deflate/LZMA/LZHAM optimizations
 	bool reduce_entropy(uint8_t* pBlock_bytes, uint32_t num_blocks,
 		uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes,
 		const reduce_entropy_params& params, uint32_t& total_modified,
-		pDiff_block_func pDiff_block_func, void* pDiff_block_func_user_data,
+		diff_block_func_type pDiff_block_func, void* pDiff_block_func_user_data,
 		const float* pBlock_mse_scales = nullptr);
 
 } // namespace ert

From 9eddb96743b132d3c2a05a570b1cc2e84bff6522 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Fri, 29 Mar 2024 12:03:15 +0800
Subject: [PATCH 08/16] handle border pixels in non-specialized diff path

---
 Source/astcenc_rate_distortion.cpp | 42 ++++++++++++++++--------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index 4ed869a0c..663e33fe1 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -29,9 +29,9 @@ struct astcenc_rdo_context
 	std::vector<image_block> m_blocks;
 
 	uint32_t m_total_blocks = 0;
-	uint32_t m_xblocks = 0;
-	uint32_t m_yblocks = 0;
-	uint32_t m_zblocks = 0;
+	uint32_t m_image_x = 0;
+	uint32_t m_image_y = 0;
+	uint32_t m_image_z = 0;
 };
 
 #define ASTCENC_RDO_SPECIALIZE_DIFF 1
@@ -102,9 +102,9 @@ static uint32_t init_rdo_context(
 
 	rdo_ctx.m_blocks.resize(total_blocks);
 	rdo_ctx.m_total_blocks = total_blocks;
-	rdo_ctx.m_xblocks = xblocks;
-	rdo_ctx.m_yblocks = yblocks;
-	rdo_ctx.m_zblocks = zblocks;
+	rdo_ctx.m_image_x = image.dim_x;
+	rdo_ctx.m_image_y = image.dim_y;
+	rdo_ctx.m_image_z = image.dim_z;
 
 	vfloat4 channel_weight = vfloat4(ctx.config.cw_r_weight,
 									 ctx.config.cw_g_weight,
@@ -282,6 +282,10 @@ static float compute_symbolic_block_difference_constant(
 static float compute_block_mse(
 	const image_block& orig,
 	const image_block& cmp,
+	const block_size_descriptor& bsd,
+	uint32_t image_x,
+	uint32_t image_y,
+	uint32_t image_z,
 	float orig_scale,
 	float cmp_scale
 ) {
@@ -289,7 +293,11 @@ static float compute_block_mse(
 	vint lane_id = vint::lane_id();
 	uint32_t texel_count = orig.texel_count;
 
-	for (uint32_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+	uint32_t block_x = astc::min(image_x - orig.xpos, (uint32_t)bsd.xdim);
+	uint32_t block_y = astc::min(image_y - orig.ypos, (uint32_t)bsd.ydim);
+	uint32_t block_z = astc::min(image_z - orig.zpos, (uint32_t)bsd.zdim);
+
+	for (uint32_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH, lane_id += vint(ASTCENC_SIMD_WIDTH))
 	{
 		vfloat color_orig_r = loada(orig.data_r + i) * orig_scale;
 		vfloat color_orig_g = loada(orig.data_g + i) * orig_scale;
@@ -318,12 +326,15 @@ static float compute_block_mse(
 					  + color_error_a * orig.channel_weight.lane<3>();
 
 		// Mask off bad lanes
-		vmask mask = lane_id < vint(texel_count);
-		lane_id += vint(ASTCENC_SIMD_WIDTH);
+		vint lane_id_z(float_to_int(int_to_float(lane_id) / float(bsd.xdim * bsd.ydim)));
+		vint rem_idx = lane_id - lane_id_z * vint(bsd.xdim * bsd.ydim);
+		vint lane_id_y = float_to_int(int_to_float(rem_idx) / bsd.xdim);
+		vint lane_id_x = rem_idx - lane_id_y * vint(bsd.xdim);
+		vmask mask = (lane_id_x < vint(block_x)) & (lane_id_y < vint(block_y)) & (lane_id_z < vint(block_z));
 		haccumulate(summav, metric, mask);
 	}
 
-	return hadd_s(summav) / texel_count;
+	return hadd_s(summav) / (block_x * block_y * block_z);
 }
 #endif
 
@@ -376,22 +387,15 @@ static float compute_block_difference(
 	// ERT expects texel values to be in [0, 255]
 	return squared_error / blk.texel_count * sqr(255.0f / 65535.0f);
 #else
-	uint32_t block_z = block_idx / (rdo_ctx.m_xblocks * rdo_ctx.m_yblocks);
-	uint32_t slice_idx = block_idx - block_z * rdo_ctx.m_xblocks * rdo_ctx.m_yblocks;
-	uint32_t block_y = slice_idx / rdo_ctx.m_xblocks;
-	uint32_t block_x = slice_idx - block_y * rdo_ctx.m_xblocks;
-
 	image_block decoded_blk;
 	decoded_blk.decode_unorm8 = blk.decode_unorm8;
 	decoded_blk.texel_count = blk.texel_count;
 	decoded_blk.channel_weight = blk.channel_weight;
 
-	decompress_symbolic_block(ctx.config.profile, *ctx.bsd,
-							  block_x * ctx.bsd->xdim, block_y * ctx.bsd->ydim, block_z * ctx.bsd->zdim,
-							  scb, decoded_blk);
+	decompress_symbolic_block(ctx.config.profile, *ctx.bsd, blk.xpos, blk.ypos, blk.zpos, scb, decoded_blk);
 
 	// ERT expects texel values to be in [0, 255]
-	return compute_block_mse(blk, decoded_blk, 255.0f / 65535.0f, 255.0f);
+	return compute_block_mse(blk, decoded_blk, *ctx.bsd, rdo_ctx.m_image_x, rdo_ctx.m_image_y, rdo_ctx.m_image_z, 255.0f / 65535.0f, 255.0f);
 #endif
 }
 

From 94cbb7346dd13290709f349eecf4bebcef568671 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Fri, 29 Mar 2024 16:00:07 +0800
Subject: [PATCH 09/16] minor

---
 Source/astcenc_rate_distortion.cpp |  2 +-
 Source/astcenccli_toplevel.cpp     | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index 663e33fe1..e99124904 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -422,7 +422,7 @@ void rate_distortion_optimize(
 	if (!ctxo.context.config.rdo_no_multithreading)
 	{
 		blocks_per_task = astc::min(rdo_ctx.m_ert_params.m_lookback_window_size / ASTCENC_BYTES_PER_BLOCK, rdo_ctx.m_total_blocks);
-		// There is no way to losslessly partition the job (sequential dependency on previous output)
+		// There is no way to losslessly partition the job (sequentially dependent on previous output)
 		// So we reserve only one task for each thread to minimize the quality impact.
 		blocks_per_task = astc::max(blocks_per_task, (rdo_ctx.m_total_blocks - 1) / ctxo.context.thread_count + 1);
 	}
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 67c905493..770fcfd78 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1326,11 +1326,14 @@ static void print_astcenc_config(
 		printf("    Refinement cutoff:          %u iterations\n", config.tune_refinement_limit);
 		printf("    Compressor thread count:    %d\n", cli_config.thread_count);
 		printf("    RDO:                        %s\n", config.rdo_enabled ? "Enabled" : "Disabled");
-		printf("    RDO multithreading:         %s\n", config.rdo_no_multithreading ? "Disabled" : "Enabled");
-		printf("    RDO quality:                %g\n", static_cast<double>(config.rdo_quality));
-		printf("    RDO dictionary size:        %u bytes\n", config.rdo_dict_size);
-		printf("    RDO max error scale:        %g\n", static_cast<double>(config.rdo_max_smooth_block_error_scale));
-		printf("    RDO max standard deviation: %g\n", static_cast<double>(config.rdo_max_smooth_block_std_dev));
+		if (config.rdo_enabled)
+		{
+			printf("    RDO multithreading:         %s\n", config.rdo_no_multithreading ? "Disabled" : "Enabled");
+			printf("    RDO quality:                %g\n", static_cast<double>(config.rdo_quality));
+			printf("    RDO dictionary size:        %u bytes\n", config.rdo_dict_size);
+			printf("    RDO max error scale:        %g\n", static_cast<double>(config.rdo_max_smooth_block_error_scale));
+			printf("    RDO max standard deviation: %g\n", static_cast<double>(config.rdo_max_smooth_block_std_dev));
+		}
 		printf("\n");
 	}
 }

From fb82b855eef87c98bcbf375fe625f1cc1f7ee414 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Sat, 30 Mar 2024 13:27:05 +0800
Subject: [PATCH 10/16] quality & dict size presets

---
 Source/astcenc_entry.cpp            | 51 +++++++++++++++--------------
 Source/astcenccli_toplevel.cpp      |  4 +--
 Source/astcenccli_toplevel_help.cpp | 20 +++++++++--
 3 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/Source/astcenc_entry.cpp b/Source/astcenc_entry.cpp
index 5901a5467..bca4b1261 100644
--- a/Source/astcenc_entry.cpp
+++ b/Source/astcenc_entry.cpp
@@ -56,6 +56,8 @@ struct astcenc_preset_config
 	float tune_3partition_early_out_limit_factor;
 	float tune_2plane_early_out_limit_correlation;
 	float tune_search_mode0_enable;
+	float rdo_quality;
+	unsigned int rdo_dict_size;
 };
 
 /**
@@ -64,22 +66,22 @@ struct astcenc_preset_config
 static const std::array<astcenc_preset_config, 6> preset_configs_high {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
+		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f, 4.0f, 256
 	}, {
 		ASTCENC_PRE_FAST,
-		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
+		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f, 2.0f, 1024
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
+		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f, 1.0f, 4096
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
+		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f, 0.5f, 4096
 	}, {
 		ASTCENC_PRE_VERYTHOROUGH,
-		4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
+		4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f, 0.4f, 4096
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
+		4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f, 0.2f, 4096
 	}
 }};
 
@@ -89,22 +91,22 @@ static const std::array<astcenc_preset_config, 6> preset_configs_high {{
 static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
+		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f, 4.0f, 256
 	}, {
 		ASTCENC_PRE_FAST,
-		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
+		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f, 2.0f, 1024
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
+		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f, 1.0f, 4096
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
+		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f, 0.5f, 4096
 	}, {
 		ASTCENC_PRE_VERYTHOROUGH,
-		4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
+		4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f, 0.4f, 4096
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
+		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f, 0.2f, 4096
 	}
 }};
 
@@ -114,22 +116,22 @@ static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
 static const std::array<astcenc_preset_config, 6> preset_configs_low {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
+		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f, 4.0f, 256
 	}, {
 		ASTCENC_PRE_FAST,
-		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
+		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f, 2.0f, 1024
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
+		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f, 1.0f, 4096
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
+		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f, 0.5f, 4096
 	}, {
 		ASTCENC_PRE_VERYTHOROUGH,
-		4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
+		4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f, 0.4f, 4096
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
+		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f, 0.2f, 4096
 	}
 }};
 
@@ -533,6 +535,8 @@ astcenc_error astcenc_config_init(
 		config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
 		config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
 		config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
+		config.rdo_quality = (*preset_configs)[start].rdo_quality;
+		config.rdo_dict_size = (*preset_configs)[start].rdo_dict_size;
 	}
 	// Start and end node are not the same - so interpolate between them
 	else
@@ -572,11 +576,16 @@ astcenc_error astcenc_config_init(
 		config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
 		config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
 		config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
+		config.rdo_quality = LERP(rdo_quality);
+		config.rdo_dict_size = LERPUI(rdo_dict_size);
 		#undef LERP
 		#undef LERPI
 		#undef LERPUI
 	}
 
+	config.rdo_max_smooth_block_error_scale = 10.0f;
+	config.rdo_max_smooth_block_std_dev = 18.0f;
+
 	// Set heuristics to the defaults for each color profile
 	config.cw_r_weight = 1.0f;
 	config.cw_g_weight = 1.0f;
@@ -655,12 +664,6 @@ astcenc_error astcenc_config_init(
 	}
 	config.flags = flags;
 
-	// These are the reasonable defaults that leverages performance & compression rate
-	config.rdo_quality = 1.0f;
-	config.rdo_dict_size = 4096;
-	config.rdo_max_smooth_block_error_scale = 10.0f;
-	config.rdo_max_smooth_block_std_dev = 18.0f;
-
 	return ASTCENC_SUCCESS;
 }
 
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 770fcfd78..6e7146eb8 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1181,7 +1181,7 @@ static int edit_astcenc_config(
 			config.rdo_enabled = true;
 
 			// Unpacking RDO trials blocks requires full initialization
-			if (config.rdo_quality > 0.0f && static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY))
+			if (static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY))
 			{
 				config.flags &= ~ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
 			}
@@ -1325,7 +1325,7 @@ static void print_astcenc_config(
 		printf("    Candidate cutoff:           %u candidates\n", config.tune_candidate_limit);
 		printf("    Refinement cutoff:          %u iterations\n", config.tune_refinement_limit);
 		printf("    Compressor thread count:    %d\n", cli_config.thread_count);
-		printf("    RDO:                        %s\n", config.rdo_enabled ? "Enabled" : "Disabled");
+		printf("    Rate-distortion opt:        %s\n", config.rdo_enabled ? "Enabled" : "Disabled");
 		if (config.rdo_enabled)
 		{
 			printf("    RDO multithreading:         %s\n", config.rdo_no_multithreading ? "Disabled" : "Enabled");
diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp
index c7266d3cf..6a0b906b0 100644
--- a/Source/astcenccli_toplevel_help.cpp
+++ b/Source/astcenccli_toplevel_help.cpp
@@ -396,11 +396,25 @@ ADVANCED COMPRESSION
            RDO quality scalar (lambda). Lower values yield higher
            quality/larger LZ compressed files, higher values yield lower
            quality/smaller LZ compressed files. A good range to try is [.2,4].
-           Full range is [.001,50.0]. Default is 1.0.
+           Full range is [.001,50.0]. Preset defaults are:
+
+               -fastest      : 4.0
+               -fast         : 2.0
+               -medium       : 1.0
+               -thorough     : 0.5
+               -verythorough : 0.4
+               -exhaustive   : 0.2
 
        -rdo-dict-size <number>
-           RDO dictionary size in bytes. Default is 4096. Lower
-           values=faster, but give less compression. Range is [64,65536].
+           RDO dictionary size in bytes. Lower values=faster,
+           but give less compression. Range is [64,65536]. Preset defaults are:
+
+               -fastest      : 256
+               -fast         : 1024
+               -medium       : 4096
+               -thorough     : 4096
+               -verythorough : 4096
+               -exhaustive   : 4096
 
        -rdo-max-smooth-block-error-scale <factor>
            RDO max smooth block error scale. Range is [1,300].

From 0343ce0f40a6bd48d680c45065c4f22c1968c4ea Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Sat, 30 Mar 2024 16:39:48 +0800
Subject: [PATCH 11/16] handle more corner cases

---
 Source/astcenc_rate_distortion.cpp | 38 +++++++++++++++---------------
 Source/ert.cpp                     |  3 +--
 2 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index e99124904..a7706a1f8 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -25,10 +25,8 @@
 struct astcenc_rdo_context
 {
 	ert::reduce_entropy_params m_ert_params;
-	std::atomic<uint32_t> m_total_modified;
 	std::vector<image_block> m_blocks;
 
-	uint32_t m_total_blocks = 0;
 	uint32_t m_image_x = 0;
 	uint32_t m_image_y = 0;
 	uint32_t m_image_z = 0;
@@ -101,7 +99,6 @@ static uint32_t init_rdo_context(
 	ert_params.m_try_two_matches = true;
 
 	rdo_ctx.m_blocks.resize(total_blocks);
-	rdo_ctx.m_total_blocks = total_blocks;
 	rdo_ctx.m_image_x = image.dim_x;
 	rdo_ctx.m_image_y = image.dim_y;
 	rdo_ctx.m_image_z = image.dim_z;
@@ -356,11 +353,10 @@ static float compute_block_difference(
 	symbolic_compressed_block scb;
 	physical_to_symbolic(*ctx.bsd, pcb, scb);
 
-	if (scb.block_type == SYM_BTYPE_ERROR)
-	{
-		// Trial blocks may not be valid at all
-		return -ERROR_CALC_DEFAULT;
-	}
+	// Trial blocks may not be valid at all
+	if (scb.block_type == SYM_BTYPE_ERROR) return -ERROR_CALC_DEFAULT;
+	bool is_dual_plane = scb.block_type == SYM_BTYPE_NONCONST && ctx.bsd->get_block_mode(scb.block_mode).is_dual_plane;
+	if (is_dual_plane && scb.partition_count != 1) return -ERROR_CALC_DEFAULT;
 
 	const astcenc_rdo_context& rdo_ctx = *ctx.rdo_context;
 	uint32_t block_idx = local_block_idx + local_ctx.base_offset;
@@ -377,9 +373,9 @@ static float compute_block_difference(
 
 	if (scb.block_type != SYM_BTYPE_NONCONST)
 		squared_error = compute_symbolic_block_difference_constant(ctx.config, *ctx.bsd, scb, blk);
-	else if (ctx.bsd->get_block_mode(scb.block_mode).is_dual_plane)
+	else if (is_dual_plane)
 		squared_error = compute_symbolic_block_difference_2plane(ctx.config, *ctx.bsd, scb, blk);
-	else if (ctx.bsd->get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1)
+	else if (scb.partition_count == 1)
 		squared_error = compute_symbolic_block_difference_1plane_1partition(ctx.config, *ctx.bsd, scb, blk);
 	else
 		squared_error = compute_symbolic_block_difference_1plane(ctx.config, *ctx.bsd, scb, blk);
@@ -417,14 +413,19 @@ void rate_distortion_optimize(
 		},
 		ctxo.context.config.progress_callback ? rdo_progress_emitter : nullptr);
 
-	astcenc_rdo_context& rdo_ctx = *ctxo.context.rdo_context;
-	uint32_t blocks_per_task = rdo_ctx.m_total_blocks;
-	if (!ctxo.context.config.rdo_no_multithreading)
+	const astcenc_contexti& ctx = ctxo.context;
+	uint32_t xblocks = (image.dim_x + ctx.bsd->xdim - 1u) / ctx.bsd->xdim;
+	uint32_t yblocks = (image.dim_y + ctx.bsd->ydim - 1u) / ctx.bsd->ydim;
+	uint32_t zblocks = (image.dim_z + ctx.bsd->zdim - 1u) / ctx.bsd->zdim;
+	uint32_t total_blocks = xblocks * yblocks * zblocks;
+
+	uint32_t blocks_per_task = total_blocks;
+	if (!ctx.config.rdo_no_multithreading)
 	{
-		blocks_per_task = astc::min(rdo_ctx.m_ert_params.m_lookback_window_size / ASTCENC_BYTES_PER_BLOCK, rdo_ctx.m_total_blocks);
+		blocks_per_task = astc::min(ctx.config.rdo_dict_size / ASTCENC_BYTES_PER_BLOCK, total_blocks);
 		// There is no way to losslessly partition the job (sequentially dependent on previous output)
 		// So we reserve only one task for each thread to minimize the quality impact.
-		blocks_per_task = astc::max(blocks_per_task, (rdo_ctx.m_total_blocks - 1) / ctxo.context.thread_count + 1);
+		blocks_per_task = astc::max(blocks_per_task, (total_blocks - 1) / ctx.thread_count + 1);
 	}
 
 	uint32_t total_modified = 0;
@@ -437,17 +438,16 @@ void rate_distortion_optimize(
 			break;
 		}
 
-		local_rdo_context local_ctx{ &ctxo.context, base };
+		local_rdo_context local_ctx{ &ctx, base };
 
 		ert::reduce_entropy(buffer + base * ASTCENC_BYTES_PER_BLOCK, count,
 							ASTCENC_BYTES_PER_BLOCK, ASTCENC_BYTES_PER_BLOCK,
-							rdo_ctx.m_ert_params, total_modified, compute_block_difference, &local_ctx);
+							ctx.rdo_context->m_ert_params, total_modified,
+							compute_block_difference, &local_ctx);
 
 		ctxo.manage_rdo.complete_task_assignment(count);
 	}
 
-	rdo_ctx.m_total_modified.fetch_add(total_modified, std::memory_order_relaxed);
-
 	// Wait for rdo to complete before freeing memory
 	ctxo.manage_rdo.wait();
 
diff --git a/Source/ert.cpp b/Source/ert.cpp
index 75a4f9b66..ce065d6de 100644
--- a/Source/ert.cpp
+++ b/Source/ert.cpp
@@ -400,8 +400,7 @@ namespace ert
 				// Try injecting a second match, being sure it does't overlap with the first.
 				if ((params.m_try_two_matches) && (best_match_len <= (block_size_to_optimize_in_bytes - 3)))
 				{
-					uint8_t matched_flags[MAX_BLOCK_SIZE_IN_BYTES];
-					memset(matched_flags, 0, sizeof(matched_flags));
+					uint8_t matched_flags[MAX_BLOCK_SIZE_IN_BYTES]{};
 					memset(matched_flags + best_match_dst_block_ofs, 1, best_match_len);
 
 					uint8_t orig_best_block[MAX_BLOCK_SIZE_IN_BYTES];

From 47c99ba82b1331b694c54ed50216a22df59808ef Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Mon, 1 Apr 2024 10:43:19 +0800
Subject: [PATCH 12/16] customize partitions

---
 Source/astcenc.h                    |  5 +++++
 Source/astcenc_rate_distortion.cpp  |  3 ++-
 Source/astcenccli_toplevel.cpp      | 12 ++++++++++++
 Source/astcenccli_toplevel_help.cpp |  4 ++++
 4 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/Source/astcenc.h b/Source/astcenc.h
index dc99bc64a..647ab0ee5 100644
--- a/Source/astcenc.h
+++ b/Source/astcenc.h
@@ -591,6 +591,11 @@ struct astcenc_config
 	 */
 	unsigned int rdo_dict_size;
 
+	/**
+	 * @brief RDO task partitions.
+	 */
+	unsigned int rdo_partitions;
+
 	/**
 	 * @brief RDO max smooth block error scale.
 	 */
diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index a7706a1f8..beba48e80 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -425,7 +425,8 @@ void rate_distortion_optimize(
 		blocks_per_task = astc::min(ctx.config.rdo_dict_size / ASTCENC_BYTES_PER_BLOCK, total_blocks);
 		// There is no way to losslessly partition the job (sequentially dependent on previous output)
 		// So we reserve only one task for each thread to minimize the quality impact.
-		blocks_per_task = astc::max(blocks_per_task, (total_blocks - 1) / ctx.thread_count + 1);
+		uint32_t partitions = ctx.config.rdo_partitions ? ctx.config.rdo_partitions : ctx.thread_count;
+		blocks_per_task = astc::max(blocks_per_task, (total_blocks - 1) / partitions + 1);
 	}
 
 	uint32_t total_modified = 0;
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 6e7146eb8..84303228d 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1213,6 +1213,17 @@ static int edit_astcenc_config(
 
 			config.rdo_dict_size = atoi(argv[argidx - 1]);
 		}
+		else if (!strcmp(argv[argidx], "-rdo-partitions"))
+		{
+			argidx += 2;
+			if (argidx > argc)
+			{
+				print_error("ERROR: -rdo-partitions switch with no argument\n");
+				return 1;
+			}
+
+			config.rdo_partitions = atoi(argv[argidx - 1]);
+		}
 		else if (!strcmp(argv[argidx], "-rdo-max-smooth-block-error-scale"))
 		{
 			argidx += 2;
@@ -1333,6 +1344,7 @@ static void print_astcenc_config(
 			printf("    RDO dictionary size:        %u bytes\n", config.rdo_dict_size);
 			printf("    RDO max error scale:        %g\n", static_cast<double>(config.rdo_max_smooth_block_error_scale));
 			printf("    RDO max standard deviation: %g\n", static_cast<double>(config.rdo_max_smooth_block_std_dev));
+			if (config.rdo_partitions) printf("    RDO partitions:             %u\n", cli_config.thread_count);
 		}
 		printf("\n");
 	}
diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp
index 6a0b906b0..8592487e8 100644
--- a/Source/astcenccli_toplevel_help.cpp
+++ b/Source/astcenccli_toplevel_help.cpp
@@ -416,6 +416,10 @@ ADVANCED COMPRESSION
                -verythorough : 4096
                -exhaustive   : 4096
 
+       -rdo-partitions <number>
+           RDO task partitions. Default to current number of threads.
+           Customize this for a deterministic output regardless of the running hardware.
+
        -rdo-max-smooth-block-error-scale <factor>
            RDO max smooth block error scale. Range is [1,300].
            Default is 10.0, 1.0 is disabled. Larger values suppress more

From 936ac2d805283d34ec93bafd25aa2b7f189cfaf3 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Sun, 7 Apr 2024 12:08:32 +0800
Subject: [PATCH 13/16] lookback param in blocks

---
 Source/astcenc.h                    |  4 +--
 Source/astcenc_entry.cpp            | 48 ++++++++++++++---------------
 Source/astcenc_rate_distortion.cpp  |  4 +--
 Source/astcenccli_toplevel.cpp      | 35 +++++++++++++++------
 Source/astcenccli_toplevel_help.cpp | 28 ++++++++---------
 5 files changed, 65 insertions(+), 54 deletions(-)

diff --git a/Source/astcenc.h b/Source/astcenc.h
index 647ab0ee5..4a00241bd 100644
--- a/Source/astcenc.h
+++ b/Source/astcenc.h
@@ -587,9 +587,9 @@ struct astcenc_config
 	float rdo_quality;
 
 	/**
-	 * @brief RDO dictionary size in bytes.
+	 * @brief RDO lookback size in blocks.
 	 */
-	unsigned int rdo_dict_size;
+	unsigned int rdo_lookback;
 
 	/**
 	 * @brief RDO task partitions.
diff --git a/Source/astcenc_entry.cpp b/Source/astcenc_entry.cpp
index bca4b1261..bd088c855 100644
--- a/Source/astcenc_entry.cpp
+++ b/Source/astcenc_entry.cpp
@@ -56,8 +56,7 @@ struct astcenc_preset_config
 	float tune_3partition_early_out_limit_factor;
 	float tune_2plane_early_out_limit_correlation;
 	float tune_search_mode0_enable;
-	float rdo_quality;
-	unsigned int rdo_dict_size;
+	unsigned int rdo_lookback;
 };
 
 /**
@@ -66,22 +65,22 @@ struct astcenc_preset_config
 static const std::array<astcenc_preset_config, 6> preset_configs_high {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f, 4.0f, 256
+		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f, 64
 	}, {
 		ASTCENC_PRE_FAST,
-		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f, 2.0f, 1024
+		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f, 128
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f, 1.0f, 4096
+		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f, 256
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f, 0.5f, 4096
+		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f, 256
 	}, {
 		ASTCENC_PRE_VERYTHOROUGH,
-		4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f, 0.4f, 4096
+		4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f, 256
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f, 0.2f, 4096
+		4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f, 256
 	}
 }};
 
@@ -91,22 +90,22 @@ static const std::array<astcenc_preset_config, 6> preset_configs_high {{
 static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f, 4.0f, 256
+		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f, 64
 	}, {
 		ASTCENC_PRE_FAST,
-		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f, 2.0f, 1024
+		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f, 128
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f, 1.0f, 4096
+		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f, 256
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f, 0.5f, 4096
+		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f, 256
 	}, {
 		ASTCENC_PRE_VERYTHOROUGH,
-		4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f, 0.4f, 4096
+		4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f, 256
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f, 0.2f, 4096
+		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f, 256
 	}
 }};
 
@@ -116,22 +115,22 @@ static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
 static const std::array<astcenc_preset_config, 6> preset_configs_low {{
 	{
 		ASTCENC_PRE_FASTEST,
-		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f, 4.0f, 256
+		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f, 64
 	}, {
 		ASTCENC_PRE_FAST,
-		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f, 2.0f, 1024
+		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f, 128
 	}, {
 		ASTCENC_PRE_MEDIUM,
-		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f, 1.0f, 4096
+		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f, 256
 	}, {
 		ASTCENC_PRE_THOROUGH,
-		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f, 0.5f, 4096
+		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f, 256
 	}, {
 		ASTCENC_PRE_VERYTHOROUGH,
-		4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f, 0.4f, 4096
+		4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f, 256
 	}, {
 		ASTCENC_PRE_EXHAUSTIVE,
-		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f, 0.2f, 4096
+		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f, 256
 	}
 }};
 
@@ -415,7 +414,7 @@ static astcenc_error validate_config(
 	config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
 
 	config.rdo_quality = astc::clamp(config.rdo_quality, 0.001f, 50.0f);
-	config.rdo_dict_size = astc::clamp(config.rdo_dict_size, 64u, 65536u);
+	config.rdo_lookback = astc::clamp(config.rdo_lookback, 4u, 4096u);
 	config.rdo_max_smooth_block_error_scale = astc::clamp(config.rdo_max_smooth_block_error_scale, 1.0f, 300.0f);
 	config.rdo_max_smooth_block_std_dev = astc::clamp(config.rdo_max_smooth_block_std_dev, 0.01f, 65536.0f);
 
@@ -535,8 +534,7 @@ astcenc_error astcenc_config_init(
 		config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
 		config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
 		config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
-		config.rdo_quality = (*preset_configs)[start].rdo_quality;
-		config.rdo_dict_size = (*preset_configs)[start].rdo_dict_size;
+		config.rdo_lookback = (*preset_configs)[start].rdo_lookback;
 	}
 	// Start and end node are not the same - so interpolate between them
 	else
@@ -576,13 +574,13 @@ astcenc_error astcenc_config_init(
 		config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
 		config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
 		config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
-		config.rdo_quality = LERP(rdo_quality);
-		config.rdo_dict_size = LERPUI(rdo_dict_size);
+		config.rdo_lookback = LERPUI(rdo_lookback);
 		#undef LERP
 		#undef LERPI
 		#undef LERPUI
 	}
 
+	config.rdo_quality = 0.5f;
 	config.rdo_max_smooth_block_error_scale = 10.0f;
 	config.rdo_max_smooth_block_std_dev = 18.0f;
 
diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index beba48e80..bfeb338ea 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -92,7 +92,7 @@ static uint32_t init_rdo_context(
 	// Generate quality parameters
 	auto& ert_params = rdo_ctx.m_ert_params;
 	ert_params.m_lambda = ctx.config.rdo_quality;
-	ert_params.m_lookback_window_size = ctx.config.rdo_dict_size;
+	ert_params.m_lookback_window_size = ctx.config.rdo_lookback * ASTCENC_BYTES_PER_BLOCK;
 	ert_params.m_smooth_block_max_mse_scale = ctx.config.rdo_max_smooth_block_error_scale;
 	ert_params.m_max_smooth_block_std_dev = ctx.config.rdo_max_smooth_block_std_dev;
 
@@ -422,7 +422,7 @@ void rate_distortion_optimize(
 	uint32_t blocks_per_task = total_blocks;
 	if (!ctx.config.rdo_no_multithreading)
 	{
-		blocks_per_task = astc::min(ctx.config.rdo_dict_size / ASTCENC_BYTES_PER_BLOCK, total_blocks);
+		blocks_per_task = astc::min(ctx.config.rdo_lookback, total_blocks);
 		// There is no way to losslessly partition the job (sequentially dependent on previous output)
 		// So we reserve only one task for each thread to minimize the quality impact.
 		uint32_t partitions = ctx.config.rdo_partitions ? ctx.config.rdo_partitions : ctx.thread_count;
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 84303228d..1727ae3cc 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1179,12 +1179,6 @@ static int edit_astcenc_config(
 		{
 			argidx += 1;
 			config.rdo_enabled = true;
-
-			// Unpacking RDO trials blocks requires full initialization
-			if (static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY))
-			{
-				config.flags &= ~ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
-			}
 		}
 		else if (!strcmp(argv[argidx], "-rdo-no-multithreading"))
 		{
@@ -1200,6 +1194,7 @@ static int edit_astcenc_config(
 				return 1;
 			}
 
+			config.rdo_enabled = true;
 			config.rdo_quality = static_cast<float>(atof(argv[argidx - 1]));
 		}
 		else if (!strcmp(argv[argidx], "-rdo-dict-size"))
@@ -1211,7 +1206,20 @@ static int edit_astcenc_config(
 				return 1;
 			}
 
-			config.rdo_dict_size = atoi(argv[argidx - 1]);
+			config.rdo_enabled = true;
+			config.rdo_lookback = atoi(argv[argidx - 1]) / 16;
+		}
+		else if (!strcmp(argv[argidx], "-rdo-lookback"))
+		{
+			argidx += 2;
+			if (argidx > argc)
+			{
+				print_error("ERROR: -rdo-lookback switch with no argument\n");
+				return 1;
+			}
+
+			config.rdo_enabled = true;
+			config.rdo_lookback = atoi(argv[argidx - 1]);
 		}
 		else if (!strcmp(argv[argidx], "-rdo-partitions"))
 		{
@@ -1222,6 +1230,7 @@ static int edit_astcenc_config(
 				return 1;
 			}
 
+			config.rdo_enabled = true;
 			config.rdo_partitions = atoi(argv[argidx - 1]);
 		}
 		else if (!strcmp(argv[argidx], "-rdo-max-smooth-block-error-scale"))
@@ -1233,6 +1242,7 @@ static int edit_astcenc_config(
 				return 1;
 			}
 
+			config.rdo_enabled = true;
 			config.rdo_max_smooth_block_error_scale = static_cast<float>(atof(argv[argidx - 1]));
 		}
 		else if (!strcmp(argv[argidx], "-rdo-max-smooth-block-std-dev"))
@@ -1244,6 +1254,7 @@ static int edit_astcenc_config(
 				return 1;
 			}
 
+			config.rdo_enabled = true;
 			config.rdo_max_smooth_block_std_dev = static_cast<float>(atof(argv[argidx - 1]));
 		}
 		else // check others as well
@@ -1341,10 +1352,10 @@ static void print_astcenc_config(
 		{
 			printf("    RDO multithreading:         %s\n", config.rdo_no_multithreading ? "Disabled" : "Enabled");
 			printf("    RDO quality:                %g\n", static_cast<double>(config.rdo_quality));
-			printf("    RDO dictionary size:        %u bytes\n", config.rdo_dict_size);
+			printf("    RDO lookback:               %u blocks\n", config.rdo_lookback);
 			printf("    RDO max error scale:        %g\n", static_cast<double>(config.rdo_max_smooth_block_error_scale));
 			printf("    RDO max standard deviation: %g\n", static_cast<double>(config.rdo_max_smooth_block_std_dev));
-			if (config.rdo_partitions) printf("    RDO partitions:             %u\n", cli_config.thread_count);
+			if (config.rdo_partitions) printf("    RDO partitions:             %u\n", config.rdo_partitions);
 		}
 		printf("\n");
 	}
@@ -2070,6 +2081,12 @@ int astcenc_main(
 		return 1;
 	}
 
+	// Unpacking RDO trials blocks requires full initialization
+	if (config.rdo_enabled && static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY))
+	{
+		config.flags &= ~ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
+	}
+
 	// Enable progress callback if not in silent mode and using a terminal
 	#if defined(_WIN32)
 		int stdoutfno = _fileno(stdout);
diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp
index 8592487e8..cf15a07cf 100644
--- a/Source/astcenccli_toplevel_help.cpp
+++ b/Source/astcenccli_toplevel_help.cpp
@@ -396,25 +396,21 @@ ADVANCED COMPRESSION
            RDO quality scalar (lambda). Lower values yield higher
            quality/larger LZ compressed files, higher values yield lower
            quality/smaller LZ compressed files. A good range to try is [.2,4].
-           Full range is [.001,50.0]. Preset defaults are:
+           Full range is [.001,50.0]. Default to 0.5.
 
-               -fastest      : 4.0
-               -fast         : 2.0
-               -medium       : 1.0
-               -thorough     : 0.5
-               -verythorough : 0.4
-               -exhaustive   : 0.2
+       -rdo-lookback <number>
+           RDO look back size in blocks. Lower values=faster,
+           but give less compression. Range is [4,4096]. Preset defaults are:
+
+               -fastest      : 64
+               -fast         : 128
+               -medium       : 256
+               -thorough     : 256
+               -verythorough : 256
+               -exhaustive   : 256
 
        -rdo-dict-size <number>
-           RDO dictionary size in bytes. Lower values=faster,
-           but give less compression. Range is [64,65536]. Preset defaults are:
-
-               -fastest      : 256
-               -fast         : 1024
-               -medium       : 4096
-               -thorough     : 4096
-               -verythorough : 4096
-               -exhaustive   : 4096
+           Same as rdo-lookback, but in bytes.
 
        -rdo-partitions <number>
            RDO task partitions. Default to current number of threads.

From 70b76c275e82601afbbc997953897fb7ae057fa3 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Tue, 9 Apr 2024 11:58:39 +0800
Subject: [PATCH 14/16] rdo-partitions precedes no-multithreading

---
 Source/astcenc.h                    |  5 ----
 Source/astcenc_rate_distortion.cpp  | 38 ++++++-----------------------
 Source/astcenccli_toplevel.cpp      |  6 -----
 Source/astcenccli_toplevel_help.cpp |  3 ---
 4 files changed, 8 insertions(+), 44 deletions(-)

diff --git a/Source/astcenc.h b/Source/astcenc.h
index 4a00241bd..de9752eb2 100644
--- a/Source/astcenc.h
+++ b/Source/astcenc.h
@@ -576,11 +576,6 @@ struct astcenc_config
 	 */
 	bool rdo_enabled;
 
-	/**
-	 * @brief Disable RDO multithreading (slightly higher compression, deterministic).
-	 */
-	bool rdo_no_multithreading;
-
 	/**
 	 * @brief RDO quality scalar (lambda).
 	 */
diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index bfeb338ea..d0770300c 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -38,29 +38,12 @@ static constexpr uint32_t ASTCENC_BYTES_PER_BLOCK = 16;
 
 template<typename T> T sqr(T v) { return v * v; }
 
+extern "C" void progress_emitter(float value);
+
 extern "C" void rdo_progress_emitter(
 	float value
 ) {
 	static float previous_value = 100.0f;
-	const uint32_t bar_size = 25;
-	auto parts = static_cast<uint32_t>(value / 4.0f);
-
-	char buffer[bar_size + 3];
-	buffer[0] = '[';
-
-	for (uint32_t i = 0; i < parts; i++)
-	{
-		buffer[i + 1] = '=';
-	}
-
-	for (uint32_t i = parts; i < bar_size; i++)
-	{
-		buffer[i + 1] = ' ';
-	}
-
-	buffer[bar_size + 1] = ']';
-	buffer[bar_size + 2] = '\0';
-
 	if (previous_value == 100.0f)
 	{
 		printf("\n\n");
@@ -69,8 +52,7 @@ extern "C" void rdo_progress_emitter(
 	}
 	previous_value = value;
 
-	printf("    Progress: %s %03.1f%%\r", buffer, static_cast<double>(value));
-	fflush(stdout);
+	progress_emitter(value);
 }
 
 static uint32_t init_rdo_context(
@@ -419,15 +401,11 @@ void rate_distortion_optimize(
 	uint32_t zblocks = (image.dim_z + ctx.bsd->zdim - 1u) / ctx.bsd->zdim;
 	uint32_t total_blocks = xblocks * yblocks * zblocks;
 
-	uint32_t blocks_per_task = total_blocks;
-	if (!ctx.config.rdo_no_multithreading)
-	{
-		blocks_per_task = astc::min(ctx.config.rdo_lookback, total_blocks);
-		// There is no way to losslessly partition the job (sequentially dependent on previous output)
-		// So we reserve only one task for each thread to minimize the quality impact.
-		uint32_t partitions = ctx.config.rdo_partitions ? ctx.config.rdo_partitions : ctx.thread_count;
-		blocks_per_task = astc::max(blocks_per_task, (total_blocks - 1) / partitions + 1);
-	}
+	uint32_t blocks_per_task = astc::min(ctx.config.rdo_lookback, total_blocks);
+	// There is no way to losslessly partition the job (sequentially dependent on previous output)
+	// So we reserve up to one task for each thread to minimize the quality impact.
+	uint32_t partitions = ctx.config.rdo_partitions ? ctx.config.rdo_partitions : ctx.thread_count;
+	blocks_per_task = astc::max(blocks_per_task, (total_blocks - 1) / partitions + 1);
 
 	uint32_t total_modified = 0;
 	while (true)
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 1727ae3cc..05a6a1778 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1180,11 +1180,6 @@ static int edit_astcenc_config(
 			argidx += 1;
 			config.rdo_enabled = true;
 		}
-		else if (!strcmp(argv[argidx], "-rdo-no-multithreading"))
-		{
-			argidx += 1;
-			config.rdo_no_multithreading = true;
-		}
 		else if (!strcmp(argv[argidx], "-rdo-quality"))
 		{
 			argidx += 2;
@@ -1350,7 +1345,6 @@ static void print_astcenc_config(
 		printf("    Rate-distortion opt:        %s\n", config.rdo_enabled ? "Enabled" : "Disabled");
 		if (config.rdo_enabled)
 		{
-			printf("    RDO multithreading:         %s\n", config.rdo_no_multithreading ? "Disabled" : "Enabled");
 			printf("    RDO quality:                %g\n", static_cast<double>(config.rdo_quality));
 			printf("    RDO lookback:               %u blocks\n", config.rdo_lookback);
 			printf("    RDO max error scale:        %g\n", static_cast<double>(config.rdo_max_smooth_block_error_scale));
diff --git a/Source/astcenccli_toplevel_help.cpp b/Source/astcenccli_toplevel_help.cpp
index cf15a07cf..3aa8767fb 100644
--- a/Source/astcenccli_toplevel_help.cpp
+++ b/Source/astcenccli_toplevel_help.cpp
@@ -389,9 +389,6 @@ ADVANCED COMPRESSION
        -rdo
            Enable Rate Distortion Optimization (RDO) post-processing.
 
-       -rdo-no-multithreading
-           Disable RDO multithreading (slightly higher compression, deterministic).
-
        -rdo-quality <factor>
            RDO quality scalar (lambda). Lower values yield higher
            quality/larger LZ compressed files, higher values yield lower

From c860cc391225cf610f42f88ffdefa6a943468982 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Thu, 23 May 2024 12:16:51 +0800
Subject: [PATCH 15/16] fix alpha weight

---
 Source/astcenc_rate_distortion.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index d0770300c..146abdd57 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -101,6 +101,9 @@ static uint32_t init_rdo_context(
 				blk.decode_unorm8 = ctx.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
 				blk.texel_count = ctx.bsd->texel_count;
 
+				load_image_block(ctx.config.profile, image, blk, *ctx.bsd,
+								 block_dim_x * block_x, block_dim_y * block_y, block_dim_z * block_z, swz);
+
 				if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
 				{
 					float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
@@ -114,9 +117,6 @@ static uint32_t init_rdo_context(
 				{
 					blk.channel_weight = channel_weight;
 				}
-
-				load_image_block(ctx.config.profile, image, blk, *ctx.bsd,
-								 block_dim_x * block_x, block_dim_y * block_y, block_dim_z * block_z, swz);
 			}
 		}
 	}

From 92a278400461ffaa0177f0c35abc5acee8c392a6 Mon Sep 17 00:00:00 2001
From: Yun Hsiao Wu <yunhsiaow@gmail.com>
Date: Sat, 8 Jun 2024 09:51:57 +0800
Subject: [PATCH 16/16] minor

---
 Source/astcenc_rate_distortion.cpp | 48 +++++++++++++++++++++++++++---
 Source/astcenccli_toplevel.cpp     | 14 +++++++--
 Source/ert.cpp                     | 10 +++----
 Source/ert.h                       |  2 +-
 4 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/Source/astcenc_rate_distortion.cpp b/Source/astcenc_rate_distortion.cpp
index 146abdd57..d6ec828f1 100644
--- a/Source/astcenc_rate_distortion.cpp
+++ b/Source/astcenc_rate_distortion.cpp
@@ -38,8 +38,6 @@ static constexpr uint32_t ASTCENC_BYTES_PER_BLOCK = 16;
 
 template<typename T> T sqr(T v) { return v * v; }
 
-extern "C" void progress_emitter(float value);
-
 extern "C" void rdo_progress_emitter(
 	float value
 ) {
@@ -52,7 +50,27 @@ extern "C" void rdo_progress_emitter(
 	}
 	previous_value = value;
 
-	progress_emitter(value);
+	// Render a fixed 25-cell bar, one '=' per 4% of progress. The cell count is pinned to
+	// [0, bar_size] (NaN maps to 0) so the fill loops below cannot write outside the buffer.
+	const unsigned int bar_size = 25;
+	unsigned int parts = (value > 0.0f) ? ((value < 100.0f) ? static_cast<unsigned int>(value / 4.0f) : bar_size) : 0u;
+
+	char buffer[bar_size + 3];
+	buffer[0] = '[';
+
+	for (unsigned int i = 0; i < parts; i++)
+	{
+		buffer[i + 1] = '=';
+	}
+	for (unsigned int i = parts; i < bar_size; i++)
+	{
+		buffer[i + 1] = ' ';
+	}
+	buffer[bar_size + 1] = ']';
+	buffer[bar_size + 2] = '\0';
+
+	printf("    Progress: %s %03.1f%%\r", buffer, static_cast<double>(value));
+	fflush(stdout);
 }
 
 static uint32_t init_rdo_context(
@@ -323,6 +341,27 @@ struct local_rdo_context
 	uint32_t base_offset;
 };
 
+/** @brief True when the low 8 bits of @c v are not all set (i.e. the low byte is not 0xFF). */
+static bool is_transparent(int v) { return (v & 0xFF) != 0xFF; }
+
+/** @brief Test the constant color alpha, or lane 3 of every partition's unpacked endpoints. */
+static bool has_any_transparency(
+	astcenc_profile decode_mode,
+	const symbolic_compressed_block& scb
+) {
+	// Any block type other than SYM_BTYPE_NONCONST is judged by its constant color alone.
+	if (scb.block_type != SYM_BTYPE_NONCONST) return is_transparent(scb.constant_color[3]);
+	bool rgb_flag, alpha_flag;
+	vint4 endpoint_lo, endpoint_hi;
+	for (int part = 0; part < scb.partition_count; ++part)
+	{
+		unpack_color_endpoints(decode_mode, scb.color_formats[part], scb.color_values[part], rgb_flag, alpha_flag, endpoint_lo, endpoint_hi);
+		const bool both_opaque = !is_transparent(endpoint_lo.lane<3>()) && !is_transparent(endpoint_hi.lane<3>());
+		if (!both_opaque) return true;
+	}
+	return false;
+}
+
 static float compute_block_difference(
 	void* user_data,
 	const uint8_t* pcb,
@@ -339,6 +378,7 @@ static float compute_block_difference(
 	if (scb.block_type == SYM_BTYPE_ERROR) return -ERROR_CALC_DEFAULT;
 	bool is_dual_plane = scb.block_type == SYM_BTYPE_NONCONST && ctx.bsd->get_block_mode(scb.block_mode).is_dual_plane;
 	if (is_dual_plane && scb.partition_count != 1) return -ERROR_CALC_DEFAULT;
+	if (ctx.config.cw_a_weight < 0.01f && has_any_transparency(ctx.config.profile, scb)) return -ERROR_CALC_DEFAULT;
 
 	const astcenc_rdo_context& rdo_ctx = *ctx.rdo_context;
 	uint32_t block_idx = local_block_idx + local_ctx.base_offset;
@@ -422,7 +462,7 @@ void rate_distortion_optimize(
 		ert::reduce_entropy(buffer + base * ASTCENC_BYTES_PER_BLOCK, count,
 							ASTCENC_BYTES_PER_BLOCK, ASTCENC_BYTES_PER_BLOCK,
 							ctx.rdo_context->m_ert_params, total_modified,
-							compute_block_difference, &local_ctx);
+							&compute_block_difference, &local_ctx);
 
 		ctxo.manage_rdo.complete_task_assignment(count);
 	}
diff --git a/Source/astcenccli_toplevel.cpp b/Source/astcenccli_toplevel.cpp
index 05a6a1778..41c36a09e 100644
--- a/Source/astcenccli_toplevel.cpp
+++ b/Source/astcenccli_toplevel.cpp
@@ -1209,7 +1209,7 @@ static int edit_astcenc_config(
 			argidx += 2;
 			if (argidx > argc)
 			{
-				print_error("ERROR: -rdo-dict-size switch with no argument\n");
+				print_error("ERROR: -rdo-lookback switch with no argument\n");
 				return 1;
 			}
 
@@ -1655,7 +1655,8 @@ static void print_diagnostic_image(
 static void print_diagnostic_images(
 	astcenc_context* context,
 	const astc_compressed_image& image,
-	const std::string& output_file
+	const std::string& output_file,
+	astcenc_operation operation
 ) {
 	if (image.dim_z != 1)
 	{
@@ -1672,6 +1673,13 @@ static void print_diagnostic_images(
 
 	auto diag_image = alloc_image(8, image.dim_x, image.dim_y, image.dim_z);
 
+	// ---- ---- ---- ---- Compressed Output ---- ---- ---- ----
+	if ((operation & ASTCENC_STAGE_ST_COMP) == 0)
+	{
+		std::string fname = stem + "_diag.astc";
+		store_cimage(image, fname.c_str());
+	}
+
 	// ---- ---- ---- ---- Partitioning ---- ---- ---- ----
 	auto partition_func = [](astcenc_block_info& info, size_t texel_x, size_t texel_y) {
 		const vint4 colors[] {
@@ -2421,7 +2429,7 @@ int astcenc_main(
 	// Store diagnostic images
 	if (cli_config.diagnostic_images && !is_null)
 	{
-		print_diagnostic_images(codec_context, image_comp, output_filename);
+		print_diagnostic_images(codec_context, image_comp, output_filename, operation);
 	}
 
 	free_image(image_uncomp_in);
diff --git a/Source/ert.cpp b/Source/ert.cpp
index ce065d6de..7758cc0c5 100644
--- a/Source/ert.cpp
+++ b/Source/ert.cpp
@@ -144,7 +144,7 @@ namespace ert
 	bool reduce_entropy(uint8_t* pBlock_bytes, uint32_t num_blocks,
 		uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes,
 		const reduce_entropy_params& params, uint32_t& total_modified,
-		diff_block_func_type pDiff_block_func, void* pDiff_block_func_user_data,
+		diff_block_func_type* pDiff_block_func, void* pDiff_block_func_user_data,
 		const float* pBlock_mse_scales)
 	{
 		assert(total_block_stride_in_bytes && block_size_to_optimize_in_bytes);
@@ -176,7 +176,7 @@ namespace ert
 			uint8_t* pOrig_block = &pBlock_bytes[block_index * total_block_stride_in_bytes];
 
 			float max_std_dev = 0.0f;
-			float cur_mse = pDiff_block_func(pDiff_block_func_user_data, pOrig_block, block_index, &max_std_dev);
+			float cur_mse = (*pDiff_block_func)(pDiff_block_func_user_data, pOrig_block, block_index, &max_std_dev);
 			if (cur_mse < 0.0f)
 				return false;
 
@@ -284,7 +284,7 @@ namespace ert
 								memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes);
 								memcpy(trial_block + dst_ofs, pPrev_blk + src_ofs, len);
 
-								float trial_mse = pDiff_block_func(pDiff_block_func_user_data, trial_block, block_index, nullptr);
+								float trial_mse = (*pDiff_block_func)(pDiff_block_func_user_data, trial_block, block_index, nullptr);
 								if (trial_mse < 0.0f)
 									continue;
 
@@ -367,7 +367,7 @@ namespace ert
 							memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes);
 							memcpy(trial_block + ofs, pPrev_blk + ofs, len);
 
-							float trial_mse = pDiff_block_func(pDiff_block_func_user_data, trial_block, block_index, nullptr);
+							float trial_mse = (*pDiff_block_func)(pDiff_block_func_user_data, trial_block, block_index, nullptr);
 							if (trial_mse < 0.0f)
 								continue;
 
@@ -436,7 +436,7 @@ namespace ert
 								memcpy(trial_block, orig_best_block, block_size_to_optimize_in_bytes);
 								memcpy(trial_block + ofs, pPrev_blk + ofs, len);
 
-								float trial_mse = pDiff_block_func(pDiff_block_func_user_data, trial_block, block_index, nullptr);
+								float trial_mse = (*pDiff_block_func)(pDiff_block_func_user_data, trial_block, block_index, nullptr);
 								if (trial_mse < 0.0f)
 									continue;
 
diff --git a/Source/ert.h b/Source/ert.h
index ca8ecf378..1ad1d400a 100644
--- a/Source/ert.h
+++ b/Source/ert.h
@@ -93,7 +93,7 @@ namespace ert
 	bool reduce_entropy(uint8_t* pBlock_bytes, uint32_t num_blocks,
 		uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes,
 		const reduce_entropy_params& params, uint32_t& total_modified,
-		diff_block_func_type pDiff_block_func, void* pDiff_block_func_user_data,
+		diff_block_func_type* pDiff_block_func, void* pDiff_block_func_user_data,
 		const float* pBlock_mse_scales = nullptr);
 
 } // namespace ert