refactor: Extract and clarify the 8x8 idct implementation

Marwes · Marwes · commit 70c1668ae490 · 2020-06-27T20:31:00.000+02:00
This code is extracted from https://github.com/Marwes/combine-jpeg, which was originally based on this repo. While the parser parts are mostly rewritten, parts like the idct, huffman, etc. are mostly the same, though with some performance and (in my opinion) clarity improvements to the code. If this PR is accepted I may try to extract some other parts as well. (While this PR doesn't contain any performance improvements, I will just add that `combine-jpeg` has some significant performance improvements. It still lacks some features, but it is almost, though not quite, as fast as mozjpeg on the images that it accepts (no SIMD for either). Some of this is because of unsafe code, some of it because of less overhead in the decoding.) ``` it_works/combine time: [2.6562 ms 2.6633 ms 2.6749 ms] change: [-7.2890% -1.9145% +1.5656%] (p = 0.68 > 0.05) No change in performance detected. Found 2 outliers among 10 measurements (20.00%) 2 (20.00%) high severe it_works/mozjpeg time: [2.0400 ms 2.0415 ms 2.0429 ms] change: [+0.4243% +0.6680% +0.8305%] (p = 0.00 < 0.05) Change within noise threshold. Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) high mild it_works/jpeg-decoder time: [8.9513 ms 9.1526 ms 9.3654 ms] change: [-13.663% -5.0961% +1.6086%] (p = 0.29 > 0.05) No change in performance detected. Found 2 outliers among 10 measurements (20.00%) 1 (10.00%) low severe 1 (10.00%) high mild ```
diff --git a/src/idct.rs b/src/idct.rs
@@ -2,7 +2,10 @@
 // One example is tests/crashtest/images/imagetestsuite/b0b8914cc5f7a6eff409f16d8cc236c5.jpg
 // That's why wrapping operators are needed.
 use crate::parser::Dimensions;
-use std::num::Wrapping;
+use std::{
+    convert::TryFrom,
+    num::Wrapping,
+};
 
 pub(crate) fn choose_idct_size(full_size: Dimensions, requested_size: Dimensions) -> usize {
     fn scaled(len: u16, scale: usize) -> u16 { ((len as u32 * scale as u32 - 1) / 8 + 1) as u16 }
@@ -28,7 +31,7 @@ fn test_choose_idct_size() {
     assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 685, height: 999}), 2);
     assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 1000, height: 1000}), 2);
     assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 1400, height: 1400}), 4);
-    
+
     assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 5472, height: 3648}), 8);
     assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 16384, height: 16384}), 8);
     assert_eq!(choose_idct_size(Dimensions{width: 1, height: 1}, Dimensions{width: 65535, height: 65535}), 8);
@@ -45,79 +48,74 @@ pub(crate) fn dequantize_and_idct_block(scale: usize, coefficients: &[i16], quan
     }
 }
 
-// This is based on stb_image's 'stbi__idct_block'.
-fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
+pub fn dequantize_and_idct_block_8x8(
+    coefficients: &[i16],
+    quantization_table: &[u16; 64],
+    output_linestride: usize,
+    output: &mut [u8]
+) {
     debug_assert_eq!(coefficients.len(), 64);
+    let output = output
+        .chunks_mut(output_linestride);
+    dequantize_and_idct_block_8x8_inner(coefficients, quantization_table, output)
+}
+
+// This is based on stb_image's 'stbi__idct_block'.
+fn dequantize_and_idct_block_8x8_inner<'a, I>(
+    coefficients: &[i16],
+    quantization_table: &[u16; 64],
+    output: I,
+) where
+    I: IntoIterator<Item = &'a mut [u8]>,
+    I::IntoIter: ExactSizeIterator<Item = &'a mut [u8]>,
+{
+    let output = output.into_iter();
+    debug_assert!(
+        output.len() >= 8,
+        "Output iterator has the wrong length: {}",
+        output.len()
+    );
 
-    let mut temp = [Wrapping(0i32); 64];
+    let mut temp = [Wrapping(0); 64];
 
     // columns
-    for i in 0 .. 8 {
-        // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
-        if coefficients[i + 8] == 0 && coefficients[i + 16] == 0 && coefficients[i + 24] == 0 &&
-                coefficients[i + 32] == 0 && coefficients[i + 40] == 0 && coefficients[i + 48] == 0 &&
-                coefficients[i + 56] == 0 {
-            let dcterm = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32) << 2;
-            temp[i]      = dcterm;
-            temp[i + 8]  = dcterm;
+    for i in 0..8 {
+        if coefficients[i + 8] == 0
+            && coefficients[i + 16] == 0
+            && coefficients[i + 24] == 0
+            && coefficients[i + 32] == 0
+            && coefficients[i + 40] == 0
+            && coefficients[i + 48] == 0
+            && coefficients[i + 56] == 0
+        {
+            let dcterm = dequantize(coefficients[i], quantization_table[i]) << 2;
+            temp[i] = dcterm;
+            temp[i + 8] = dcterm;
             temp[i + 16] = dcterm;
             temp[i + 24] = dcterm;
             temp[i + 32] = dcterm;
             temp[i + 40] = dcterm;
             temp[i + 48] = dcterm;
             temp[i + 56] = dcterm;
-        }
-        else {
-            let s0 = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32);
-            let s1 = Wrapping(coefficients[i + 8] as i32 * quantization_table[i + 8] as i32);
-            let s2 = Wrapping(coefficients[i + 16] as i32 * quantization_table[i + 16] as i32);
-            let s3 = Wrapping(coefficients[i + 24] as i32 * quantization_table[i + 24] as i32);
-            let s4 = Wrapping(coefficients[i + 32] as i32 * quantization_table[i + 32] as i32);
-            let s5 = Wrapping(coefficients[i + 40] as i32 * quantization_table[i + 40] as i32);
-            let s6 = Wrapping(coefficients[i + 48] as i32 * quantization_table[i + 48] as i32);
-            let s7 = Wrapping(coefficients[i + 56] as i32 * quantization_table[i + 56] as i32);
-
-            let p2 = s2;
-            let p3 = s6;
-            let p1 = (p2 + p3) * stbi_f2f(0.5411961);
-            let t2 = p1 + p3 * stbi_f2f(-1.847759065);
-            let t3 = p1 + p2 * stbi_f2f(0.765366865);
-            let p2 = s0;
-            let p3 = s4;
-            let t0 = stbi_fsh(p2 + p3);
-            let t1 = stbi_fsh(p2 - p3);
-            let x0 = t0 + t3;
-            let x3 = t0 - t3;
-            let x1 = t1 + t2;
-            let x2 = t1 - t2;
-            let t0 = s7;
-            let t1 = s5;
-            let t2 = s3;
-            let t3 = s1;
-            let p3 = t0 + t2;
-            let p4 = t1 + t3;
-            let p1 = t0 + t3;
-            let p2 = t1 + t2;
-            let p5 = (p3 + p4) * stbi_f2f(1.175875602);
-            let t0 = t0 * stbi_f2f(0.298631336);
-            let t1 = t1 * stbi_f2f(2.053119869);
-            let t2 = t2 * stbi_f2f(3.072711026);
-            let t3 = t3 * stbi_f2f(1.501321110);
-            let p1 = p5 + (p1 * stbi_f2f(-0.899976223));
-            let p2 = p5 + (p2 * stbi_f2f(-2.562915447));
-            let p3 = p3 * stbi_f2f(-1.961570560);
-            let p4 = p4 * stbi_f2f(-0.390180644);
-            let t3 = t3 + p1 + p4;
-            let t2 = t2 + p2 + p3;
-            let t1 = t1 + p2 + p4;
-            let t0 = t0 + p1 + p3;
-
-            // constants scaled things up by 1<<12; let's bring them back
-            // down, but keep 2 extra bits of precision
-            let x0 = x0 + Wrapping(512);
-            let x1 = x1 + Wrapping(512);
-            let x2 = x2 + Wrapping(512);
-            let x3 = x3 + Wrapping(512);
+        } else {
+            let s0 = dequantize(coefficients[i], quantization_table[i]);
+            let s1 = dequantize(coefficients[i + 8], quantization_table[i + 8]);
+            let s2 = dequantize(coefficients[i + 16], quantization_table[i + 16]);
+            let s3 = dequantize(coefficients[i + 24], quantization_table[i + 24]);
+            let s4 = dequantize(coefficients[i + 32], quantization_table[i + 32]);
+            let s5 = dequantize(coefficients[i + 40], quantization_table[i + 40]);
+            let s6 = dequantize(coefficients[i + 48], quantization_table[i + 48]);
+            let s7 = dequantize(coefficients[i + 56], quantization_table[i + 56]);
+
+            let Kernel {
+                xs: [x0, x1, x2, x3],
+                ts: [t0, t1, t2, t3],
+            } = kernel(
+                [s0, s1, s2, s3, s4, s5, s6, s7],
+                // constants scaled things up by 1<<12; let's bring them back
+                // down, but keep 2 extra bits of precision
+                512,
+            );
 
             temp[i] = (x0 + t3) >> 10;
             temp[i + 56] = (x0 - t3) >> 10;
@@ -130,72 +128,126 @@ fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16
         }
     }
 
-    for i in 0 .. 8 {
-        // no fast case since the first 1D IDCT spread components out
-        let s0 = temp[i * 8];
-        let s1 = temp[i * 8 + 1];
-        let s2 = temp[i * 8 + 2];
-        let s3 = temp[i * 8 + 3];
-        let s4 = temp[i * 8 + 4];
-        let s5 = temp[i * 8 + 5];
-        let s6 = temp[i * 8 + 6];
-        let s7 = temp[i * 8 + 7];
-
-        let p2 = s2;
-        let p3 = s6;
-        let p1 = (p2 + p3) * stbi_f2f(0.5411961);
-        let t2 = p1 + p3 * stbi_f2f(-1.847759065);
-        let t3 = p1 + p2 * stbi_f2f(0.765366865);
-        let p2 = s0;
-        let p3 = s4;
-        let t0 = stbi_fsh(p2 + p3);
-        let t1 = stbi_fsh(p2 - p3);
-        let x0 = t0 + t3;
-        let x3 = t0 - t3;
-        let x1 = t1 + t2;
-        let x2 = t1 - t2;
-        let t0 = s7;
-        let t1 = s5;
-        let t2 = s3;
-        let t3 = s1;
-        let p3 = t0 + t2;
-        let p4 = t1 + t3;
-        let p1 = t0 + t3;
-        let p2 = t1 + t2;
-        let p5 = (p3 + p4) * stbi_f2f(1.175875602);
-        let t0 = t0 * stbi_f2f(0.298631336);
-        let t1 = t1 * stbi_f2f(2.053119869);
-        let t2 = t2 * stbi_f2f(3.072711026);
-        let t3 = t3 * stbi_f2f(1.501321110);
-        let p1 = p5 + p1 * stbi_f2f(-0.899976223);
-        let p2 = p5 + p2 * stbi_f2f(-2.562915447);
-        let p3 = p3 * stbi_f2f(-1.961570560);
-        let p4 = p4 * stbi_f2f(-0.390180644);
-        let t3 = t3 + p1 + p4;
-        let t2 = t2 + p2 + p3;
-        let t1 = t1 + p2 + p4;
-        let t0 = t0 + p1 + p3;
+    for (chunk, output_chunk) in temp.chunks_exact(8).zip(output) {
+        let chunk = <&[_; 8]>::try_from(chunk).unwrap();
 
         // constants scaled things up by 1<<12, plus we had 1<<2 from first
         // loop, plus horizontal and vertical each scale by sqrt(8) so together
         // we've got an extra 1<<3, so 1<<17 total we need to remove.
         // so we want to round that, which means adding 0.5 * 1<<17,
         // aka 65536. Also, we'll end up with -128 to 127 that we want
         // to encode as 0..255 by adding 128, so we'll add that before the shift
-        let x0 = x0 + Wrapping(65536 + (128 << 17));
-        let x1 = x1 + Wrapping(65536 + (128 << 17));
-        let x2 = x2 + Wrapping(65536 + (128 << 17));
-        let x3 = x3 + Wrapping(65536 + (128 << 17));
-
-        output[i * output_linestride] = stbi_clamp((x0 + t3) >> 17);
-        output[i * output_linestride + 7] = stbi_clamp((x0 - t3) >> 17);
-        output[i * output_linestride + 1] = stbi_clamp((x1 + t2) >> 17);
-        output[i * output_linestride + 6] = stbi_clamp((x1 - t2) >> 17);
-        output[i * output_linestride + 2] = stbi_clamp((x2 + t1) >> 17);
-        output[i * output_linestride + 5] = stbi_clamp((x2 - t1) >> 17);
-        output[i * output_linestride + 3] = stbi_clamp((x3 + t0) >> 17);
-        output[i * output_linestride + 4] = stbi_clamp((x3 - t0) >> 17);
+        const X_SCALE: i32 = 65536 + (128 << 17);
+
+        let [s0, rest @ ..] = chunk;
+        if *rest == [Wrapping(0); 7] {
+            let dcterm = stbi_clamp((stbi_fsh(*s0) + Wrapping(X_SCALE)) >> 17);
+            output_chunk[0] = dcterm;
+            output_chunk[1] = dcterm;
+            output_chunk[2] = dcterm;
+            output_chunk[3] = dcterm;
+            output_chunk[4] = dcterm;
+            output_chunk[5] = dcterm;
+            output_chunk[6] = dcterm;
+            output_chunk[7] = dcterm;
+        } else {
+            let Kernel {
+                xs: [x0, x1, x2, x3],
+                ts: [t0, t1, t2, t3],
+            } = kernel(*chunk, X_SCALE);
+
+            output_chunk[0] = stbi_clamp((x0 + t3) >> 17);
+            output_chunk[7] = stbi_clamp((x0 - t3) >> 17);
+            output_chunk[1] = stbi_clamp((x1 + t2) >> 17);
+            output_chunk[6] = stbi_clamp((x1 - t2) >> 17);
+            output_chunk[2] = stbi_clamp((x2 + t1) >> 17);
+            output_chunk[5] = stbi_clamp((x2 - t1) >> 17);
+            output_chunk[3] = stbi_clamp((x3 + t0) >> 17);
+            output_chunk[4] = stbi_clamp((x3 - t0) >> 17);
+        }
+    }
+}
+
+struct Kernel {
+    xs: [Wrapping<i32>; 4],
+    ts: [Wrapping<i32>; 4],
+}
+
+#[inline]
+fn kernel_x([s0, s2, s4, s6]: [Wrapping<i32>; 4], x_scale: i32) -> [Wrapping<i32>; 4] {
+    // Even `chunk` indices
+    let (t2, t3);
+    {
+        let p2 = s2;
+        let p3 = s6;
+
+        let p1 = (p2 + p3) * stbi_f2f(0.5411961);
+        t2 = p1 + p3 * stbi_f2f(-1.847759065);
+        t3 = p1 + p2 * stbi_f2f(0.765366865);
+    }
+
+    let (t0, t1);
+    {
+        let p2 = s0;
+        let p3 = s4;
+
+        t0 = stbi_fsh(p2 + p3);
+        t1 = stbi_fsh(p2 - p3);
     }
+
+    let x0 = t0 + t3;
+    let x3 = t0 - t3;
+    let x1 = t1 + t2;
+    let x2 = t1 - t2;
+
+    let x_scale = Wrapping(x_scale);
+
+    [x0 + x_scale, x1 + x_scale, x2 + x_scale, x3 + x_scale]
+}
+
+#[inline]
+fn kernel_t([s1, s3, s5, s7]: [Wrapping<i32>; 4]) -> [Wrapping<i32>; 4] {
+    // Odd `chunk` indices
+    let mut t0 = s7;
+    let mut t1 = s5;
+    let mut t2 = s3;
+    let mut t3 = s1;
+
+    let p3 = t0 + t2;
+    let p4 = t1 + t3;
+    let p1 = t0 + t3;
+    let p2 = t1 + t2;
+    let p5 = (p3 + p4) * stbi_f2f(1.175875602);
+
+    t0 *= stbi_f2f(0.298631336);
+    t1 *= stbi_f2f(2.053119869);
+    t2 *= stbi_f2f(3.072711026);
+    t3 *= stbi_f2f(1.501321110);
+
+    let p1 = p5 + p1 * stbi_f2f(-0.899976223);
+    let p2 = p5 + p2 * stbi_f2f(-2.562915447);
+    let p3 = p3 * stbi_f2f(-1.961570560);
+    let p4 = p4 * stbi_f2f(-0.390180644);
+
+    t3 += p1 + p4;
+    t2 += p2 + p3;
+    t1 += p2 + p4;
+    t0 += p1 + p3;
+
+    [t0, t1, t2, t3]
+}
+
+#[inline]
+fn kernel([s0, s1, s2, s3, s4, s5, s6, s7]: [Wrapping<i32>; 8], x_scale: i32) -> Kernel {
+    Kernel {
+        xs: kernel_x([s0, s2, s4, s6], x_scale),
+        ts: kernel_t([s1, s3, s5, s7]),
+    }
+}
+
+#[inline(always)]
+fn dequantize(c: i16, q: u16) -> Wrapping<i32> {
+    Wrapping(i32::from(c) * i32::from(q))
 }
 
 // 4x4 and 2x2 IDCT based on Rakesh Dugad and Narendra Ahuja: "A Fast Scheme for Image Size Change in the Compressed Domain" (2001).