Add 4x4 and 2x2 IDCT

kevinmehall · kevinmehall · commit ae7e1db3e6eb · 2019-11-12T12:24:21.000-08:00
diff --git a/src/idct.rs b/src/idct.rs
@@ -2,13 +2,6 @@
 // One example is tests/crashtest/images/imagetestsuite/b0b8914cc5f7a6eff409f16d8cc236c5.jpg
 // That's why wrapping operators are needed.
 
-pub fn dequantize_and_idct_block_1x1(coefficients: &[i16], quantization_table: &[u16; 64], _output_linestride: usize, output: &mut [u8]) {
-    debug_assert_eq!(coefficients.len(), 64);
-
-    let s0 = (coefficients[0] as i32 * quantization_table[0] as i32).wrapping_add(128 * 8) / 8;
-    output[0] = stbi_clamp(s0);
-}
-
 // This is based on stb_image's 'stbi__idct_block'.
 pub fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
     debug_assert_eq!(coefficients.len(), 64);
@@ -162,6 +155,103 @@ pub fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &
     }
 }
 
+// 4x4 and 2x2 reduced-size IDCTs based on Rakesh Dugad and Narendra Ahuja: "A Fast Scheme for Image Size Change in the Compressed Domain" (2001).
+// (http://sylvana.net/jpegcrop/jidctred/) The 4x4 variant reads the top-left 4x4 of the 64 coefficients and writes 4 rows of 4 bytes, rows being output_linestride apart.
+pub fn dequantize_and_idct_block_4x4(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
+    debug_assert_eq!(coefficients.len(), 64);
+    let mut temp = [0i32; 4*4];
+
+    const CONST_BITS: u32 = 12;
+    const PASS1_BITS: u32 = 2;
+    const FINAL_BITS: u32 = CONST_BITS + PASS1_BITS + 3;
+
+    // Pass 1 (columns): dequantize rows 0..4 of column i, run a 4-point IDCT, and keep PASS1_BITS of extra precision (512 rounds the >> 10).
+    for i in 0 .. 4 {
+        let s0 = coefficients[i + 8*0] as i32 * quantization_table[i + 8*0] as i32;
+        let s1 = coefficients[i + 8*1] as i32 * quantization_table[i + 8*1] as i32;
+        let s2 = coefficients[i + 8*2] as i32 * quantization_table[i + 8*2] as i32;
+        let s3 = coefficients[i + 8*3] as i32 * quantization_table[i + 8*3] as i32;
+    
+        let x0 = s0.wrapping_add(s2).wrapping_shl(PASS1_BITS);
+        let x2 = s0.wrapping_sub(s2).wrapping_shl(PASS1_BITS);
+
+        let p1 = s1.wrapping_add(s3).wrapping_mul(stbi_f2f(0.541196100));
+        let t0 = p1.wrapping_add(s3.wrapping_mul(stbi_f2f(-1.847759065))).wrapping_add(512).wrapping_shr(CONST_BITS - PASS1_BITS);
+        let t2 = p1.wrapping_add(s1.wrapping_mul(stbi_f2f( 0.765366865))).wrapping_add(512).wrapping_shr(CONST_BITS - PASS1_BITS);
+
+        temp[i + 4*0] = x0.wrapping_add(t2);
+        temp[i + 4*3] = x0.wrapping_sub(t2);
+        temp[i + 4*1] = x2.wrapping_add(t0);
+        temp[i + 4*2] = x2.wrapping_sub(t0);
+    }
+    // Pass 2 (rows): 4-point IDCT across each row of temp, then descale, clamp and store one output row.
+    for i in 0 .. 4 {
+        let s0 = temp[i * 4 + 0];
+        let s1 = temp[i * 4 + 1];
+        let s2 = temp[i * 4 + 2];
+        let s3 = temp[i * 4 + 3];
+
+        let x0 = s0.wrapping_add(s2).wrapping_shl(CONST_BITS);
+        let x2 = s0.wrapping_sub(s2).wrapping_shl(CONST_BITS);
+
+        let p1 = s1.wrapping_add(s3).wrapping_mul(stbi_f2f(0.541196100));
+        let t0 = p1.wrapping_add(s3.wrapping_mul(stbi_f2f(-1.847759065)));
+        let t2 = p1.wrapping_add(s1.wrapping_mul(stbi_f2f(0.765366865)));
+
+        // Descale: the constants carry 1<<12 (CONST_BITS), pass 1 left 1<<2 (PASS1_BITS),
+        // and a further 1<<3 of normalization remains (the original note attributes it to
+        // horizontal and vertical each scaling by sqrt(8) -- NOTE(review): confirm that
+        // rationale for the 4-point case), so 1<<17 (FINAL_BITS) in total must be removed.
+        // Add half of that, 1<<16 = 65536, so the shift rounds, and add 128<<17 so the
+        // -128..127 result maps to 0..255; both are folded into x0/x2 before the shift.
+        let x0 = x0.wrapping_add((1 << (FINAL_BITS - 1)) + (128 << FINAL_BITS));
+        let x2 = x2.wrapping_add((1 << (FINAL_BITS - 1)) + (128 << FINAL_BITS));
+
+        output[i * output_linestride + 0] = stbi_clamp(x0.wrapping_add(t2).wrapping_shr(FINAL_BITS));
+        output[i * output_linestride + 3] = stbi_clamp(x0.wrapping_sub(t2).wrapping_shr(FINAL_BITS));
+        output[i * output_linestride + 1] = stbi_clamp(x2.wrapping_add(t0).wrapping_shr(FINAL_BITS));
+        output[i * output_linestride + 2] = stbi_clamp(x2.wrapping_sub(t0).wrapping_shr(FINAL_BITS));
+    }
+}
+
+pub fn dequantize_and_idct_block_2x2(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
+    debug_assert_eq!(coefficients.len(), 64);
+    // Only coefficients 0, 1, 8 and 9 (the top-left 2x2 of the 8-wide block) are read; the result is a 2x2 tile of bytes.
+    const SCALE_BITS: u32 = 3;
+
+    // Column 0: dequantize rows 0 and 1, then take the 2-point butterfly (sum -> x0, difference -> x2).
+    let s00 = coefficients[8*0] as i32 * quantization_table[8*0] as i32;
+    let s10 = coefficients[8*1] as i32 * quantization_table[8*1] as i32;
+
+    let x0 = s00.wrapping_add(s10);
+    let x2 = s00.wrapping_sub(s10);
+
+    // Column 1: same butterfly on the second column (sum -> x1, difference -> x3).
+    let s01 = coefficients[8*0+1] as i32 * quantization_table[8*0+1] as i32;
+    let s11 = coefficients[8*1+1] as i32 * quantization_table[8*1+1] as i32;
+
+    let x1 = s01.wrapping_add(s11);
+    let x3 = s01.wrapping_sub(s11);
+    // Fold the rounding bias (half of 1 << SCALE_BITS) and the +128 level shift (pre-scaled) into the column-0 terms, which feed every output.
+    let x0 = x0.wrapping_add((1 << (SCALE_BITS-1)) + (128 << SCALE_BITS));
+    let x2 = x2.wrapping_add((1 << (SCALE_BITS-1)) + (128 << SCALE_BITS));
+
+    // Row 0: combine the two column sums horizontally, shift right by SCALE_BITS and pass through stbi_clamp.
+    output[0] = stbi_clamp(x0.wrapping_add(x1).wrapping_shr(SCALE_BITS));
+    output[1] = stbi_clamp(x0.wrapping_sub(x1).wrapping_shr(SCALE_BITS));
+
+    // Row 1: same with the column differences, written output_linestride bytes after row 0.
+    output[output_linestride + 0] = stbi_clamp(x2.wrapping_add(x3).wrapping_shr(SCALE_BITS));
+    output[output_linestride + 1] = stbi_clamp(x2.wrapping_sub(x3).wrapping_shr(SCALE_BITS));
+}
+
+pub fn dequantize_and_idct_block_1x1(coefficients: &[i16], quantization_table: &[u16; 64], _output_linestride: usize, output: &mut [u8]) {
+    debug_assert_eq!(coefficients.len(), 64);
+    // Only the DC term (index 0) is used: dequantize, add the +128 level shift pre-scaled by 8, then divide by 8 (truncating; no rounding term).
+    let s0 = (coefficients[0] as i32 * quantization_table[0] as i32).wrapping_add(128 * 8) / 8;
+    output[0] = stbi_clamp(s0);
+}
+
 // take a -128..127 value and stbi__clamp it and convert to 0..255
 fn stbi_clamp(x: i32) -> u8
 {
diff --git a/src/worker/immediate.rs b/src/worker/immediate.rs
@@ -1,6 +1,6 @@
 use decoder::MAX_COMPONENTS;
 use error::Result;
-use idct::{ dequantize_and_idct_block_8x8, dequantize_and_idct_block_1x1 };
+use idct::{ dequantize_and_idct_block_8x8, dequantize_and_idct_block_4x4, dequantize_and_idct_block_2x2, dequantize_and_idct_block_1x1 };
 use std::mem;
 use std::sync::Arc;
 use parser::Component;
@@ -49,6 +49,8 @@ impl ImmediateWorker {
 
             match component.dct_scale {
                 8 => dequantize_and_idct_block_8x8(coefficients, quantization_table, line_stride, output),
+                4 => dequantize_and_idct_block_4x4(coefficients, quantization_table, line_stride, output),
+                2 => dequantize_and_idct_block_2x2(coefficients, quantization_table, line_stride, output),
                 1 => dequantize_and_idct_block_1x1(coefficients, quantization_table, line_stride, output),
                 _ => unimplemented!(),
             }