Skip to content

Commit 70c1668

Browse files
committed
refactor: Extract and clarify the 8x8 idct implementation
This code is extracted from https://github.com/Marwes/combine-jpeg which is originally based on this repo. While the parser parts are mostly rewritten, the parts like idct, huffman etc are mostly the same, though with some performance and (in my opinion) clarity improvements to the code. If this PR is accepted I may try and extract some other parts as well. (While this PR doesn't contain any performance improvements, I will just add that `combine-jpeg` has some significant performance improvements. It still lacks some features but it's almost, but not quite as fast as mozjpeg on the images that it accepts (no simd for either). Some of this is because of unsafe, some of it because of less overhead in the decoding). ``` it_works/combine time: [2.6562 ms 2.6633 ms 2.6749 ms] change: [-7.2890% -1.9145% +1.5656%] (p = 0.68 > 0.05) No change in performance detected. Found 2 outliers among 10 measurements (20.00%) 2 (20.00%) high severe it_works/mozjpeg time: [2.0400 ms 2.0415 ms 2.0429 ms] change: [+0.4243% +0.6680% +0.8305%] (p = 0.00 < 0.05) Change within noise threshold. Found 1 outliers among 10 measurements (10.00%) 1 (10.00%) high mild it_works/jpeg-decoder time: [8.9513 ms 9.1526 ms 9.3654 ms] change: [-13.663% -5.0961% +1.6086%] (p = 0.29 > 0.05) No change in performance detected. Found 2 outliers among 10 measurements (20.00%) 1 (10.00%) low severe 1 (10.00%) high mild ```
1 parent 073e21e commit 70c1668

File tree

1 file changed

+175
-123
lines changed

1 file changed

+175
-123
lines changed

src/idct.rs

Lines changed: 175 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
// One example is tests/crashtest/images/imagetestsuite/b0b8914cc5f7a6eff409f16d8cc236c5.jpg
33
// That's why wrapping operators are needed.
44
use crate::parser::Dimensions;
5-
use std::num::Wrapping;
5+
use std::{
6+
convert::TryFrom,
7+
num::Wrapping,
8+
};
69

710
pub(crate) fn choose_idct_size(full_size: Dimensions, requested_size: Dimensions) -> usize {
811
fn scaled(len: u16, scale: usize) -> u16 { ((len as u32 * scale as u32 - 1) / 8 + 1) as u16 }
@@ -28,7 +31,7 @@ fn test_choose_idct_size() {
2831
assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 685, height: 999}), 2);
2932
assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 1000, height: 1000}), 2);
3033
assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 1400, height: 1400}), 4);
31-
34+
3235
assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 5472, height: 3648}), 8);
3336
assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 16384, height: 16384}), 8);
3437
assert_eq!(choose_idct_size(Dimensions{width: 1, height: 1}, Dimensions{width: 65535, height: 65535}), 8);
@@ -45,79 +48,74 @@ pub(crate) fn dequantize_and_idct_block(scale: usize, coefficients: &[i16], quan
4548
}
4649
}
4750

48-
// This is based on stb_image's 'stbi__idct_block'.
49-
fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
51+
pub fn dequantize_and_idct_block_8x8(
52+
coefficients: &[i16],
53+
quantization_table: &[u16; 64],
54+
output_linestride: usize,
55+
output: &mut [u8]
56+
) {
5057
debug_assert_eq!(coefficients.len(), 64);
58+
let output = output
59+
.chunks_mut(output_linestride);
60+
dequantize_and_idct_block_8x8_inner(coefficients, quantization_table, output)
61+
}
62+
63+
// This is based on stb_image's 'stbi__idct_block'.
64+
fn dequantize_and_idct_block_8x8_inner<'a, I>(
65+
coefficients: &[i16],
66+
quantization_table: &[u16; 64],
67+
output: I,
68+
) where
69+
I: IntoIterator<Item = &'a mut [u8]>,
70+
I::IntoIter: ExactSizeIterator<Item = &'a mut [u8]>,
71+
{
72+
let output = output.into_iter();
73+
debug_assert!(
74+
output.len() >= 8,
75+
"Output iterator has the wrong length: {}",
76+
output.len()
77+
);
5178

52-
let mut temp = [Wrapping(0i32); 64];
79+
let mut temp = [Wrapping(0); 64];
5380

5481
// columns
55-
for i in 0 .. 8 {
56-
// if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
57-
if coefficients[i + 8] == 0 && coefficients[i + 16] == 0 && coefficients[i + 24] == 0 &&
58-
coefficients[i + 32] == 0 && coefficients[i + 40] == 0 && coefficients[i + 48] == 0 &&
59-
coefficients[i + 56] == 0 {
60-
let dcterm = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32) << 2;
61-
temp[i] = dcterm;
62-
temp[i + 8] = dcterm;
82+
for i in 0..8 {
83+
if coefficients[i + 8] == 0
84+
&& coefficients[i + 16] == 0
85+
&& coefficients[i + 24] == 0
86+
&& coefficients[i + 32] == 0
87+
&& coefficients[i + 40] == 0
88+
&& coefficients[i + 48] == 0
89+
&& coefficients[i + 56] == 0
90+
{
91+
let dcterm = dequantize(coefficients[i], quantization_table[i]) << 2;
92+
temp[i] = dcterm;
93+
temp[i + 8] = dcterm;
6394
temp[i + 16] = dcterm;
6495
temp[i + 24] = dcterm;
6596
temp[i + 32] = dcterm;
6697
temp[i + 40] = dcterm;
6798
temp[i + 48] = dcterm;
6899
temp[i + 56] = dcterm;
69-
}
70-
else {
71-
let s0 = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32);
72-
let s1 = Wrapping(coefficients[i + 8] as i32 * quantization_table[i + 8] as i32);
73-
let s2 = Wrapping(coefficients[i + 16] as i32 * quantization_table[i + 16] as i32);
74-
let s3 = Wrapping(coefficients[i + 24] as i32 * quantization_table[i + 24] as i32);
75-
let s4 = Wrapping(coefficients[i + 32] as i32 * quantization_table[i + 32] as i32);
76-
let s5 = Wrapping(coefficients[i + 40] as i32 * quantization_table[i + 40] as i32);
77-
let s6 = Wrapping(coefficients[i + 48] as i32 * quantization_table[i + 48] as i32);
78-
let s7 = Wrapping(coefficients[i + 56] as i32 * quantization_table[i + 56] as i32);
79-
80-
let p2 = s2;
81-
let p3 = s6;
82-
let p1 = (p2 + p3) * stbi_f2f(0.5411961);
83-
let t2 = p1 + p3 * stbi_f2f(-1.847759065);
84-
let t3 = p1 + p2 * stbi_f2f(0.765366865);
85-
let p2 = s0;
86-
let p3 = s4;
87-
let t0 = stbi_fsh(p2 + p3);
88-
let t1 = stbi_fsh(p2 - p3);
89-
let x0 = t0 + t3;
90-
let x3 = t0 - t3;
91-
let x1 = t1 + t2;
92-
let x2 = t1 - t2;
93-
let t0 = s7;
94-
let t1 = s5;
95-
let t2 = s3;
96-
let t3 = s1;
97-
let p3 = t0 + t2;
98-
let p4 = t1 + t3;
99-
let p1 = t0 + t3;
100-
let p2 = t1 + t2;
101-
let p5 = (p3 + p4) * stbi_f2f(1.175875602);
102-
let t0 = t0 * stbi_f2f(0.298631336);
103-
let t1 = t1 * stbi_f2f(2.053119869);
104-
let t2 = t2 * stbi_f2f(3.072711026);
105-
let t3 = t3 * stbi_f2f(1.501321110);
106-
let p1 = p5 + (p1 * stbi_f2f(-0.899976223));
107-
let p2 = p5 + (p2 * stbi_f2f(-2.562915447));
108-
let p3 = p3 * stbi_f2f(-1.961570560);
109-
let p4 = p4 * stbi_f2f(-0.390180644);
110-
let t3 = t3 + p1 + p4;
111-
let t2 = t2 + p2 + p3;
112-
let t1 = t1 + p2 + p4;
113-
let t0 = t0 + p1 + p3;
114-
115-
// constants scaled things up by 1<<12; let's bring them back
116-
// down, but keep 2 extra bits of precision
117-
let x0 = x0 + Wrapping(512);
118-
let x1 = x1 + Wrapping(512);
119-
let x2 = x2 + Wrapping(512);
120-
let x3 = x3 + Wrapping(512);
100+
} else {
101+
let s0 = dequantize(coefficients[i], quantization_table[i]);
102+
let s1 = dequantize(coefficients[i + 8], quantization_table[i + 8]);
103+
let s2 = dequantize(coefficients[i + 16], quantization_table[i + 16]);
104+
let s3 = dequantize(coefficients[i + 24], quantization_table[i + 24]);
105+
let s4 = dequantize(coefficients[i + 32], quantization_table[i + 32]);
106+
let s5 = dequantize(coefficients[i + 40], quantization_table[i + 40]);
107+
let s6 = dequantize(coefficients[i + 48], quantization_table[i + 48]);
108+
let s7 = dequantize(coefficients[i + 56], quantization_table[i + 56]);
109+
110+
let Kernel {
111+
xs: [x0, x1, x2, x3],
112+
ts: [t0, t1, t2, t3],
113+
} = kernel(
114+
[s0, s1, s2, s3, s4, s5, s6, s7],
115+
// constants scaled things up by 1<<12; let's bring them back
116+
// down, but keep 2 extra bits of precision
117+
512,
118+
);
121119

122120
temp[i] = (x0 + t3) >> 10;
123121
temp[i + 56] = (x0 - t3) >> 10;
@@ -130,72 +128,126 @@ fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16
130128
}
131129
}
132130

133-
for i in 0 .. 8 {
134-
// no fast case since the first 1D IDCT spread components out
135-
let s0 = temp[i * 8];
136-
let s1 = temp[i * 8 + 1];
137-
let s2 = temp[i * 8 + 2];
138-
let s3 = temp[i * 8 + 3];
139-
let s4 = temp[i * 8 + 4];
140-
let s5 = temp[i * 8 + 5];
141-
let s6 = temp[i * 8 + 6];
142-
let s7 = temp[i * 8 + 7];
143-
144-
let p2 = s2;
145-
let p3 = s6;
146-
let p1 = (p2 + p3) * stbi_f2f(0.5411961);
147-
let t2 = p1 + p3 * stbi_f2f(-1.847759065);
148-
let t3 = p1 + p2 * stbi_f2f(0.765366865);
149-
let p2 = s0;
150-
let p3 = s4;
151-
let t0 = stbi_fsh(p2 + p3);
152-
let t1 = stbi_fsh(p2 - p3);
153-
let x0 = t0 + t3;
154-
let x3 = t0 - t3;
155-
let x1 = t1 + t2;
156-
let x2 = t1 - t2;
157-
let t0 = s7;
158-
let t1 = s5;
159-
let t2 = s3;
160-
let t3 = s1;
161-
let p3 = t0 + t2;
162-
let p4 = t1 + t3;
163-
let p1 = t0 + t3;
164-
let p2 = t1 + t2;
165-
let p5 = (p3 + p4) * stbi_f2f(1.175875602);
166-
let t0 = t0 * stbi_f2f(0.298631336);
167-
let t1 = t1 * stbi_f2f(2.053119869);
168-
let t2 = t2 * stbi_f2f(3.072711026);
169-
let t3 = t3 * stbi_f2f(1.501321110);
170-
let p1 = p5 + p1 * stbi_f2f(-0.899976223);
171-
let p2 = p5 + p2 * stbi_f2f(-2.562915447);
172-
let p3 = p3 * stbi_f2f(-1.961570560);
173-
let p4 = p4 * stbi_f2f(-0.390180644);
174-
let t3 = t3 + p1 + p4;
175-
let t2 = t2 + p2 + p3;
176-
let t1 = t1 + p2 + p4;
177-
let t0 = t0 + p1 + p3;
131+
for (chunk, output_chunk) in temp.chunks_exact(8).zip(output) {
132+
let chunk = <&[_; 8]>::try_from(chunk).unwrap();
178133

179134
// constants scaled things up by 1<<12, plus we had 1<<2 from first
180135
// loop, plus horizontal and vertical each scale by sqrt(8) so together
181136
// we've got an extra 1<<3, so 1<<17 total we need to remove.
182137
// so we want to round that, which means adding 0.5 * 1<<17,
183138
// aka 65536. Also, we'll end up with -128 to 127 that we want
184139
// to encode as 0..255 by adding 128, so we'll add that before the shift
185-
let x0 = x0 + Wrapping(65536 + (128 << 17));
186-
let x1 = x1 + Wrapping(65536 + (128 << 17));
187-
let x2 = x2 + Wrapping(65536 + (128 << 17));
188-
let x3 = x3 + Wrapping(65536 + (128 << 17));
189-
190-
output[i * output_linestride] = stbi_clamp((x0 + t3) >> 17);
191-
output[i * output_linestride + 7] = stbi_clamp((x0 - t3) >> 17);
192-
output[i * output_linestride + 1] = stbi_clamp((x1 + t2) >> 17);
193-
output[i * output_linestride + 6] = stbi_clamp((x1 - t2) >> 17);
194-
output[i * output_linestride + 2] = stbi_clamp((x2 + t1) >> 17);
195-
output[i * output_linestride + 5] = stbi_clamp((x2 - t1) >> 17);
196-
output[i * output_linestride + 3] = stbi_clamp((x3 + t0) >> 17);
197-
output[i * output_linestride + 4] = stbi_clamp((x3 - t0) >> 17);
140+
const X_SCALE: i32 = 65536 + (128 << 17);
141+
142+
let [s0, rest @ ..] = chunk;
143+
if *rest == [Wrapping(0); 7] {
144+
let dcterm = stbi_clamp((stbi_fsh(*s0) + Wrapping(X_SCALE)) >> 17);
145+
output_chunk[0] = dcterm;
146+
output_chunk[1] = dcterm;
147+
output_chunk[2] = dcterm;
148+
output_chunk[3] = dcterm;
149+
output_chunk[4] = dcterm;
150+
output_chunk[5] = dcterm;
151+
output_chunk[6] = dcterm;
152+
output_chunk[7] = dcterm;
153+
} else {
154+
let Kernel {
155+
xs: [x0, x1, x2, x3],
156+
ts: [t0, t1, t2, t3],
157+
} = kernel(*chunk, X_SCALE);
158+
159+
output_chunk[0] = stbi_clamp((x0 + t3) >> 17);
160+
output_chunk[7] = stbi_clamp((x0 - t3) >> 17);
161+
output_chunk[1] = stbi_clamp((x1 + t2) >> 17);
162+
output_chunk[6] = stbi_clamp((x1 - t2) >> 17);
163+
output_chunk[2] = stbi_clamp((x2 + t1) >> 17);
164+
output_chunk[5] = stbi_clamp((x2 - t1) >> 17);
165+
output_chunk[3] = stbi_clamp((x3 + t0) >> 17);
166+
output_chunk[4] = stbi_clamp((x3 - t0) >> 17);
167+
}
168+
}
169+
}
170+
171+
struct Kernel {
172+
xs: [Wrapping<i32>; 4],
173+
ts: [Wrapping<i32>; 4],
174+
}
175+
176+
#[inline]
177+
fn kernel_x([s0, s2, s4, s6]: [Wrapping<i32>; 4], x_scale: i32) -> [Wrapping<i32>; 4] {
178+
// Even `chunk` indicies
179+
let (t2, t3);
180+
{
181+
let p2 = s2;
182+
let p3 = s6;
183+
184+
let p1 = (p2 + p3) * stbi_f2f(0.5411961);
185+
t2 = p1 + p3 * stbi_f2f(-1.847759065);
186+
t3 = p1 + p2 * stbi_f2f(0.765366865);
187+
}
188+
189+
let (t0, t1);
190+
{
191+
let p2 = s0;
192+
let p3 = s4;
193+
194+
t0 = stbi_fsh(p2 + p3);
195+
t1 = stbi_fsh(p2 - p3);
198196
}
197+
198+
let x0 = t0 + t3;
199+
let x3 = t0 - t3;
200+
let x1 = t1 + t2;
201+
let x2 = t1 - t2;
202+
203+
let x_scale = Wrapping(x_scale);
204+
205+
[x0 + x_scale, x1 + x_scale, x2 + x_scale, x3 + x_scale]
206+
}
207+
208+
#[inline]
209+
fn kernel_t([s1, s3, s5, s7]: [Wrapping<i32>; 4]) -> [Wrapping<i32>; 4] {
210+
// Odd `chunk` indicies
211+
let mut t0 = s7;
212+
let mut t1 = s5;
213+
let mut t2 = s3;
214+
let mut t3 = s1;
215+
216+
let p3 = t0 + t2;
217+
let p4 = t1 + t3;
218+
let p1 = t0 + t3;
219+
let p2 = t1 + t2;
220+
let p5 = (p3 + p4) * stbi_f2f(1.175875602);
221+
222+
t0 *= stbi_f2f(0.298631336);
223+
t1 *= stbi_f2f(2.053119869);
224+
t2 *= stbi_f2f(3.072711026);
225+
t3 *= stbi_f2f(1.501321110);
226+
227+
let p1 = p5 + p1 * stbi_f2f(-0.899976223);
228+
let p2 = p5 + p2 * stbi_f2f(-2.562915447);
229+
let p3 = p3 * stbi_f2f(-1.961570560);
230+
let p4 = p4 * stbi_f2f(-0.390180644);
231+
232+
t3 += p1 + p4;
233+
t2 += p2 + p3;
234+
t1 += p2 + p4;
235+
t0 += p1 + p3;
236+
237+
[t0, t1, t2, t3]
238+
}
239+
240+
#[inline]
241+
fn kernel([s0, s1, s2, s3, s4, s5, s6, s7]: [Wrapping<i32>; 8], x_scale: i32) -> Kernel {
242+
Kernel {
243+
xs: kernel_x([s0, s2, s4, s6], x_scale),
244+
ts: kernel_t([s1, s3, s5, s7]),
245+
}
246+
}
247+
248+
#[inline(always)]
249+
fn dequantize(c: i16, q: u16) -> Wrapping<i32> {
250+
Wrapping(i32::from(c) * i32::from(q))
199251
}
200252

201253
// 4x4 and 2x2 IDCT based on Rakesh Dugad and Narendra Ahuja: "A Fast Scheme for Image Size Change in the Compressed Domain" (2001).

0 commit comments

Comments
 (0)