Commit 56bb2a0

Authored by HeroicKatora
Merge pull request #211 from veluca93/master
SSSE3 implementations of 8x8 IDCT and YCbCr conversion
2 parents b3c025f + 2986d5a commit 56bb2a0

File tree

9 files changed: +1047 −268 lines changed


Cargo.toml

Lines changed: 5 additions & 0 deletions
@@ -22,5 +22,10 @@ criterion = "0.3"
 name = "decoding_benchmark"
 harness = false
 
+[[bench]]
+name = "large_image"
+harness = false
+
 [features]
 default = ["rayon"]
+platform_independent = []
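
The new platform_independent feature carries no dependencies of its own; its name suggests it exists to opt out of the arch-specific SIMD paths introduced below, so that decoded output does not vary with the host CPU. A minimal sketch of enabling it from a downstream crate's manifest (the version requirement is illustrative, not taken from this diff):

[dependencies]
jpeg-decoder = { version = "*", features = ["platform_independent"] }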

benches/large_image.jpg

488 KB

benches/large_image.rs

Lines changed: 18 additions & 0 deletions
extern crate criterion;
extern crate jpeg_decoder;

use criterion::{black_box, Criterion};

use jpeg_decoder as jpeg;

fn read_image(image: &[u8]) -> Vec<u8> {
    jpeg::Decoder::new(black_box(image)).decode().unwrap()
}

fn main() {
    let mut c = Criterion::default().configure_from_args();
    c.bench_function("decode a 2268x1512 JPEG", |b| {
        b.iter(|| read_image(include_bytes!("large_image.jpg")))
    });
    c.final_summary();
}
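
Because the [[bench]] target is registered in Cargo.toml with harness = false, Criterion's own main drives the run, and the benchmark can be invoked on its own with:

cargo bench --bench large_image

Since the code calls configure_from_args, any extra Criterion options can be passed after a -- separator.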

src/arch/mod.rs

Lines changed: 33 additions & 0 deletions
#![allow(unsafe_code)]

mod ssse3;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use std::is_x86_feature_detected;

/// Arch-specific implementation of YCbCr conversion. Returns the number of pixels that were
/// converted.
pub fn get_color_convert_line_ycbcr() -> Option<unsafe fn(&[u8], &[u8], &[u8], &mut [u8]) -> usize>
{
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[allow(unsafe_code)]
    {
        if is_x86_feature_detected!("ssse3") {
            return Some(ssse3::color_convert_line_ycbcr);
        }
    }
    None
}

/// Arch-specific implementation of 8x8 IDCT.
pub fn get_dequantize_and_idct_block_8x8() -> Option<unsafe fn(&[i16], &[u16; 64], usize, &mut [u8])>
{
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[allow(unsafe_code)]
    {
        if is_x86_feature_detected!("ssse3") {
            return Some(ssse3::dequantize_and_idct_block_8x8);
        }
    }
    None
}
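
Both getters perform the CPU-feature check at call time and hand back a plain unsafe fn pointer, so code outside this module never reaches an SSSE3 entry point without a successful is_x86_feature_detected!("ssse3"). A hypothetical call site (the scalar fallback name is illustrative, not the crate's actual internals):

match crate::arch::get_dequantize_and_idct_block_8x8() {
    // SAFETY: the pointer is only returned after the ssse3 feature check
    // succeeded, satisfying the function's #[target_feature] contract.
    Some(f) => unsafe { f(coefficients, table, linestride, output) },
    None => scalar_dequantize_and_idct_block_8x8(coefficients, table, linestride, output),
}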

src/arch/ssse3.rs

Lines changed: 289 additions & 0 deletions
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
unsafe fn idct8(data: &mut [__m128i; 8]) {
    // The fixed-point constants here are obtained by taking the fractional part of the constants
    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
    // _mm_mulhrs_epi16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
    // slight differences in rounding).

    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
    // doesn't apply any further scaling and fixed point constants have a different precision.

    let p2 = data[2];
    let p3 = data[6];
    let p1 = _mm_mulhrs_epi16(_mm_adds_epi16(p2, p3), _mm_set1_epi16(17734)); // 0.5411961
    let t2 = _mm_subs_epi16(
        _mm_subs_epi16(p1, p3),
        _mm_mulhrs_epi16(p3, _mm_set1_epi16(27779)), // 0.847759065
    );
    let t3 = _mm_adds_epi16(p1, _mm_mulhrs_epi16(p2, _mm_set1_epi16(25079))); // 0.765366865

    let p2 = data[0];
    let p3 = data[4];
    let t0 = _mm_adds_epi16(p2, p3);
    let t1 = _mm_subs_epi16(p2, p3);

    let x0 = _mm_adds_epi16(t0, t3);
    let x3 = _mm_subs_epi16(t0, t3);
    let x1 = _mm_adds_epi16(t1, t2);
    let x2 = _mm_subs_epi16(t1, t2);

    let t0 = data[7];
    let t1 = data[5];
    let t2 = data[3];
    let t3 = data[1];

    let p3 = _mm_adds_epi16(t0, t2);
    let p4 = _mm_adds_epi16(t1, t3);
    let p1 = _mm_adds_epi16(t0, t3);
    let p2 = _mm_adds_epi16(t1, t2);
    let p5 = _mm_adds_epi16(p3, p4);
    let p5 = _mm_adds_epi16(p5, _mm_mulhrs_epi16(p5, _mm_set1_epi16(5763))); // 0.175875602

    let t0 = _mm_mulhrs_epi16(t0, _mm_set1_epi16(9786)); // 0.298631336
    let t1 = _mm_adds_epi16(
        _mm_adds_epi16(t1, t1),
        _mm_mulhrs_epi16(t1, _mm_set1_epi16(1741)), // 0.053119869
    );
    let t2 = _mm_adds_epi16(
        _mm_adds_epi16(t2, _mm_adds_epi16(t2, t2)),
        _mm_mulhrs_epi16(t2, _mm_set1_epi16(2383)), // 0.072711026
    );
    let t3 = _mm_adds_epi16(t3, _mm_mulhrs_epi16(t3, _mm_set1_epi16(16427))); // 0.501321110

    let p1 = _mm_subs_epi16(p5, _mm_mulhrs_epi16(p1, _mm_set1_epi16(29490))); // 0.899976223
    let p2 = _mm_subs_epi16(
        _mm_subs_epi16(_mm_subs_epi16(p5, p2), p2),
        _mm_mulhrs_epi16(p2, _mm_set1_epi16(18446)), // 0.562915447
    );

    let p3 = _mm_subs_epi16(
        _mm_mulhrs_epi16(p3, _mm_set1_epi16(-31509)), // -0.961570560
        p3,
    );
    let p4 = _mm_mulhrs_epi16(p4, _mm_set1_epi16(-12785)); // -0.390180644

    let t3 = _mm_adds_epi16(_mm_adds_epi16(p1, p4), t3);
    let t2 = _mm_adds_epi16(_mm_adds_epi16(p2, p3), t2);
    let t1 = _mm_adds_epi16(_mm_adds_epi16(p2, p4), t1);
    let t0 = _mm_adds_epi16(_mm_adds_epi16(p1, p3), t0);

    data[0] = _mm_adds_epi16(x0, t3);
    data[7] = _mm_subs_epi16(x0, t3);
    data[1] = _mm_adds_epi16(x1, t2);
    data[6] = _mm_subs_epi16(x1, t2);
    data[2] = _mm_adds_epi16(x2, t1);
    data[5] = _mm_subs_epi16(x2, t1);
    data[3] = _mm_adds_epi16(x3, t0);
    data[4] = _mm_subs_epi16(x3, t0);
}
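As the leading comment in idct8 notes, each constant is the fractional part of a scalar-kernel constant scaled by 1<<15. A standalone check of the first one (plain Rust, not part of the diff):

fn main() {
    let c: f64 = 0.5411961; // constant used by the scalar kernel in idct.rs
    let fixed = (c * f64::from(1i32 << 15)).round() as i16;
    assert_eq!(fixed, 17734); // matches _mm_set1_epi16(17734) above
    // _mm_mulhrs_epi16(a, fixed) then yields roughly a * c per lane, since the
    // intrinsic computes ((a * fixed) + (1 << 14)) >> 15, i.e. a rounding >>15.
}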
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
unsafe fn transpose8(data: &mut [__m128i; 8]) {
    // Transpose an 8x8 matrix with a sequence of interleaving operations.
    // Naming: dABl contains elements from the *l*ower halves of vectors A and B, interleaved, i.e.
    // A0 B0 A1 B1 ...
    // dABCDll contains elements from the lower quarter (ll) of vectors A, B, C, D, interleaved -
    // A0 B0 C0 D0 A1 B1 C1 D1 ...
    let d01l = _mm_unpacklo_epi16(data[0], data[1]);
    let d23l = _mm_unpacklo_epi16(data[2], data[3]);
    let d45l = _mm_unpacklo_epi16(data[4], data[5]);
    let d67l = _mm_unpacklo_epi16(data[6], data[7]);
    let d01h = _mm_unpackhi_epi16(data[0], data[1]);
    let d23h = _mm_unpackhi_epi16(data[2], data[3]);
    let d45h = _mm_unpackhi_epi16(data[4], data[5]);
    let d67h = _mm_unpackhi_epi16(data[6], data[7]);
    // Operating on 32-bits will interleave *consecutive pairs* of 16-bit integers.
    let d0123ll = _mm_unpacklo_epi32(d01l, d23l);
    let d0123lh = _mm_unpackhi_epi32(d01l, d23l);
    let d4567ll = _mm_unpacklo_epi32(d45l, d67l);
    let d4567lh = _mm_unpackhi_epi32(d45l, d67l);
    let d0123hl = _mm_unpacklo_epi32(d01h, d23h);
    let d0123hh = _mm_unpackhi_epi32(d01h, d23h);
    let d4567hl = _mm_unpacklo_epi32(d45h, d67h);
    let d4567hh = _mm_unpackhi_epi32(d45h, d67h);
    // Operating on 64-bits will interleave *consecutive quadruples* of 16-bit integers.
    data[0] = _mm_unpacklo_epi64(d0123ll, d4567ll);
    data[1] = _mm_unpackhi_epi64(d0123ll, d4567ll);
    data[2] = _mm_unpacklo_epi64(d0123lh, d4567lh);
    data[3] = _mm_unpackhi_epi64(d0123lh, d4567lh);
    data[4] = _mm_unpacklo_epi64(d0123hl, d4567hl);
    data[5] = _mm_unpackhi_epi64(d0123hl, d4567hl);
    data[6] = _mm_unpacklo_epi64(d0123hh, d4567hh);
    data[7] = _mm_unpackhi_epi64(d0123hh, d4567hh);
}
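For intuition, this is the permutation the three unpack stages implement, written as a plain scalar swap (illustrative, not part of the diff):

fn transpose8_scalar(data: &mut [[i16; 8]; 8]) {
    for i in 0..8 {
        for j in (i + 1)..8 {
            let tmp = data[i][j];
            data[i][j] = data[j][i];
            data[j][i] = tmp;
        }
    }
}

The SIMD version reaches the same result with 24 unpack operations (three stages of eight), without ever leaving the vector registers.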
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
pub unsafe fn dequantize_and_idct_block_8x8(
    coefficients: &[i16],
    quantization_table: &[u16; 64],
    output_linestride: usize,
    output: &mut [u8],
) {
    assert!(coefficients.len() >= 64);
    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
    // for 0<=i<8. Thus, the last accessed position is at an offset of output_linestride * 7 + 7,
    // and if that position is in-bounds, so are all other accesses.
    assert!(
        output.len()
            > output_linestride
                .checked_mul(7)
                .unwrap()
                .checked_add(7)
                .unwrap()
    );

    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    const SHIFT: i32 = 3;

    // Read the DCT coefficients, scale them up and dequantize them.
    let mut data = [_mm_setzero_si128(); 8];
    for i in 0..8 {
        data[i] = _mm_slli_epi16(
            _mm_mullo_epi16(
                _mm_loadu_si128(coefficients.as_ptr().wrapping_add(i * 8) as *const _),
                _mm_loadu_si128(quantization_table.as_ptr().wrapping_add(i * 8) as *const _),
            ),
            SHIFT,
        );
    }

    // Usual column IDCT - transpose - column IDCT - transpose approach.
    idct8(&mut data);
    transpose8(&mut data);
    idct8(&mut data);
    transpose8(&mut data);

    for i in 0..8 {
        let mut buf = [0u8; 16];
        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
        // increased by 3.
        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
        // We add 128 with the appropriate shift for that purpose.
        const OFFSET: i16 = 128 << (SHIFT + 3);
        // We want rounding right shift, so we should add (1/2) << (SHIFT+3) before shifting.
        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;

        let data_with_offset = _mm_adds_epi16(data[i], _mm_set1_epi16(OFFSET + ROUNDING_BIAS));

        _mm_storeu_si128(
            buf.as_mut_ptr() as *mut _,
            _mm_packus_epi16(
                _mm_srai_epi16(data_with_offset, SHIFT + 3),
                _mm_setzero_si128(),
            ),
        );
        std::ptr::copy_nonoverlapping::<u8>(
            buf.as_ptr(),
            output.as_mut_ptr().wrapping_add(output_linestride * i) as *mut _,
            8,
        );
    }
}
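OFFSET and ROUNDING_BIAS fold the 128 level shift and round-to-nearest into a single saturating add before the arithmetic shift. Per lane, the store step above amounts to the following scalar computation (illustrative; the SIMD code uses a saturating 16-bit add where this sketch widens to i32):

fn store_lane(v: i16) -> u8 {
    const SHIFT: i32 = 3;
    const OFFSET: i32 = 128 << (SHIFT + 3); // recenter around 128, pre-shifted
    const ROUNDING_BIAS: i32 = (1 << (SHIFT + 3)) >> 1; // +0.5 before shifting
    let shifted = (i32::from(v) + OFFSET + ROUNDING_BIAS) >> (SHIFT + 3);
    shifted.clamp(0, 255) as u8 // _mm_packus_epi16 supplies this clamp
}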
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y.len());
    assert!(num <= cb.len());
    assert!(num <= cr.len());
    // _mm_loadu_si64 generates incorrect code for Rust <1.58. To circumvent this, we use a full
    // 128-bit load, but that requires leaving an extra vector of border to the scalar code.
    // From Rust 1.58 on, the _mm_loadu_si128 can be replaced with _mm_loadu_si64 and this
    // .saturating_sub() can be removed.
    let num_vecs = (num / 8).saturating_sub(1);

    for i in 0..num_vecs {
        const SHIFT: i32 = 6;
        // Load.
        let y = _mm_loadu_si128(y.as_ptr().wrapping_add(i * 8) as *const _);
        let cb = _mm_loadu_si128(cb.as_ptr().wrapping_add(i * 8) as *const _);
        let cr = _mm_loadu_si128(cr.as_ptr().wrapping_add(i * 8) as *const _);

        // Convert to 16 bit.
        let shuf16 = _mm_setr_epi8(
            0, -0x7F, 1, -0x7F, 2, -0x7F, 3, -0x7F, 4, -0x7F, 5, -0x7F, 6, -0x7F, 7, -0x7F,
        );
        let y = _mm_slli_epi16(_mm_shuffle_epi8(y, shuf16), SHIFT);
        let cb = _mm_slli_epi16(_mm_shuffle_epi8(cb, shuf16), SHIFT);
        let cr = _mm_slli_epi16(_mm_shuffle_epi8(cr, shuf16), SHIFT);

        // Add offsets
        let c128 = _mm_set1_epi16(128 << SHIFT);
        let y = _mm_adds_epi16(y, _mm_set1_epi16((1 << SHIFT) >> 1));
        let cb = _mm_subs_epi16(cb, c128);
        let cr = _mm_subs_epi16(cr, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
        let cr_140200 = _mm_adds_epi16(_mm_mulhrs_epi16(cr, _mm_set1_epi16(13173)), cr);
        let cb_034414 = _mm_mulhrs_epi16(cb, _mm_set1_epi16(11276));
        let cr_071414 = _mm_mulhrs_epi16(cr, _mm_set1_epi16(23401));
        let cb_177200 = _mm_adds_epi16(_mm_mulhrs_epi16(cb, _mm_set1_epi16(25297)), cb);

        // Last conversion step.
        let r = _mm_adds_epi16(y, cr_140200);
        let g = _mm_subs_epi16(y, _mm_adds_epi16(cb_034414, cr_071414));
        let b = _mm_adds_epi16(y, cb_177200);

        // Shift back and convert to u8.
        let zero = _mm_setzero_si128();
        let r = _mm_packus_epi16(_mm_srai_epi16(r, SHIFT), zero);
        let g = _mm_packus_epi16(_mm_srai_epi16(g, SHIFT), zero);
        let b = _mm_packus_epi16(_mm_srai_epi16(b, SHIFT), zero);

        // Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...

        // Control vectors for _mm_shuffle_epi8. -0x7F is selected so that the resulting position
        // after _mm_shuffle_epi8 will be filled with 0, so that the r, g, and b vectors can then
        // be OR-ed together.
        let shufr = _mm_setr_epi8(
            0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F, -0x7F, 5,
        );
        let shufg = _mm_setr_epi8(
            -0x7F, 0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F,
            -0x7F,
        );
        let shufb = _mm_alignr_epi8(shufg, shufg, 15);

        let rgb_low = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg), _mm_shuffle_epi8(b, shufb)),
        );

        // For the next part of the rgb vectors, we need to select R values from 6 up, G and B from
        // 5 up. The highest bit of -0x7F + 6 is still set, so the corresponding location will
        // still be 0.
        let shufr1 = _mm_add_epi8(shufb, _mm_set1_epi8(6));
        let shufg1 = _mm_add_epi8(shufr, _mm_set1_epi8(5));
        let shufb1 = _mm_add_epi8(shufg, _mm_set1_epi8(5));

        let rgb_hi = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr1),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg1), _mm_shuffle_epi8(b, shufb1)),
        );

        let mut data = [0u8; 32];
        _mm_storeu_si128(data.as_mut_ptr() as *mut _, rgb_low);
        _mm_storeu_si128(data.as_mut_ptr().wrapping_add(16) as *mut _, rgb_hi);
        std::ptr::copy_nonoverlapping::<u8>(
            data.as_ptr(),
            output.as_mut_ptr().wrapping_add(24 * i),
            24,
        );
    }

    num_vecs * 8
}
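
The fixed-point pipeline above approximates the standard JFIF YCbCr-to-RGB conversion, which in floating point is (illustrative reference, with rounding standing in for the (1 << SHIFT) >> 1 bias added to y):

fn ycbcr_to_rgb(y: u8, cb: u8, cr: u8) -> [u8; 3] {
    let (y, cb, cr) = (f32::from(y), f32::from(cb) - 128.0, f32::from(cr) - 128.0);
    let clamp = |v: f32| v.round().max(0.0).min(255.0) as u8;
    [
        clamp(y + 1.402 * cr),                  // R
        clamp(y - 0.34414 * cb - 0.71414 * cr), // G
        clamp(y + 1.772 * cb),                  // B
    ]
}

Because the function reports num_vecs * 8 converted pixels, the caller is expected to finish the remaining tail (at least one vector's worth, given the saturating_sub above) with the scalar path.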
