Skip to content

Commit 774806a

Browse files
committed
Address review comments.
1 parent 24e86e5 commit 774806a

File tree

6 files changed

+378
-297
lines changed

6 files changed

+378
-297
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,5 @@ name = "large_image"
3434
harness = false
3535

3636
[features]
37-
default = ["rayon"]
37+
default = ["rayon", "simd"]
38+
simd = []

src/arch/mod.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
mod ssse3;
2+
3+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
4+
use std::is_x86_feature_detected;
5+
6+
/// Arch-specific implementation of YCbCr conversion. Returns the number of pixels that were
7+
/// converted.
8+
pub fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
9+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
10+
#[allow(unsafe_code)]
11+
{
12+
if is_x86_feature_detected!("ssse3") {
13+
unsafe {
14+
return ssse3::color_convert_line_ycbcr(y, cb, cr, output);
15+
}
16+
}
17+
}
18+
return 0;
19+
}
20+
21+
/// Arch-specific implementation of 8x8 IDCT.
22+
pub fn dequantize_and_idct_block_8x8(
23+
coefficients: &[i16],
24+
quantization_table: &[u16; 64],
25+
output_linestride: usize,
26+
output: &mut [u8],
27+
) {
28+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
29+
#[allow(unsafe_code)]
30+
{
31+
if is_x86_feature_detected!("ssse3") {
32+
unsafe {
33+
ssse3::dequantize_and_idct_block_8x8(
34+
coefficients,
35+
quantization_table,
36+
output_linestride,
37+
output,
38+
);
39+
return;
40+
}
41+
}
42+
}
43+
unreachable!("No arch-specific IDCT available");
44+
}
45+
46+
/// Returns `true` if an arch-specific IDCT is available on the running CPU, `false` otherwise.
///
/// On x86 / x86_64 this is a runtime check for SSSE3; on every other target it is `false`.
pub fn has_arch_specific_idct() -> bool {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[allow(unsafe_code)]
    let available = is_x86_feature_detected!("ssse3");

    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    let available = false;

    available
}

src/arch/ssse3.rs

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
#[cfg(target_arch = "x86")]
2+
use std::arch::x86::*;
3+
#[cfg(target_arch = "x86_64")]
4+
use std::arch::x86_64::*;
5+
6+
/// In-place 8-point IDCT pass over `data`, computed independently in every 16-bit lane.
///
/// `data[k]` supplies input `k` for all eight lanes and is overwritten with output `k`. All
/// additions and subtractions saturate to the `i16` range.
///
/// The fixed-point constants are the fractional parts of the constants used by the non-SIMD
/// implementation, scaled by `1 << 15`: `_mm_mulhrs_epi16(a, b)` is effectively `(a * b) >> 15`
/// (apart from rounding). Integer parts are added back with explicit saturating adds.
///
/// # Safety
///
/// The caller must ensure that the CPU supports SSSE3.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
#[allow(unsafe_code)]
unsafe fn idct8(data: &mut [__m128i; 8]) {
    let [in0, in1, in2, in3, in4, in5, in6, in7] = *data;

    // ---- Even inputs (0, 2, 4, 6) ----
    let rot26 = _mm_mulhrs_epi16(_mm_adds_epi16(in2, in6), _mm_set1_epi16(17734)); // 0.5411961
    let even_a = _mm_subs_epi16(
        _mm_subs_epi16(rot26, in6),
        _mm_mulhrs_epi16(in6, _mm_set1_epi16(27779)), // 0.847759065
    );
    let even_b = _mm_adds_epi16(rot26, _mm_mulhrs_epi16(in2, _mm_set1_epi16(25079))); // 0.765366865

    let sum04 = _mm_adds_epi16(in0, in4);
    let diff04 = _mm_subs_epi16(in0, in4);

    let even0 = _mm_adds_epi16(sum04, even_b);
    let even3 = _mm_subs_epi16(sum04, even_b);
    let even1 = _mm_adds_epi16(diff04, even_a);
    let even2 = _mm_subs_epi16(diff04, even_a);

    // ---- Odd inputs (1, 3, 5, 7) ----
    let sum73 = _mm_adds_epi16(in7, in3);
    let sum51 = _mm_adds_epi16(in5, in1);
    let sum71 = _mm_adds_epi16(in7, in1);
    let sum53 = _mm_adds_epi16(in5, in3);
    let sum_all = _mm_adds_epi16(sum73, sum51);
    let base = _mm_adds_epi16(
        sum_all,
        _mm_mulhrs_epi16(sum_all, _mm_set1_epi16(5763)), // 0.175875602
    );

    // Per-input scaling; whole-number parts of the multipliers are applied as repeated adds.
    let w7 = _mm_mulhrs_epi16(in7, _mm_set1_epi16(9786)); // 0.298631336
    let w5 = _mm_adds_epi16(
        _mm_adds_epi16(in5, in5),
        _mm_mulhrs_epi16(in5, _mm_set1_epi16(1741)), // 0.053119869
    );
    let w3 = _mm_adds_epi16(
        _mm_adds_epi16(in3, _mm_adds_epi16(in3, in3)),
        _mm_mulhrs_epi16(in3, _mm_set1_epi16(2383)), // 0.072711026
    );
    let w1 = _mm_adds_epi16(in1, _mm_mulhrs_epi16(in1, _mm_set1_epi16(16427))); // 0.501321110

    let q71 = _mm_subs_epi16(base, _mm_mulhrs_epi16(sum71, _mm_set1_epi16(29490))); // 0.899976223
    let q53 = _mm_subs_epi16(
        _mm_subs_epi16(_mm_subs_epi16(base, sum53), sum53),
        _mm_mulhrs_epi16(sum53, _mm_set1_epi16(18446)), // 0.562915447
    );
    let q73 = _mm_subs_epi16(
        _mm_mulhrs_epi16(sum73, _mm_set1_epi16(-31509)), // -0.961570560
        sum73,
    );
    let q51 = _mm_mulhrs_epi16(sum51, _mm_set1_epi16(-12785)); // -0.390180644

    let odd1 = _mm_adds_epi16(_mm_adds_epi16(q71, q51), w1);
    let odd3 = _mm_adds_epi16(_mm_adds_epi16(q53, q73), w3);
    let odd5 = _mm_adds_epi16(_mm_adds_epi16(q53, q51), w5);
    let odd7 = _mm_adds_epi16(_mm_adds_epi16(q71, q73), w7);

    // ---- Final butterfly: output k and output 7-k share one even/odd pair ----
    *data = [
        _mm_adds_epi16(even0, odd1),
        _mm_adds_epi16(even1, odd3),
        _mm_adds_epi16(even2, odd5),
        _mm_adds_epi16(even3, odd7),
        _mm_subs_epi16(even3, odd7),
        _mm_subs_epi16(even2, odd5),
        _mm_subs_epi16(even1, odd3),
        _mm_subs_epi16(even0, odd1),
    ];
}
83+
84+
/// Transposes, in place, the 8x8 matrix of 16-bit values held as eight row vectors.
///
/// After the call, lane `c` of `data[r]` holds what was previously lane `r` of `data[c]`.
///
/// # Safety
///
/// The caller must ensure that the CPU supports SSSE3.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
#[allow(unsafe_code)]
unsafe fn transpose8(data: &mut [__m128i; 8]) {
    let [r0, r1, r2, r3, r4, r5, r6, r7] = *data;

    // Stage 1: interleave 16-bit lanes of neighbouring rows.
    let lo01 = _mm_unpacklo_epi16(r0, r1);
    let lo23 = _mm_unpacklo_epi16(r2, r3);
    let lo45 = _mm_unpacklo_epi16(r4, r5);
    let lo67 = _mm_unpacklo_epi16(r6, r7);
    let hi01 = _mm_unpackhi_epi16(r0, r1);
    let hi23 = _mm_unpackhi_epi16(r2, r3);
    let hi45 = _mm_unpackhi_epi16(r4, r5);
    let hi67 = _mm_unpackhi_epi16(r6, r7);

    // Stage 2: interleave 32-bit pairs, giving four-row column fragments.
    let top_c01 = _mm_unpacklo_epi32(lo01, lo23);
    let top_c23 = _mm_unpackhi_epi32(lo01, lo23);
    let bot_c01 = _mm_unpacklo_epi32(lo45, lo67);
    let bot_c23 = _mm_unpackhi_epi32(lo45, lo67);
    let top_c45 = _mm_unpacklo_epi32(hi01, hi23);
    let top_c67 = _mm_unpackhi_epi32(hi01, hi23);
    let bot_c45 = _mm_unpacklo_epi32(hi45, hi67);
    let bot_c67 = _mm_unpackhi_epi32(hi45, hi67);

    // Stage 3: join the upper-four-row and lower-four-row halves of each column.
    *data = [
        _mm_unpacklo_epi64(top_c01, bot_c01),
        _mm_unpackhi_epi64(top_c01, bot_c01),
        _mm_unpacklo_epi64(top_c23, bot_c23),
        _mm_unpackhi_epi64(top_c23, bot_c23),
        _mm_unpacklo_epi64(top_c45, bot_c45),
        _mm_unpackhi_epi64(top_c45, bot_c45),
        _mm_unpacklo_epi64(top_c67, bot_c67),
        _mm_unpackhi_epi64(top_c67, bot_c67),
    ];
}
113+
114+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
115+
#[target_feature(enable = "ssse3")]
116+
#[allow(unsafe_code)]
117+
pub unsafe fn dequantize_and_idct_block_8x8(
118+
coefficients: &[i16],
119+
quantization_table: &[u16; 64],
120+
output_linestride: usize,
121+
output: &mut [u8],
122+
) {
123+
assert!(coefficients.len() >= 64);
124+
// The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
125+
// for 0<=i<8. Thus, the last accessed position is at an offset of output_linestrade * 7 + 7,
126+
// and if that position is in-bounds, so are all other accesses.
127+
assert!(
128+
output.len()
129+
> output_linestride
130+
.checked_mul(7)
131+
.unwrap()
132+
.checked_add(7)
133+
.unwrap()
134+
);
135+
136+
#[cfg(target_arch = "x86")]
137+
use std::arch::x86::*;
138+
#[cfg(target_arch = "x86_64")]
139+
use std::arch::x86_64::*;
140+
141+
const SHIFT: i32 = 3;
142+
143+
let mut data = [_mm_setzero_si128(); 8];
144+
for i in 0..8 {
145+
data[i] = _mm_slli_epi16(
146+
_mm_mullo_epi16(
147+
_mm_loadu_si128(coefficients.as_ptr().wrapping_add(i * 8) as *const _),
148+
_mm_loadu_si128(quantization_table.as_ptr().wrapping_add(i * 8) as *const _),
149+
),
150+
SHIFT,
151+
);
152+
}
153+
154+
idct8(&mut data);
155+
transpose8(&mut data);
156+
idct8(&mut data);
157+
transpose8(&mut data);
158+
159+
for i in 0..8 {
160+
let mut buf = [0u8; 16];
161+
_mm_storeu_si128(
162+
buf.as_mut_ptr() as *mut _,
163+
_mm_packus_epi16(
164+
_mm_srai_epi16(
165+
_mm_adds_epi16(data[i], _mm_set1_epi16(257 << (SHIFT + 2))),
166+
SHIFT + 3,
167+
),
168+
_mm_setzero_si128(),
169+
),
170+
);
171+
std::ptr::copy_nonoverlapping::<u8>(
172+
buf.as_ptr(),
173+
output.as_mut_ptr().wrapping_add(output_linestride * i) as *mut _,
174+
8,
175+
);
176+
}
177+
}
178+
179+
/// Converts planar YCbCr samples to interleaved RGB using SSSE3, eight pixels per iteration.
///
/// `output` receives `R, G, B` byte triples. Only whole groups of eight pixels are converted;
/// a trailing remainder of fewer than eight pixels is left untouched for the caller to handle.
///
/// Returns the number of pixels that were converted (always a multiple of eight).
///
/// # Panics
///
/// Panics if `output.len()` is not a multiple of 3, or if any of `y`, `cb`, `cr` holds fewer
/// than `output.len() / 3` samples.
///
/// # Safety
///
/// The caller must ensure that the CPU supports SSSE3.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
#[allow(unsafe_code)]
pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y.len());
    assert!(num <= cb.len());
    assert!(num <= cr.len());
    let num_vecs = num / 8;

    // Number of fractional bits used by the fixed-point arithmetic.
    const SHIFT: i32 = 6;
    // Shuffle masks turning rrrrrrrr / gggggggg / bbbbbbbb into rgbrgbrgb...; a mask byte with
    // the top bit set (0x80) produces a zero byte.
    const SHUF_R: [u8; 16] = [
        0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80, 5,
    ];
    const SHUF_G: [u8; 16] = [
        0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80,
    ];

    // Loop-invariant vectors.
    let zero = _mm_setzero_si128();
    let c128 = _mm_set1_epi16(128 << SHIFT);
    let rounding = _mm_set1_epi16((1 << SHIFT) >> 1);

    // Masks for the first 16 output bytes (pixels 0..=4 plus the R of pixel 5).
    let shufr = _mm_loadu_si128(SHUF_R.as_ptr() as *const _);
    let shufg = _mm_loadu_si128(SHUF_G.as_ptr() as *const _);
    // The blue mask is the green mask rotated one byte towards higher positions.
    let shufb = _mm_alignr_epi8(shufg, shufg, 15);
    // Masks for the remaining 8 output bytes (G, B of pixel 5, then pixels 6 and 7).
    let shufr1 = _mm_add_epi8(shufb, _mm_set1_epi8(6));
    let shufg1 = _mm_add_epi8(shufr, _mm_set1_epi8(5));
    let shufb1 = _mm_add_epi8(shufg, _mm_set1_epi8(5));

    // `output.chunks_exact_mut(24)` yields exactly `num_vecs` chunks, and the asserts above
    // guarantee each input plane has at least that many 8-sample chunks.
    let planes = y
        .chunks_exact(8)
        .zip(cb.chunks_exact(8))
        .zip(cr.chunks_exact(8));
    for (((y8, cb8), cr8), rgb) in planes.zip(output.chunks_exact_mut(24)) {
        // Load 8 samples per plane into the LOW 64 bits of a vector (upper half zeroed).
        let y_raw = _mm_loadl_epi64(y8.as_ptr() as *const __m128i);
        let cb_raw = _mm_loadl_epi64(cb8.as_ptr() as *const __m128i);
        let cr_raw = _mm_loadl_epi64(cr8.as_ptr() as *const __m128i);

        // Widen to 16 bit and scale up. The samples sit in the low half, so the *low*
        // unpack must be used here; the high unpack would only ever see zeros.
        let y16 = _mm_slli_epi16(_mm_unpacklo_epi8(y_raw, zero), SHIFT);
        let cb16 = _mm_slli_epi16(_mm_unpacklo_epi8(cb_raw, zero), SHIFT);
        let cr16 = _mm_slli_epi16(_mm_unpacklo_epi8(cr_raw, zero), SHIFT);

        // Add the rounding offset to luma and re-centre chroma around zero.
        let y16 = _mm_adds_epi16(y16, rounding);
        let cb16 = _mm_subs_epi16(cb16, c128);
        let cr16 = _mm_subs_epi16(cr16, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
        let cr_140200 = _mm_adds_epi16(_mm_mulhrs_epi16(cr16, _mm_set1_epi16(13173)), cr16);
        let cb_034414 = _mm_mulhrs_epi16(cb16, _mm_set1_epi16(11276));
        let cr_071414 = _mm_mulhrs_epi16(cr16, _mm_set1_epi16(23401));
        let cb_177200 = _mm_adds_epi16(_mm_mulhrs_epi16(cb16, _mm_set1_epi16(25297)), cb16);

        // Last conversion step.
        let r = _mm_adds_epi16(y16, cr_140200);
        let g = _mm_subs_epi16(y16, _mm_adds_epi16(cb_034414, cr_071414));
        let b = _mm_adds_epi16(y16, cb_177200);

        // Shift back and convert to u8 (saturating); bytes 8..16 of each vector are zero.
        let r = _mm_packus_epi16(_mm_srai_epi16(r, SHIFT), zero);
        let g = _mm_packus_epi16(_mm_srai_epi16(g, SHIFT), zero);
        let b = _mm_packus_epi16(_mm_srai_epi16(b, SHIFT), zero);

        // Interleave into rgbrgb... order.
        let rgb_low = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg), _mm_shuffle_epi8(b, shufb)),
        );
        let rgb_hi = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr1),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg1), _mm_shuffle_epi8(b, shufb1)),
        );

        // Two 16-byte stores go to a scratch buffer; only the first 24 bytes are pixel data.
        let mut staging = [0u8; 32];
        _mm_storeu_si128(staging.as_mut_ptr() as *mut _, rgb_low);
        _mm_storeu_si128(staging.as_mut_ptr().wrapping_add(16) as *mut _, rgb_hi);
        rgb.copy_from_slice(&staging[..24]);
    }

    num_vecs * 8
}

0 commit comments

Comments
 (0)