Skip to content

Commit e831f95

Browse files
committed
SSSE3 YCbCr conversion
Compared with baseline, no rayon: decode a 2268x1512 JPEG time: [35.694 ms 35.736 ms 35.778 ms] change: [-23.510% -23.401% -23.290%] (p = 0.00 < 0.05)
1 parent 6806e03 commit e831f95

File tree

3 files changed

+120
-3
lines changed

3 files changed

+120
-3
lines changed

Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ png = "0.16"
1818
walkdir = "2.0"
1919
criterion = "0.3"
2020

21+
[profile.bench]
22+
debug = true
23+
24+
[profile.release]
25+
debug = true
26+
27+
2128
[[bench]]
2229
name = "decoding_benchmark"
2330
harness = false

src/decoder.rs

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ use parser::{
1616
};
1717
use std::convert::TryInto;
1818
use std::io::Read;
19+
use std::is_x86_feature_detected;
1920
use upsampler::Upsampler;
2021
use worker::{PlatformWorker, RowData, Worker};
2122

@@ -1253,15 +1254,124 @@ fn color_convert_line_rgb(data: &[Vec<u8>], output: &mut [u8]) {
12531254
}
12541255
}
12551256

1257+
/// Converts planar Y/Cb/Cr samples to interleaved RGB using SSSE3, eight pixels at a time.
///
/// Only whole groups of eight pixels are converted; the remaining `num % 8` pixels at the end
/// of `output` are left untouched so the caller can finish them with scalar code.
///
/// The arithmetic is fixed point with 6 fractional bits: samples are widened to 16 bit and
/// shifted left by `SHIFT`, the chroma channels are re-centred around zero, the weighted sums
/// are formed with saturating adds, and the result is shifted back and clamped to `0..=255`.
///
/// Returns the number of pixels written (always a multiple of 8).
///
/// # Panics
///
/// Panics if `output.len()` is not a multiple of 3, or if any of `y`, `cb`, `cr` holds fewer
/// than `output.len() / 3` samples.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSSE3, e.g. by checking
/// `is_x86_feature_detected!("ssse3")` first.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
// SIMD intrinsics need `unsafe`; this kernel is a deliberate exception to the crate-level lint.
#[allow(unsafe_code)]
unsafe fn color_convert_line_ycbcr_ssse3(
    y: &[u8],
    cb: &[u8],
    cr: &[u8],
    output: &mut [u8],
) -> usize {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    /// Number of fractional bits used by the fixed-point arithmetic.
    const SHIFT: i32 = 6;
    /// Shuffle-control byte with the high bit set: `_mm_shuffle_epi8` writes zero there.
    const Z: i8 = -128;

    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y.len());
    assert!(num <= cb.len());
    assert!(num <= cr.len());
    let num_vecs = num / 8;
    let converted = num_vecs * 8;

    // Loop-invariant constants.
    let zero = _mm_setzero_si128();
    let c128 = _mm_set1_epi16(128 << SHIFT);
    let rounding = _mm_set1_epi16((1 << SHIFT) >> 1);
    // Fixed-point coefficients for `_mm_mulhrs_epi16` (value / 32768). Factors above 1.0 are
    // expressed as "fraction + 1" by adding the operand back after the multiply.
    let k_0_402 = _mm_set1_epi16(13173);
    let k_0_34414 = _mm_set1_epi16(11276);
    let k_0_71414 = _mm_set1_epi16(23401);
    let k_0_772 = _mm_set1_epi16(25297);

    // Shuffle masks turning rrrrrrrr / gggggggg / bbbbbbbb into rgbrgbrgb...
    // The first three masks build output bytes 0..16 (r0 g0 b0 ... r5).
    let shufr = _mm_setr_epi8(0, Z, Z, 1, Z, Z, 2, Z, Z, 3, Z, Z, 4, Z, Z, 5);
    let shufg = _mm_setr_epi8(Z, 0, Z, Z, 1, Z, Z, 2, Z, Z, 3, Z, Z, 4, Z, Z);
    // Rotating the green mask by one byte gives the blue mask.
    let shufb = _mm_alignr_epi8(shufg, shufg, 15);
    // The next three build output bytes 16..32 (g5 b5 r6 g6 b6 r7 g7 b7, then padding).
    // Adding a small offset keeps the high bit of the `Z` lanes set, so they stay zeroed.
    let shufr1 = _mm_add_epi8(shufb, _mm_set1_epi8(6));
    let shufg1 = _mm_add_epi8(shufr, _mm_set1_epi8(5));
    let shufb1 = _mm_add_epi8(shufg, _mm_set1_epi8(5));

    let planes = y[..converted]
        .chunks_exact(8)
        .zip(cb[..converted].chunks_exact(8))
        .zip(cr[..converted].chunks_exact(8));

    for (((y8, cb8), cr8), rgb) in planes.zip(output.chunks_exact_mut(24)) {
        // Load 8 samples per plane into the LOW 64 bits of each register (upper half zeroed).
        // SAFETY: `chunks_exact(8)` guarantees each chunk is exactly 8 readable bytes, and
        // `_mm_loadl_epi64` reads only 8 bytes with no alignment requirement.
        let y = _mm_loadl_epi64(y8.as_ptr() as *const __m128i);
        let cb = _mm_loadl_epi64(cb8.as_ptr() as *const __m128i);
        let cr = _mm_loadl_epi64(cr8.as_ptr() as *const __m128i);

        // Widen to 16 bit. The samples live in the low half, so interleave the LOW bytes
        // with zero; interleaving the high half would only see the zero padding.
        let y = _mm_slli_epi16(_mm_unpacklo_epi8(y, zero), SHIFT);
        let cb = _mm_slli_epi16(_mm_unpacklo_epi8(cb, zero), SHIFT);
        let cr = _mm_slli_epi16(_mm_unpacklo_epi8(cr, zero), SHIFT);

        // Add the rounding offset to luma and re-centre chroma around zero.
        let y = _mm_adds_epi16(y, rounding);
        let cb = _mm_subs_epi16(cb, c128);
        let cr = _mm_subs_epi16(cr, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
        let cr_140200 = _mm_adds_epi16(_mm_mulhrs_epi16(cr, k_0_402), cr);
        let cb_034414 = _mm_mulhrs_epi16(cb, k_0_34414);
        let cr_071414 = _mm_mulhrs_epi16(cr, k_0_71414);
        let cb_177200 = _mm_adds_epi16(_mm_mulhrs_epi16(cb, k_0_772), cb);

        // Last conversion step.
        let r = _mm_adds_epi16(y, cr_140200);
        let g = _mm_subs_epi16(y, _mm_adds_epi16(cb_034414, cr_071414));
        let b = _mm_adds_epi16(y, cb_177200);

        // Shift back and convert to u8 (unsigned saturation clamps to 0..=255).
        let r = _mm_packus_epi16(_mm_srai_epi16(r, SHIFT), zero);
        let g = _mm_packus_epi16(_mm_srai_epi16(g, SHIFT), zero);
        let b = _mm_packus_epi16(_mm_srai_epi16(b, SHIFT), zero);

        let rgb_low = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg), _mm_shuffle_epi8(b, shufb)),
        );
        let rgb_hi = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr1),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg1), _mm_shuffle_epi8(b, shufb1)),
        );

        // The two vectors hold 32 bytes but only 24 are pixel data, so stage them on the
        // stack and copy just the valid prefix into the output.
        let mut data = [0u8; 32];
        // SAFETY: `data` is 32 bytes, so both unaligned 16-byte stores are in bounds.
        _mm_storeu_si128(data.as_mut_ptr() as *mut __m128i, rgb_low);
        _mm_storeu_si128(data.as_mut_ptr().wrapping_add(16) as *mut __m128i, rgb_hi);
        rgb.copy_from_slice(&data[..24]);
    }

    converted
}
1354+
12561355
fn color_convert_line_ycbcr(data: &[Vec<u8>], output: &mut [u8]) {
12571356
assert!(data.len() == 3, "wrong number of components for ycbcr");
12581357
let [y, cb, cr]: &[_; 3] = data.try_into().unwrap();
12591358

1359+
let mut skip = 0usize;
1360+
1361+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1362+
#[allow(unsafe_code)]
1363+
{
1364+
if is_x86_feature_detected!("ssse3") {
1365+
skip = unsafe { color_convert_line_ycbcr_ssse3(y, cb, cr, output) };
1366+
}
1367+
}
1368+
12601369
for (((chunk, y), cb), cr) in output
12611370
.chunks_exact_mut(3)
12621371
.zip(y.iter())
12631372
.zip(cb.iter())
12641373
.zip(cr.iter())
1374+
.skip(skip)
12651375
{
12661376
let (r, g, b) = ycbcr_to_rgb(*y, *cb, *cr);
12671377
chunk[0] = r;

src/lib.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,12 @@
2727
//! ```
2828
2929
#![deny(missing_docs)]
30-
#![forbid(unsafe_code)]
30+
#![deny(unsafe_code)]
3131

32-
extern crate core;
3332
extern crate alloc;
33+
extern crate core;
3434

35-
#[cfg(feature="rayon")]
35+
#[cfg(feature = "rayon")]
3636
extern crate rayon;
3737

3838
pub use decoder::{Decoder, ImageInfo, PixelFormat};

0 commit comments

Comments
 (0)