Skip to content

Commit 509ec66

Browse files
author
Andreas Molzer
authored
Merge pull request #264 from dustletter/wasm-simd
Wasm simd
2 parents 6d1bd9e + 4035b7e commit 509ec66

File tree

3 files changed

+287
-0
lines changed

3 files changed

+287
-0
lines changed

src/arch/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
mod neon;
44
mod ssse3;
5+
mod wasm;
56

67
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
78
use std::is_x86_feature_detected;
@@ -22,6 +23,10 @@ pub fn get_color_convert_line_ycbcr() -> Option<unsafe fn(&[u8], &[u8], &[u8], &
2223
{
2324
return Some(neon::color_convert_line_ycbcr);
2425
}
26+
#[cfg(all(target_feature = "simd128", target_arch = "wasm32"))]
27+
{
28+
return Some(wasm::color_convert_line_ycbcr);
29+
}
2530
#[allow(unreachable_code)]
2631
None
2732
}
@@ -41,6 +46,10 @@ pub fn get_dequantize_and_idct_block_8x8(
4146
{
4247
return Some(neon::dequantize_and_idct_block_8x8);
4348
}
49+
#[cfg(all(target_feature = "simd128", target_arch = "wasm32"))]
50+
{
51+
return Some(wasm::dequantize_and_idct_block_8x8);
52+
}
4453
#[allow(unreachable_code)]
4554
None
4655
}

src/arch/wasm.rs

Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
#[cfg(target_arch = "wasm32")]
2+
use std::arch::wasm32::*;
3+
4+
#[cfg(target_arch = "wasm32")]
5+
#[target_feature(enable = "simd128")]
6+
fn idct8(data: &mut [v128; 8]) {
7+
// The fixed-point constants here are obtained by taking the fractional part of the constants
8+
// from the non-SIMD implementation and scaling them up by 1<<15. This is because
9+
// i16x8_q15mulr_sat(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
10+
// slight differences in rounding).
11+
12+
// The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
13+
// doesn't apply any further scaling and fixed point constants have a different precision.
14+
15+
let p2 = data[2];
16+
let p3 = data[6];
17+
let p1 = i16x8_q15mulr_sat(i16x8_add_sat(p2, p3), i16x8_splat(17734)); // 0.5411961
18+
let t2 = i16x8_sub_sat(
19+
i16x8_sub_sat(p1, p3),
20+
i16x8_q15mulr_sat(p3, i16x8_splat(27779)), // 0.847759065
21+
);
22+
let t3 = i16x8_add_sat(p1, i16x8_q15mulr_sat(p2, i16x8_splat(25079))); // 0.765366865
23+
24+
let p2 = data[0];
25+
let p3 = data[4];
26+
let t0 = i16x8_add_sat(p2, p3);
27+
let t1 = i16x8_sub_sat(p2, p3);
28+
29+
let x0 = i16x8_add_sat(t0, t3);
30+
let x3 = i16x8_sub_sat(t0, t3);
31+
let x1 = i16x8_add_sat(t1, t2);
32+
let x2 = i16x8_sub_sat(t1, t2);
33+
34+
let t0 = data[7];
35+
let t1 = data[5];
36+
let t2 = data[3];
37+
let t3 = data[1];
38+
39+
let p3 = i16x8_add_sat(t0, t2);
40+
let p4 = i16x8_add_sat(t1, t3);
41+
let p1 = i16x8_add_sat(t0, t3);
42+
let p2 = i16x8_add_sat(t1, t2);
43+
let p5 = i16x8_add_sat(p3, p4);
44+
let p5 = i16x8_add_sat(p5, i16x8_q15mulr_sat(p5, i16x8_splat(5763))); // 0.175875602
45+
46+
let t0 = i16x8_q15mulr_sat(t0, i16x8_splat(9786)); // 0.298631336
47+
let t1 = i16x8_add_sat(
48+
i16x8_add_sat(t1, t1),
49+
i16x8_q15mulr_sat(t1, i16x8_splat(1741)), // 0.053119869
50+
);
51+
let t2 = i16x8_add_sat(
52+
i16x8_add_sat(t2, i16x8_add_sat(t2, t2)),
53+
i16x8_q15mulr_sat(t2, i16x8_splat(2383)), // 0.072711026
54+
);
55+
let t3 = i16x8_add_sat(t3, i16x8_q15mulr_sat(t3, i16x8_splat(16427))); // 0.501321110
56+
57+
let p1 = i16x8_sub_sat(p5, i16x8_q15mulr_sat(p1, i16x8_splat(29490))); // 0.899976223
58+
let p2 = i16x8_sub_sat(
59+
i16x8_sub_sat(i16x8_sub_sat(p5, p2), p2),
60+
i16x8_q15mulr_sat(p2, i16x8_splat(18446)), // 0.562915447
61+
);
62+
63+
let p3 = i16x8_sub_sat(
64+
i16x8_q15mulr_sat(p3, i16x8_splat(-31509)), // -0.961570560
65+
p3,
66+
);
67+
let p4 = i16x8_q15mulr_sat(p4, i16x8_splat(-12785)); // -0.390180644
68+
69+
let t3 = i16x8_add_sat(i16x8_add_sat(p1, p4), t3);
70+
let t2 = i16x8_add_sat(i16x8_add_sat(p2, p3), t2);
71+
let t1 = i16x8_add_sat(i16x8_add_sat(p2, p4), t1);
72+
let t0 = i16x8_add_sat(i16x8_add_sat(p1, p3), t0);
73+
74+
data[0] = i16x8_add_sat(x0, t3);
75+
data[7] = i16x8_sub_sat(x0, t3);
76+
data[1] = i16x8_add_sat(x1, t2);
77+
data[6] = i16x8_sub_sat(x1, t2);
78+
data[2] = i16x8_add_sat(x2, t1);
79+
data[5] = i16x8_sub_sat(x2, t1);
80+
data[3] = i16x8_add_sat(x3, t0);
81+
data[4] = i16x8_sub_sat(x3, t0);
82+
}
83+
84+
#[cfg(target_arch = "wasm32")]
85+
#[target_feature(enable = "simd128")]
86+
fn transpose8(data: &mut [v128; 8]) {
87+
// Transpose an 8x8 matrix with a sequence of interleaving operations.
88+
// Naming: dABl contains elements from the *l*ower halves of vectors A and B, interleaved, i.e.
89+
// A0 B0 A1 B1 ...
90+
// dABCDll contains elements from the lower quarter (ll) of vectors A, B, C, D, interleaved -
91+
// A0 B0 C0 D0 A1 B1 C1 D1 ...
92+
let d01l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[0], data[1]);
93+
let d23l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[2], data[3]);
94+
let d45l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[4], data[5]);
95+
let d67l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[6], data[7]);
96+
let d01h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[0], data[1]);
97+
let d23h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[2], data[3]);
98+
let d45h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[4], data[5]);
99+
let d67h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[6], data[7]);
100+
101+
// Operating on 32-bits will interleave *consecutive pairs* of 16-bit integers.
102+
let d0123ll = i32x4_shuffle::<0, 4, 1, 5>(d01l, d23l);
103+
let d0123lh = i32x4_shuffle::<2, 6, 3, 7>(d01l, d23l);
104+
let d4567ll = i32x4_shuffle::<0, 4, 1, 5>(d45l, d67l);
105+
let d4567lh = i32x4_shuffle::<2, 6, 3, 7>(d45l, d67l);
106+
let d0123hl = i32x4_shuffle::<0, 4, 1, 5>(d01h, d23h);
107+
let d0123hh = i32x4_shuffle::<2, 6, 3, 7>(d01h, d23h);
108+
let d4567hl = i32x4_shuffle::<0, 4, 1, 5>(d45h, d67h);
109+
let d4567hh = i32x4_shuffle::<2, 6, 3, 7>(d45h, d67h);
110+
111+
// Operating on 64-bits will interleave *consecutive quadruples* of 16-bit integers.
112+
data[0] = i64x2_shuffle::<0, 2>(d0123ll, d4567ll);
113+
data[1] = i64x2_shuffle::<1, 3>(d0123ll, d4567ll);
114+
data[2] = i64x2_shuffle::<0, 2>(d0123lh, d4567lh);
115+
data[3] = i64x2_shuffle::<1, 3>(d0123lh, d4567lh);
116+
data[4] = i64x2_shuffle::<0, 2>(d0123hl, d4567hl);
117+
data[5] = i64x2_shuffle::<1, 3>(d0123hl, d4567hl);
118+
data[6] = i64x2_shuffle::<0, 2>(d0123hh, d4567hh);
119+
data[7] = i64x2_shuffle::<1, 3>(d0123hh, d4567hh);
120+
}
121+
122+
#[cfg(target_arch = "wasm32")]
123+
#[target_feature(enable = "simd128")]
124+
pub fn dequantize_and_idct_block_8x8(
125+
coefficients: &[i16; 64],
126+
quantization_table: &[u16; 64],
127+
output_linestride: usize,
128+
output: &mut [u8],
129+
) {
130+
// The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
131+
// for 0<=i<8. Thus, the last accessed position is at an offset of output_linestride * 7 + 7,
132+
// and if that position is in-bounds, so are all other accesses.
133+
assert!(
134+
output.len()
135+
> output_linestride
136+
.checked_mul(7)
137+
.unwrap()
138+
.checked_add(7)
139+
.unwrap()
140+
);
141+
142+
const SHIFT: u32 = 3;
143+
144+
// Read the DCT coefficients, scale them up and dequantize them.
145+
let mut data = [i16x8_splat(0); 8];
146+
unsafe {
147+
for i in 0..8 {
148+
data[i] = i16x8_shl(
149+
i16x8_mul(
150+
v128_load(coefficients.as_ptr().wrapping_add(i * 8) as *const _),
151+
v128_load(quantization_table.as_ptr().wrapping_add(i * 8) as *const _),
152+
),
153+
SHIFT,
154+
);
155+
}
156+
}
157+
158+
// Usual column IDCT - transpose - column IDCT - transpose approach.
159+
idct8(&mut data);
160+
transpose8(&mut data);
161+
idct8(&mut data);
162+
transpose8(&mut data);
163+
164+
for i in 0..8 {
165+
// The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
166+
// increased by 3.
167+
// As values will be stored in a u8, they need to be 128-centered and not 0-centered.
168+
// We add 128 with the appropriate shift for that purpose.
169+
const OFFSET: i16 = 128 << (SHIFT + 3);
170+
// We want rounding right shift, so we should add (1/2) << (SHIFT+3) before shifting.
171+
const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;
172+
173+
let data_with_offset = i16x8_add_sat(data[i], i16x8_splat(OFFSET + ROUNDING_BIAS));
174+
175+
// SAFETY: the assert at the start of this function ensures
176+
// `output_linestride * i + 7` < output.len(), so all accesses are in-bounds.
177+
unsafe {
178+
v128_store64_lane::<0>(
179+
u8x16_narrow_i16x8(
180+
i16x8_shr(data_with_offset, SHIFT + 3),
181+
i16x8_splat(0),
182+
),
183+
output.as_mut_ptr().wrapping_add(output_linestride * i) as *mut _,
184+
);
185+
}
186+
}
187+
}
188+
189+
#[cfg(target_arch = "wasm32")]
190+
#[target_feature(enable = "simd128")]
191+
pub fn color_convert_line_ycbcr(y_slice: &[u8], cb_slice: &[u8], cr_slice: &[u8], output: &mut [u8]) -> usize {
192+
193+
assert!(output.len() % 3 == 0);
194+
let num = output.len() / 3;
195+
assert!(num <= y_slice.len());
196+
assert!(num <= cb_slice.len());
197+
assert!(num <= cr_slice.len());
198+
199+
let num_vecs = num / 8;
200+
201+
for i in 0..num_vecs {
202+
const SHIFT: u32 = 6;
203+
// Load.
204+
let y: v128;
205+
let cb: v128;
206+
let cr: v128;
207+
// SAFETY: i is at most `num / 8 - 1`, so the highest v128_load64_zero reads the 8 bytes
208+
// at [8 * i, 8 * i + 8), which end no later than `num`. The above asserts ensure this is in-bounds.
209+
unsafe {
210+
y = v128_load64_zero(y_slice.as_ptr().wrapping_add(i * 8) as *const _);
211+
cb = v128_load64_zero(cb_slice.as_ptr().wrapping_add(i * 8) as *const _);
212+
cr = v128_load64_zero(cr_slice.as_ptr().wrapping_add(i * 8) as *const _);
213+
}
214+
215+
// Convert to 16 bit.
216+
let y = i16x8_shl(i16x8_extend_low_u8x16(y), SHIFT);
217+
let cb = i16x8_shl(i16x8_extend_low_u8x16(cb), SHIFT);
218+
let cr = i16x8_shl(i16x8_extend_low_u8x16(cr), SHIFT);
219+
220+
// Add offsets
221+
let c128 = i16x8_splat(128 << SHIFT);
222+
let y = i16x8_add_sat(y, i16x8_splat((1 << SHIFT) >> 1));
223+
let cb = i16x8_sub_sat(cb, c128);
224+
let cr = i16x8_sub_sat(cr, c128);
225+
226+
// Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
227+
let cr_140200 = i16x8_add_sat(i16x8_q15mulr_sat(cr, i16x8_splat(13173)), cr);
228+
let cb_034414 = i16x8_q15mulr_sat(cb, i16x8_splat(11276));
229+
let cr_071414 = i16x8_q15mulr_sat(cr, i16x8_splat(23401));
230+
let cb_177200 = i16x8_add_sat(i16x8_q15mulr_sat(cb, i16x8_splat(25297)), cb);
231+
232+
// Last conversion step.
233+
let r = i16x8_add_sat(y, cr_140200);
234+
let g = i16x8_sub_sat(y, i16x8_add_sat(cb_034414, cr_071414));
235+
let b = i16x8_add_sat(y, cb_177200);
236+
237+
// Shift back and convert to u8.
238+
let zero = u8x16_splat(0);
239+
let r = u8x16_narrow_i16x8(i16x8_shr(r, SHIFT), zero);
240+
let g = u8x16_narrow_i16x8(i16x8_shr(g, SHIFT), zero);
241+
let b = u8x16_narrow_i16x8(i16x8_shr(b, SHIFT), zero);
242+
243+
// Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...
244+
245+
let rg_lanes = i8x16_shuffle::<0, 16,
246+
1, 17,
247+
2, 18,
248+
3, 19,
249+
4, 20,
250+
5, 21,
251+
6, 22,
252+
7, 23>(r, g);
253+
254+
let rgb_low = i8x16_shuffle::<0, 1, 16, // r0, g0, b0
255+
2, 3, 17, // r1, g1, b1
256+
4, 5, 18, // r2, g2, b2
257+
6, 7, 19, // r3, g3, b3
258+
8, 9, 20, // r4, g4, b4
259+
10>(rg_lanes, b); // r5
260+
261+
let rgb_hi = i8x16_shuffle::<11, 21, 12, // g5, b5, r6
262+
13, 22, 14, // g6, b6, r7
263+
15, 23, 0, // g7, b7, --
264+
0, 0, 0, // --, --, --
265+
0, 0, 0, // --, --, --
266+
0>(rg_lanes, b); // --
267+
268+
// SAFETY: i is at most `output.len() / 24 - 1` so the highest possible write is to
269+
// `output.len() - 1`.
270+
unsafe {
271+
v128_store(output.as_mut_ptr().wrapping_add(24 * i) as *mut _, rgb_low);
272+
v128_store64_lane::<0>(rgb_hi, output.as_mut_ptr().wrapping_add(24 * i + 16) as *mut _);
273+
}
274+
}
275+
276+
num_vecs * 8
277+
}

tests/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ mod crashtest;
1010
mod reftest;
1111

1212
#[test]
13+
#[cfg(all(target_family="wasm", target_os="unknown"))]
1314
#[wasm_bindgen_test::wasm_bindgen_test]
1415
fn included_file() {
1516
const FILE: &[u8] = include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/reftest/images/mozilla/jpg-progressive.jpg"));

0 commit comments

Comments
 (0)