Skip to content

Commit 5543b35

Browse files
author
wartmanm
committed
Add wasm simd support
This is largely copy-pasted from the SSSE3 implementation.
1 parent 071bca9 commit 5543b35

File tree

2 files changed

+279
-0
lines changed

2 files changed

+279
-0
lines changed

src/arch/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
mod neon;
44
mod ssse3;
5+
mod wasm;
56

67
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
78
use std::is_x86_feature_detected;
@@ -22,6 +23,10 @@ pub fn get_color_convert_line_ycbcr() -> Option<unsafe fn(&[u8], &[u8], &[u8], &
2223
{
2324
return Some(neon::color_convert_line_ycbcr);
2425
}
26+
#[cfg(all(target_feature = "simd128", target_arch = "wasm32"))]
27+
{
28+
return Some(wasm::color_convert_line_ycbcr);
29+
}
2530
#[allow(unreachable_code)]
2631
None
2732
}
@@ -41,6 +46,10 @@ pub fn get_dequantize_and_idct_block_8x8(
4146
{
4247
return Some(neon::dequantize_and_idct_block_8x8);
4348
}
49+
#[cfg(all(target_feature = "simd128", target_arch = "wasm32"))]
50+
{
51+
return Some(wasm::dequantize_and_idct_block_8x8);
52+
}
4453
#[allow(unreachable_code)]
4554
None
4655
}

src/arch/wasm.rs

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
#[cfg(target_arch = "wasm32")]
2+
use std::arch::wasm32::*;
3+
4+
#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
// Performs one pass of an 8-point 1-D inverse DCT in place. Each of the eight v128 vectors
// holds eight i16 lanes, so this computes eight independent transforms in parallel (one per
// lane), using saturating fixed-point arithmetic throughout.
fn idct8(data: &mut [v128; 8]) {
    // The fixed-point constants here are obtained by taking the fractional part of the constants
    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
    // i16x8_q15mulr_sat(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
    // slight differences in rounding).

    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
    // doesn't apply any further scaling and fixed point constants have a different precision.

    // Even-indexed inputs (rows 2 and 6): rotation stage.
    let p2 = data[2];
    let p3 = data[6];
    let p1 = i16x8_q15mulr_sat(i16x8_add_sat(p2, p3), i16x8_splat(17734)); // 0.5411961
    let t2 = i16x8_sub_sat(
        i16x8_sub_sat(p1, p3),
        i16x8_q15mulr_sat(p3, i16x8_splat(27779)), // 0.847759065
    );
    let t3 = i16x8_add_sat(p1, i16x8_q15mulr_sat(p2, i16x8_splat(25079))); // 0.765366865

    // Even-indexed inputs (rows 0 and 4): simple butterfly.
    let p2 = data[0];
    let p3 = data[4];
    let t0 = i16x8_add_sat(p2, p3);
    let t1 = i16x8_sub_sat(p2, p3);

    // Combine the even part into the four even-stage intermediates x0..x3.
    let x0 = i16x8_add_sat(t0, t3);
    let x3 = i16x8_sub_sat(t0, t3);
    let x1 = i16x8_add_sat(t1, t2);
    let x2 = i16x8_sub_sat(t1, t2);

    // Odd-indexed inputs (rows 7, 5, 3, 1).
    let t0 = data[7];
    let t1 = data[5];
    let t2 = data[3];
    let t3 = data[1];

    let p3 = i16x8_add_sat(t0, t2);
    let p4 = i16x8_add_sat(t1, t3);
    let p1 = i16x8_add_sat(t0, t3);
    let p2 = i16x8_add_sat(t1, t2);
    let p5 = i16x8_add_sat(p3, p4);
    // p5 * 1.175875602, written as p5 + p5 * 0.175875602 since the multiplier only carries
    // the fractional part of each constant.
    let p5 = i16x8_add_sat(p5, i16x8_q15mulr_sat(p5, i16x8_splat(5763))); // 0.175875602

    // Scale the odd inputs; integer parts of the original constants (e.g. the "+2t1" for
    // 2.053119869) are added explicitly with saturating adds.
    let t0 = i16x8_q15mulr_sat(t0, i16x8_splat(9786)); // 0.298631336
    let t1 = i16x8_add_sat(
        i16x8_add_sat(t1, t1),
        i16x8_q15mulr_sat(t1, i16x8_splat(1741)), // 0.053119869
    );
    let t2 = i16x8_add_sat(
        i16x8_add_sat(t2, i16x8_add_sat(t2, t2)),
        i16x8_q15mulr_sat(t2, i16x8_splat(2383)), // 0.072711026
    );
    let t3 = i16x8_add_sat(t3, i16x8_q15mulr_sat(t3, i16x8_splat(16427))); // 0.501321110

    let p1 = i16x8_sub_sat(p5, i16x8_q15mulr_sat(p1, i16x8_splat(29490))); // 0.899976223
    let p2 = i16x8_sub_sat(
        i16x8_sub_sat(i16x8_sub_sat(p5, p2), p2),
        i16x8_q15mulr_sat(p2, i16x8_splat(18446)), // 0.562915447
    );

    // Negative constants: -1.961570560 is expressed as (p3 * -0.961570560) - p3 because the
    // q15 multiplier can only hold the fractional part.
    let p3 = i16x8_sub_sat(
        i16x8_q15mulr_sat(p3, i16x8_splat(-31509)), // -0.961570560
        p3,
    );
    let p4 = i16x8_q15mulr_sat(p4, i16x8_splat(-12785)); // -0.390180644

    // Odd-stage intermediates.
    let t3 = i16x8_add_sat(i16x8_add_sat(p1, p4), t3);
    let t2 = i16x8_add_sat(i16x8_add_sat(p2, p3), t2);
    let t1 = i16x8_add_sat(i16x8_add_sat(p2, p4), t1);
    let t0 = i16x8_add_sat(i16x8_add_sat(p1, p3), t0);

    // Final butterfly: combine even (x0..x3) and odd (t0..t3) halves into the outputs.
    data[0] = i16x8_add_sat(x0, t3);
    data[7] = i16x8_sub_sat(x0, t3);
    data[1] = i16x8_add_sat(x1, t2);
    data[6] = i16x8_sub_sat(x1, t2);
    data[2] = i16x8_add_sat(x2, t1);
    data[5] = i16x8_sub_sat(x2, t1);
    data[3] = i16x8_add_sat(x3, t0);
    data[4] = i16x8_sub_sat(x3, t0);
}
83+
84+
#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
// In-place transpose of the 8x8 matrix of 16-bit values held in eight row vectors,
// performed as three rounds of interleaving, doubling the interleave width each round.
fn transpose8(data: &mut [v128; 8]) {
    // Round 1: interleave individual 16-bit lanes of adjacent row pairs.
    // `pair01_lo` holds a0 b0 a1 b1 a2 b2 a3 b3 (lower halves of rows 0 and 1);
    // `pair01_hi` holds the corresponding upper halves, and so on for the other pairs.
    let pair01_lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[0], data[1]);
    let pair01_hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[0], data[1]);
    let pair23_lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[2], data[3]);
    let pair23_hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[2], data[3]);
    let pair45_lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[4], data[5]);
    let pair45_hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[4], data[5]);
    let pair67_lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[6], data[7]);
    let pair67_hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[6], data[7]);

    // Round 2: interleave at 32-bit width, i.e. consecutive pairs of 16-bit lanes.
    // `left_q0` holds a0 b0 c0 d0 a1 b1 c1 d1 (quarter 0 of rows 0-3); `right_q0`
    // is the same quarter of rows 4-7, etc.
    let left_q0 = i32x4_shuffle::<0, 4, 1, 5>(pair01_lo, pair23_lo);
    let left_q1 = i32x4_shuffle::<2, 6, 3, 7>(pair01_lo, pair23_lo);
    let left_q2 = i32x4_shuffle::<0, 4, 1, 5>(pair01_hi, pair23_hi);
    let left_q3 = i32x4_shuffle::<2, 6, 3, 7>(pair01_hi, pair23_hi);
    let right_q0 = i32x4_shuffle::<0, 4, 1, 5>(pair45_lo, pair67_lo);
    let right_q1 = i32x4_shuffle::<2, 6, 3, 7>(pair45_lo, pair67_lo);
    let right_q2 = i32x4_shuffle::<0, 4, 1, 5>(pair45_hi, pair67_hi);
    let right_q3 = i32x4_shuffle::<2, 6, 3, 7>(pair45_hi, pair67_hi);

    // Round 3: interleave at 64-bit width (quadruples of 16-bit lanes), producing the
    // transposed rows.
    data[0] = i64x2_shuffle::<0, 2>(left_q0, right_q0);
    data[1] = i64x2_shuffle::<1, 3>(left_q0, right_q0);
    data[2] = i64x2_shuffle::<0, 2>(left_q1, right_q1);
    data[3] = i64x2_shuffle::<1, 3>(left_q1, right_q1);
    data[4] = i64x2_shuffle::<0, 2>(left_q2, right_q2);
    data[5] = i64x2_shuffle::<1, 3>(left_q2, right_q2);
    data[6] = i64x2_shuffle::<0, 2>(left_q3, right_q3);
    data[7] = i64x2_shuffle::<1, 3>(left_q3, right_q3);
}
121+
122+
#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
// Dequantizes a block of 64 DCT coefficients (multiplying each by its quantization-table
// entry), applies the 2-D inverse DCT, and writes the resulting 8x8 block of u8 samples
// into `output`, one row of 8 bytes every `output_linestride` bytes.
//
// Panics if `output` is too short to hold the last written position, or if
// `output_linestride * 7 + 7` overflows `usize`.
pub fn dequantize_and_idct_block_8x8(
    coefficients: &[i16; 64],
    quantization_table: &[u16; 64],
    output_linestride: usize,
    output: &mut [u8],
) {
    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
    // for 0<=i<8. Thus, the last accessed position is at an offset of output_linestride * 7 + 7,
    // and if that position is in-bounds, so are all other accesses.
    assert!(
        output.len()
            > output_linestride
                .checked_mul(7)
                .unwrap()
                .checked_add(7)
                .unwrap()
    );

    // Extra fixed-point precision carried through the IDCT (values are scaled by 1<<SHIFT).
    const SHIFT: u32 = 3;

    // Read the DCT coefficients, scale them up and dequantize them.
    let mut data = [i16x8_splat(0); 8];
    unsafe {
        // SAFETY: i < 8, so i * 8 + 8 <= 64 and each 8-lane (16-byte) load stays within the
        // 64-element arrays. WebAssembly v128 loads do not require alignment.
        for i in 0..8 {
            data[i] = i16x8_shl(
                i16x8_mul(
                    v128_load(coefficients.as_ptr().wrapping_add(i * 8) as *const _),
                    v128_load(quantization_table.as_ptr().wrapping_add(i * 8) as *const _),
                ),
                SHIFT,
            );
        }
    }

    // Usual column IDCT - transpose - column IDCT - transpose approach.
    idct8(&mut data);
    transpose8(&mut data);
    idct8(&mut data);
    transpose8(&mut data);

    for i in 0..8 {
        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
        // increased by 3.
        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
        // We add 128 with the appropriate shift for that purpose.
        const OFFSET: i16 = 128 << (SHIFT + 3);
        // We want rounding right shift, so we should add (1/2) << (SHIFT+3) before shifting.
        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;

        let data_with_offset = i16x8_add_sat(data[i], i16x8_splat(OFFSET + ROUNDING_BIAS));

        unsafe {
            // SAFETY: writes 8 bytes at offset output_linestride * i; the assert above
            // guarantees output_linestride * 7 + 7 < output.len(), so every write for
            // 0 <= i < 8 is in-bounds. u8x16_narrow_i16x8 saturates each lane to [0, 255];
            // only the low 8 lanes (the real data) are stored.
            v128_store64_lane::<0>(
                u8x16_narrow_i16x8(
                    i16x8_shr(data_with_offset, SHIFT + 3),
                    i16x8_splat(0),
                ),
                output.as_mut_ptr().wrapping_add(output_linestride * i) as *mut _,
            );
        }
    }
}
186+
187+
#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
// Converts a line of YCbCr samples to interleaved RGB, processing 8 pixels per iteration.
// Writes `(num / 8) * 8` pixels (3 bytes each) to `output` and returns the number of pixels
// converted; the remaining `num % 8` pixels are left untouched — presumably handled by a
// scalar fallback in the caller (TODO confirm).
pub fn color_convert_line_ycbcr(y_slice: &[u8], cb_slice: &[u8], cr_slice: &[u8], output: &mut [u8]) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y_slice.len());
    assert!(num <= cb_slice.len());
    assert!(num <= cr_slice.len());

    let num_vecs = num / 8;

    for i in 0..num_vecs {
        // Number of fractional bits carried through the fixed-point conversion.
        const SHIFT: u32 = 6;
        // Load.
        let y: v128;
        let cb: v128;
        let cr: v128;
        unsafe {
            // SAFETY: i < num_vecs = num / 8, so i * 8 + 8 <= num, and the asserts above
            // guarantee num <= len for all three slices; each load reads exactly 8 bytes
            // (into the low 64 bits, zeroing the rest).
            y = v128_load64_zero(y_slice.as_ptr().wrapping_add(i * 8) as *const _);
            cb = v128_load64_zero(cb_slice.as_ptr().wrapping_add(i * 8) as *const _);
            cr = v128_load64_zero(cr_slice.as_ptr().wrapping_add(i * 8) as *const _);
        }

        // Convert to 16 bit.
        let y = i16x8_shl(i16x8_extend_low_u8x16(y), SHIFT);
        let cb = i16x8_shl(i16x8_extend_low_u8x16(cb), SHIFT);
        let cr = i16x8_shl(i16x8_extend_low_u8x16(cr), SHIFT);

        // Add offsets: recenter Cb/Cr around zero, and pre-add half a unit to Y so that the
        // final right shift by SHIFT rounds instead of truncating.
        let c128 = i16x8_splat(128 << SHIFT);
        let y = i16x8_add_sat(y, i16x8_splat((1 << SHIFT) >> 1));
        let cb = i16x8_sub_sat(cb, c128);
        let cr = i16x8_sub_sat(cr, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772.
        // i16x8_q15mulr_sat multiplies by constant/(1<<15), so integer parts of the
        // coefficients (the "+ cr" / "+ cb" terms) are added separately.
        let cr_140200 = i16x8_add_sat(i16x8_q15mulr_sat(cr, i16x8_splat(13173)), cr);
        let cb_034414 = i16x8_q15mulr_sat(cb, i16x8_splat(11276));
        let cr_071414 = i16x8_q15mulr_sat(cr, i16x8_splat(23401));
        let cb_177200 = i16x8_add_sat(i16x8_q15mulr_sat(cb, i16x8_splat(25297)), cb);

        // Last conversion step.
        let r = i16x8_add_sat(y, cr_140200);
        let g = i16x8_sub_sat(y, i16x8_add_sat(cb_034414, cr_071414));
        let b = i16x8_add_sat(y, cb_177200);

        // Shift back and convert to u8 (u8x16_narrow_i16x8 saturates each lane to [0, 255]).
        let zero = u8x16_splat(0);
        let r = u8x16_narrow_i16x8(i16x8_shr(r, SHIFT), zero);
        let g = u8x16_narrow_i16x8(i16x8_shr(g, SHIFT), zero);
        let b = u8x16_narrow_i16x8(i16x8_shr(b, SHIFT), zero);

        // Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...

        // First interleave R and G: r0 g0 r1 g1 ... r7 g7.
        let rg_lanes = i8x16_shuffle::<0, 16,
                                       1, 17,
                                       2, 18,
                                       3, 19,
                                       4, 20,
                                       5, 21,
                                       6, 22,
                                       7, 23>(r, g);

        // First 16 output bytes: pixels 0-4 plus r5.
        let rgb_low = i8x16_shuffle::<0, 1, 16, // r0, g0, b0
                                      2, 3, 17, // r1, g1, b1
                                      4, 5, 18, // r2, g2, b2
                                      6, 7, 19, // r3, g3, b3
                                      8, 9, 20, // r4, g4, b4
                                      10>(rg_lanes, b); // r5

        // Next 8 output bytes: the rest of pixels 5-7 (upper lanes are never stored).
        let rgb_hi = i8x16_shuffle::<11, 21, 12, // g5, b5, r6
                                     13, 22, 14, // g6, b6, r7
                                     15, 23, 0, // g7, b7, --
                                     0, 0, 0, // --, --, --
                                     0, 0, 0, // --, --, --
                                     0>(rg_lanes, b); // --

        unsafe {
            // SAFETY: writes 16 + 8 = 24 bytes at offset 24 * i, ending at 24 * (i + 1)
            // <= 24 * num_vecs <= 3 * num = output.len(), so both stores are in-bounds.
            v128_store(output.as_mut_ptr().wrapping_add(24 * i) as *mut _, rgb_low);
            v128_store64_lane::<0>(rgb_hi, output.as_mut_ptr().wrapping_add(24 * i + 16) as *mut _);
        }
    }

    // Number of pixels actually converted (a multiple of 8).
    num_vecs * 8
}

0 commit comments

Comments
 (0)