#[cfg(target_arch = "wasm32")]
use std::arch::wasm32::*;

#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
fn idct8(data: &mut [v128; 8]) {
    // The fixed-point constants here are obtained by taking the fractional part of the constants
    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
    // i16x8_q15mulr_sat(a, b) computes a rounding variant of (a*b)>>15, namely
    // (a*b + (1<<14)) >> 15, saturated to the i16 range.

    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
    // doesn't apply any further scaling and the fixed-point constants have a different precision.

    let p2 = data[2];
    let p3 = data[6];
    let p1 = i16x8_q15mulr_sat(i16x8_add_sat(p2, p3), i16x8_splat(17734)); // 0.5411961
    let t2 = i16x8_sub_sat(
        i16x8_sub_sat(p1, p3),
        i16x8_q15mulr_sat(p3, i16x8_splat(27779)), // 0.847759065
    );
    let t3 = i16x8_add_sat(p1, i16x8_q15mulr_sat(p2, i16x8_splat(25079))); // 0.765366865

    let p2 = data[0];
    let p3 = data[4];
    let t0 = i16x8_add_sat(p2, p3);
    let t1 = i16x8_sub_sat(p2, p3);

    let x0 = i16x8_add_sat(t0, t3);
    let x3 = i16x8_sub_sat(t0, t3);
    let x1 = i16x8_add_sat(t1, t2);
    let x2 = i16x8_sub_sat(t1, t2);

    let t0 = data[7];
    let t1 = data[5];
    let t2 = data[3];
    let t3 = data[1];

    let p3 = i16x8_add_sat(t0, t2);
    let p4 = i16x8_add_sat(t1, t3);
    let p1 = i16x8_add_sat(t0, t3);
    let p2 = i16x8_add_sat(t1, t2);
    let p5 = i16x8_add_sat(p3, p4);
    let p5 = i16x8_add_sat(p5, i16x8_q15mulr_sat(p5, i16x8_splat(5763))); // 0.175875602

    let t0 = i16x8_q15mulr_sat(t0, i16x8_splat(9786)); // 0.298631336
    let t1 = i16x8_add_sat(
        i16x8_add_sat(t1, t1),
        i16x8_q15mulr_sat(t1, i16x8_splat(1741)), // 0.053119869
    );
    let t2 = i16x8_add_sat(
        i16x8_add_sat(t2, i16x8_add_sat(t2, t2)),
        i16x8_q15mulr_sat(t2, i16x8_splat(2383)), // 0.072711026
    );
    let t3 = i16x8_add_sat(t3, i16x8_q15mulr_sat(t3, i16x8_splat(16427))); // 0.501321110

    let p1 = i16x8_sub_sat(p5, i16x8_q15mulr_sat(p1, i16x8_splat(29490))); // 0.899976223
    let p2 = i16x8_sub_sat(
        i16x8_sub_sat(i16x8_sub_sat(p5, p2), p2),
        i16x8_q15mulr_sat(p2, i16x8_splat(18446)), // 0.562915447
    );

    let p3 = i16x8_sub_sat(
        i16x8_q15mulr_sat(p3, i16x8_splat(-31509)), // -0.961570560
        p3,
    );
    let p4 = i16x8_q15mulr_sat(p4, i16x8_splat(-12785)); // -0.390180644

    let t3 = i16x8_add_sat(i16x8_add_sat(p1, p4), t3);
    let t2 = i16x8_add_sat(i16x8_add_sat(p2, p3), t2);
    let t1 = i16x8_add_sat(i16x8_add_sat(p2, p4), t1);
    let t0 = i16x8_add_sat(i16x8_add_sat(p1, p3), t0);

    data[0] = i16x8_add_sat(x0, t3);
    data[7] = i16x8_sub_sat(x0, t3);
    data[1] = i16x8_add_sat(x1, t2);
    data[6] = i16x8_sub_sat(x1, t2);
    data[2] = i16x8_add_sat(x2, t1);
    data[5] = i16x8_sub_sat(x2, t1);
    data[3] = i16x8_add_sat(x3, t0);
    data[4] = i16x8_sub_sat(x3, t0);
}
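
// A minimal scalar sketch (hypothetical, not part of this file) of the Q15 arithmetic that
// idct8 relies on: each fixed-point constant above is the fractional coefficient scaled by
// 1<<15, and q15_mulr below mirrors the per-lane behaviour of i16x8_q15mulr_sat.
#[allow(dead_code)]
fn q15_mulr(a: i16, b: i16) -> i16 {
    // (a * b + 2^14) >> 15, saturated to the i16 range (saturation only triggers for
    // a == b == i16::MIN).
    let product = (i32::from(a) * i32::from(b) + (1 << 14)) >> 15;
    product.clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16
}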

#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
fn transpose8(data: &mut [v128; 8]) {
    // Transpose an 8x8 matrix with a sequence of interleaving operations.
    // Naming: dABl contains elements from the *l*ower halves of vectors A and B, interleaved:
    // A0 B0 A1 B1 ...
    // dABCDll contains elements from the lower quarter (ll) of vectors A, B, C, D, interleaved:
    // A0 B0 C0 D0 A1 B1 C1 D1 ...
    let d01l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[0], data[1]);
    let d23l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[2], data[3]);
    let d45l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[4], data[5]);
    let d67l = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(data[6], data[7]);
    let d01h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[0], data[1]);
    let d23h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[2], data[3]);
    let d45h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[4], data[5]);
    let d67h = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(data[6], data[7]);

    // Operating on 32 bits will interleave *consecutive pairs* of 16-bit integers.
    let d0123ll = i32x4_shuffle::<0, 4, 1, 5>(d01l, d23l);
    let d0123lh = i32x4_shuffle::<2, 6, 3, 7>(d01l, d23l);
    let d4567ll = i32x4_shuffle::<0, 4, 1, 5>(d45l, d67l);
    let d4567lh = i32x4_shuffle::<2, 6, 3, 7>(d45l, d67l);
    let d0123hl = i32x4_shuffle::<0, 4, 1, 5>(d01h, d23h);
    let d0123hh = i32x4_shuffle::<2, 6, 3, 7>(d01h, d23h);
    let d4567hl = i32x4_shuffle::<0, 4, 1, 5>(d45h, d67h);
    let d4567hh = i32x4_shuffle::<2, 6, 3, 7>(d45h, d67h);

    // Operating on 64 bits will interleave *consecutive quadruples* of 16-bit integers.
    data[0] = i64x2_shuffle::<0, 2>(d0123ll, d4567ll);
    data[1] = i64x2_shuffle::<1, 3>(d0123ll, d4567ll);
    data[2] = i64x2_shuffle::<0, 2>(d0123lh, d4567lh);
    data[3] = i64x2_shuffle::<1, 3>(d0123lh, d4567lh);
    data[4] = i64x2_shuffle::<0, 2>(d0123hl, d4567hl);
    data[5] = i64x2_shuffle::<1, 3>(d0123hl, d4567hl);
    data[6] = i64x2_shuffle::<0, 2>(d0123hh, d4567hh);
    data[7] = i64x2_shuffle::<1, 3>(d0123hh, d4567hh);
}
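
// A hypothetical scalar reference (an assumption for illustration, not part of this file) that
// transpose8 can be checked against: viewing each v128 as eight i16 lanes in row-major order,
// the interleave sequence above must agree with a plain element-wise 8x8 transpose.
#[allow(dead_code)]
fn transpose8_scalar(data: &mut [[i16; 8]; 8]) {
    for i in 0..8 {
        for j in (i + 1)..8 {
            // Swap each element below the diagonal with its mirror above it.
            let tmp = data[i][j];
            data[i][j] = data[j][i];
            data[j][i] = tmp;
        }
    }
}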

#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
pub fn dequantize_and_idct_block_8x8(
    coefficients: &[i16; 64],
    quantization_table: &[u16; 64],
    output_linestride: usize,
    output: &mut [u8],
) {
    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
    // for 0 <= i < 8. Thus, the last accessed position is at an offset of
    // output_linestride * 7 + 7, and if that position is in-bounds, so are all other accesses.
    assert!(
        output.len()
            > output_linestride
                .checked_mul(7)
                .unwrap()
                .checked_add(7)
                .unwrap()
    );

    const SHIFT: u32 = 3;

    // Read the DCT coefficients, scale them up and dequantize them.
    let mut data = [i16x8_splat(0); 8];
    // SAFETY: both input arrays have exactly 64 elements, so each 128-bit load below reads
    // lanes [i * 8, i * 8 + 8) and is in-bounds.
    unsafe {
        for i in 0..8 {
            data[i] = i16x8_shl(
                i16x8_mul(
                    v128_load(coefficients.as_ptr().wrapping_add(i * 8) as *const _),
                    v128_load(quantization_table.as_ptr().wrapping_add(i * 8) as *const _),
                ),
                SHIFT,
            );
        }
    }

    // Usual column IDCT - transpose - column IDCT - transpose approach.
    idct8(&mut data);
    transpose8(&mut data);
    idct8(&mut data);
    transpose8(&mut data);

    for i in 0..8 {
        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
        // increased by 3.
        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
        // We add 128 with the appropriate shift for that purpose.
        const OFFSET: i16 = 128 << (SHIFT + 3);
        // We want a rounding right shift, so we add (1/2) << (SHIFT + 3) before shifting.
        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;

        let data_with_offset = i16x8_add_sat(data[i], i16x8_splat(OFFSET + ROUNDING_BIAS));

        // SAFETY: the assert at the start of this function ensures
        // `output_linestride * i + 7` < output.len(), so all accesses are in-bounds.
        unsafe {
            v128_store64_lane::<0>(
                u8x16_narrow_i16x8(i16x8_shr(data_with_offset, SHIFT + 3), i16x8_splat(0)),
                output.as_mut_ptr().wrapping_add(output_linestride * i) as *mut _,
            );
        }
    }
}
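
// A hedged usage sketch showing how a caller might run one block through the function above.
// The all-zero coefficients, all-ones quantization table, and the stride of 8 are assumptions
// for illustration only.
#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
#[allow(dead_code)]
fn example_idct_block() {
    let coefficients = [0i16; 64];
    let quantization_table = [1u16; 64];
    // An 8x8 output plane; a stride of 8 satisfies the bounds assert (8 * 7 + 7 < 64).
    let mut output = [0u8; 64];
    dequantize_and_idct_block_8x8(&coefficients, &quantization_table, 8, &mut output);
}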

#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
pub fn color_convert_line_ycbcr(
    y_slice: &[u8],
    cb_slice: &[u8],
    cr_slice: &[u8],
    output: &mut [u8],
) -> usize {
    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y_slice.len());
    assert!(num <= cb_slice.len());
    assert!(num <= cr_slice.len());

    let num_vecs = num / 8;

    for i in 0..num_vecs {
        const SHIFT: u32 = 6;
        // Load.
        let y: v128;
        let cb: v128;
        let cr: v128;
        // SAFETY: i is at most `num / 8 - 1`, so the highest v128_load64_zero reads the 8 bytes
        // at [num_vecs * 8 - 8, num_vecs * 8), which lies within [0, num). The asserts above
        // ensure this is in-bounds for all three slices.
        unsafe {
            y = v128_load64_zero(y_slice.as_ptr().wrapping_add(i * 8) as *const _);
            cb = v128_load64_zero(cb_slice.as_ptr().wrapping_add(i * 8) as *const _);
            cr = v128_load64_zero(cr_slice.as_ptr().wrapping_add(i * 8) as *const _);
        }

        // Convert to 16 bit.
        let y = i16x8_shl(u16x8_extend_low_u8x16(y), SHIFT);
        let cb = i16x8_shl(u16x8_extend_low_u8x16(cb), SHIFT);
        let cr = i16x8_shl(u16x8_extend_low_u8x16(cr), SHIFT);

        // Add offsets: center cb/cr around zero, and add a rounding bias to y so the final
        // right shift rounds to nearest.
        let c128 = i16x8_splat(128 << SHIFT);
        let y = i16x8_add_sat(y, i16x8_splat((1 << SHIFT) >> 1));
        let cb = i16x8_sub_sat(cb, c128);
        let cr = i16x8_sub_sat(cr, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772.
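        // As in idct8, these constants are the fractional parts of the coefficients scaled by
        // 1<<15 for i16x8_q15mulr_sat; the integer part of 1.402 and 1.772 is restored by the
        // extra i16x8_add_sat(..., cr) / i16x8_add_sat(..., cb) term (e.g. 13173 is
        // 0.402 * 32768, rounded).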
        let cr_140200 = i16x8_add_sat(i16x8_q15mulr_sat(cr, i16x8_splat(13173)), cr);
        let cb_034414 = i16x8_q15mulr_sat(cb, i16x8_splat(11276));
        let cr_071414 = i16x8_q15mulr_sat(cr, i16x8_splat(23401));
        let cb_177200 = i16x8_add_sat(i16x8_q15mulr_sat(cb, i16x8_splat(25297)), cb);

        // Last conversion step.
        let r = i16x8_add_sat(y, cr_140200);
        let g = i16x8_sub_sat(y, i16x8_add_sat(cb_034414, cr_071414));
        let b = i16x8_add_sat(y, cb_177200);

        // Shift back and convert to u8.
        let zero = u8x16_splat(0);
        let r = u8x16_narrow_i16x8(i16x8_shr(r, SHIFT), zero);
        let g = u8x16_narrow_i16x8(i16x8_shr(g, SHIFT), zero);
        let b = u8x16_narrow_i16x8(i16x8_shr(b, SHIFT), zero);

        // Shuffle rrrrrrrr, gggggggg, bbbbbbbb to rgbrgbrgb...
        let rg_lanes =
            i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(r, g);

        let rgb_low = i8x16_shuffle::<
            0, 1, 16, // r0, g0, b0
            2, 3, 17, // r1, g1, b1
            4, 5, 18, // r2, g2, b2
            6, 7, 19, // r3, g3, b3
            8, 9, 20, // r4, g4, b4
            10, // r5
        >(rg_lanes, b);

        let rgb_hi = i8x16_shuffle::<
            11, 21, 12, // g5, b5, r6
            13, 22, 14, // g6, b6, r7
            15, 23, 0, // g7, b7, --
            0, 0, 0, // --, --, --
            0, 0, 0, // --, --, --
            0, // --
        >(rg_lanes, b);

        // SAFETY: i is at most `output.len() / 24 - 1`, so the two stores together write bytes
        // [24 * i, 24 * i + 24), and the highest written offset, 24 * i + 23, is at most
        // `output.len() - 1`.
        unsafe {
            v128_store(output.as_mut_ptr().wrapping_add(24 * i) as *mut _, rgb_low);
            v128_store64_lane::<0>(rgb_hi, output.as_mut_ptr().wrapping_add(24 * i + 16) as *mut _);
        }
    }

    num_vecs * 8
}
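
// The function above converts pixels in groups of 8 and returns how many it handled, so a
// caller is expected to finish any remainder separately. A hypothetical scalar tail loop (the
// name and floating-point rounding are assumptions, not part of this file) could look like:
#[allow(dead_code)]
fn color_convert_tail_scalar(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8], from: usize) {
    for i in from..output.len() / 3 {
        let (yf, cbf, crf) = (
            f32::from(y[i]),
            f32::from(cb[i]) - 128.0,
            f32::from(cr[i]) - 128.0,
        );
        // Same YCbCr -> RGB coefficients as the SIMD path, in floating point.
        output[3 * i] = (yf + 1.402 * crf).round().clamp(0.0, 255.0) as u8;
        output[3 * i + 1] = (yf - 0.34414 * cbf - 0.71414 * crf).round().clamp(0.0, 255.0) as u8;
        output[3 * i + 2] = (yf + 1.772 * cbf).round().clamp(0.0, 255.0) as u8;
    }
}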