Skip to content

Commit 18350e0

Browse files
author
HeroicKatora
authored
Merge pull request #221 from veluca93/master
Use NEON instructions on aarch64.
2 parents 3158a1c + e1c3d88 commit 18350e0

File tree

5 files changed

+243
-3
lines changed

5 files changed

+243
-3
lines changed

.github/workflows/rust.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ jobs:
4444
strategy:
4545
matrix:
4646
rust: [nightly]
47-
features: ["", "rayon"]
47+
features: ["", "rayon", "nightly_aarch64_neon"]
4848

4949
steps:
5050
- name: Installing emulator and linker

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,5 @@ harness = false
2929
[features]
3030
default = ["rayon"]
3131
platform_independent = []
32+
nightly_aarch64_neon = []
33+

src/arch/mod.rs

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#![allow(unsafe_code)]
22

3+
mod neon;
34
mod ssse3;
45

56
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
@@ -16,18 +17,30 @@ pub fn get_color_convert_line_ycbcr() -> Option<unsafe fn(&[u8], &[u8], &[u8], &
1617
return Some(ssse3::color_convert_line_ycbcr);
1718
}
1819
}
20+
// Runtime detection is not needed on aarch64.
21+
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
22+
{
23+
return Some(neon::color_convert_line_ycbcr);
24+
}
25+
#[allow(unreachable_code)]
1926
None
2027
}
2128

2229
/// Arch-specific implementation of 8x8 IDCT.
23-
pub fn get_dequantize_and_idct_block_8x8() -> Option<unsafe fn(&[i16; 64], &[u16; 64], usize, &mut [u8])>
24-
{
30+
pub fn get_dequantize_and_idct_block_8x8(
31+
) -> Option<unsafe fn(&[i16; 64], &[u16; 64], usize, &mut [u8])> {
2532
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
2633
#[allow(unsafe_code)]
2734
{
2835
if is_x86_feature_detected!("ssse3") {
2936
return Some(ssse3::dequantize_and_idct_block_8x8);
3037
}
3138
}
39+
// Runtime detection is not needed on aarch64.
40+
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
41+
{
42+
return Some(neon::dequantize_and_idct_block_8x8);
43+
}
44+
#[allow(unreachable_code)]
3245
None
3346
}

src/arch/neon.rs

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
2+
use core::arch::aarch64::*;
3+
4+
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
5+
#[target_feature(enable = "neon")]
6+
unsafe fn idct8(data: &mut [int16x8_t; 8]) {
7+
// The fixed-point constants here are obtained by taking the fractional part of the constants
8+
// from the non-SIMD implementation and scaling them up by 1<<15. This is because
9+
// vqrdmulhq_n_s16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
10+
// slight differences in rounding).
11+
12+
// The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
13+
// doesn't apply any further scaling and fixed point constants have a different precision.
14+
15+
let p2 = data[2];
16+
let p3 = data[6];
17+
let p1 = vqrdmulhq_n_s16(vqaddq_s16(p2, p3), 17734); // 0.5411961
18+
let t2 = vqsubq_s16(
19+
vqsubq_s16(p1, p3),
20+
vqrdmulhq_n_s16(p3, 27779), // 0.847759065
21+
);
22+
let t3 = vqaddq_s16(p1, vqrdmulhq_n_s16(p2, 25079)); // 0.765366865
23+
24+
let p2 = data[0];
25+
let p3 = data[4];
26+
let t0 = vqaddq_s16(p2, p3);
27+
let t1 = vqsubq_s16(p2, p3);
28+
29+
let x0 = vqaddq_s16(t0, t3);
30+
let x3 = vqsubq_s16(t0, t3);
31+
let x1 = vqaddq_s16(t1, t2);
32+
let x2 = vqsubq_s16(t1, t2);
33+
34+
let t0 = data[7];
35+
let t1 = data[5];
36+
let t2 = data[3];
37+
let t3 = data[1];
38+
39+
let p3 = vqaddq_s16(t0, t2);
40+
let p4 = vqaddq_s16(t1, t3);
41+
let p1 = vqaddq_s16(t0, t3);
42+
let p2 = vqaddq_s16(t1, t2);
43+
let p5 = vqaddq_s16(p3, p4);
44+
let p5 = vqaddq_s16(p5, vqrdmulhq_n_s16(p5, 5763)); // 0.175875602
45+
46+
let t0 = vqrdmulhq_n_s16(t0, 9786); // 0.298631336
47+
let t1 = vqaddq_s16(
48+
vqaddq_s16(t1, t1),
49+
vqrdmulhq_n_s16(t1, 1741), // 0.053119869
50+
);
51+
let t2 = vqaddq_s16(
52+
vqaddq_s16(t2, vqaddq_s16(t2, t2)),
53+
vqrdmulhq_n_s16(t2, 2383), // 0.072711026
54+
);
55+
let t3 = vqaddq_s16(t3, vqrdmulhq_n_s16(t3, 16427)); // 0.501321110
56+
57+
let p1 = vqsubq_s16(p5, vqrdmulhq_n_s16(p1, 29490)); // 0.899976223
58+
let p2 = vqsubq_s16(
59+
vqsubq_s16(vqsubq_s16(p5, p2), p2),
60+
vqrdmulhq_n_s16(p2, 18446), // 0.562915447
61+
);
62+
63+
let p3 = vqsubq_s16(
64+
vqrdmulhq_n_s16(p3, -31509), // -0.961570560
65+
p3,
66+
);
67+
let p4 = vqrdmulhq_n_s16(p4, -12785); // -0.390180644
68+
69+
let t3 = vqaddq_s16(vqaddq_s16(p1, p4), t3);
70+
let t2 = vqaddq_s16(vqaddq_s16(p2, p3), t2);
71+
let t1 = vqaddq_s16(vqaddq_s16(p2, p4), t1);
72+
let t0 = vqaddq_s16(vqaddq_s16(p1, p3), t0);
73+
74+
data[0] = vqaddq_s16(x0, t3);
75+
data[7] = vqsubq_s16(x0, t3);
76+
data[1] = vqaddq_s16(x1, t2);
77+
data[6] = vqsubq_s16(x1, t2);
78+
data[2] = vqaddq_s16(x2, t1);
79+
data[5] = vqsubq_s16(x2, t1);
80+
data[3] = vqaddq_s16(x3, t0);
81+
data[4] = vqsubq_s16(x3, t0);
82+
}
83+
84+
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
85+
#[target_feature(enable = "neon")]
86+
unsafe fn transpose8(data: &mut [int16x8_t; 8]) {
87+
// Use NEON's 2x2 matrix transposes (vtrn) to do the transposition in each 4x4 block, then
88+
// combine the 4x4 blocks.
89+
let a01 = vtrnq_s16(data[0], data[1]);
90+
let a23 = vtrnq_s16(data[2], data[3]);
91+
92+
let four0 = vtrnq_s32(vreinterpretq_s32_s16(a01.0), vreinterpretq_s32_s16(a23.0));
93+
let four1 = vtrnq_s32(vreinterpretq_s32_s16(a01.1), vreinterpretq_s32_s16(a23.1));
94+
95+
let a45 = vtrnq_s16(data[4], data[5]);
96+
let a67 = vtrnq_s16(data[6], data[7]);
97+
98+
let four2 = vtrnq_s32(vreinterpretq_s32_s16(a45.0), vreinterpretq_s32_s16(a67.0));
99+
let four3 = vtrnq_s32(vreinterpretq_s32_s16(a45.1), vreinterpretq_s32_s16(a67.1));
100+
101+
data[0] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four0.0), vget_low_s32(four2.0)));
102+
data[1] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four1.0), vget_low_s32(four3.0)));
103+
data[2] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four0.1), vget_low_s32(four2.1)));
104+
data[3] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four1.1), vget_low_s32(four3.1)));
105+
data[4] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four0.0), vget_high_s32(four2.0)));
106+
data[5] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four1.0), vget_high_s32(four3.0)));
107+
data[6] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four0.1), vget_high_s32(four2.1)));
108+
data[7] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four1.1), vget_high_s32(four3.1)));
109+
}
110+
111+
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
112+
#[target_feature(enable = "neon")]
113+
pub unsafe fn dequantize_and_idct_block_8x8(
114+
coefficients: &[i16; 64],
115+
quantization_table: &[u16; 64],
116+
output_linestride: usize,
117+
output: &mut [u8],
118+
) {
119+
// The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
120+
// for 0<=i<8. Thus, the last accessed position is at an offset of output_linestrade * 7 + 7,
121+
// and if that position is in-bounds, so are all other accesses.
122+
assert!(
123+
output.len()
124+
> output_linestride
125+
.checked_mul(7)
126+
.unwrap()
127+
.checked_add(7)
128+
.unwrap()
129+
);
130+
131+
const SHIFT: i32 = 3;
132+
133+
// Read the DCT coefficients, scale them up and dequantize them.
134+
let mut data = [vdupq_n_s16(0); 8];
135+
for i in 0..8 {
136+
data[i] = vshlq_n_s16(
137+
vmulq_s16(
138+
vld1q_s16(coefficients.as_ptr().wrapping_add(i * 8)),
139+
vreinterpretq_s16_u16(vld1q_u16(quantization_table.as_ptr().wrapping_add(i * 8))),
140+
),
141+
SHIFT,
142+
);
143+
}
144+
145+
// Usual column IDCT - transpose - column IDCT - transpose approach.
146+
idct8(&mut data);
147+
transpose8(&mut data);
148+
idct8(&mut data);
149+
transpose8(&mut data);
150+
151+
for i in 0..8 {
152+
// The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
153+
// increased by 3.
154+
// As values will be stored in a u8, they need to be 128-centered and not 0-centered.
155+
// We add 128 with the appropriate shift for that purpose.
156+
const OFFSET: i16 = 128 << (SHIFT + 3);
157+
// We want rounding right shift, so we should add (1/2) << (SHIFT+3) before shifting.
158+
const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;
159+
160+
let data_with_offset = vqaddq_s16(data[i], vdupq_n_s16(OFFSET + ROUNDING_BIAS));
161+
162+
vst1_u8(
163+
output.as_mut_ptr().wrapping_add(output_linestride * i),
164+
vqshrun_n_s16(data_with_offset, SHIFT + 3),
165+
);
166+
}
167+
}
168+
169+
#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
170+
#[target_feature(enable = "neon")]
171+
pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
172+
assert!(output.len() % 3 == 0);
173+
let num = output.len() / 3;
174+
assert!(num <= y.len());
175+
assert!(num <= cb.len());
176+
assert!(num <= cr.len());
177+
let num_vecs = num / 8;
178+
179+
for i in 0..num_vecs {
180+
const SHIFT: i32 = 6;
181+
// Load.
182+
let y = vld1_u8(y.as_ptr().wrapping_add(i * 8));
183+
let cb = vld1_u8(cb.as_ptr().wrapping_add(i * 8));
184+
let cr = vld1_u8(cr.as_ptr().wrapping_add(i * 8));
185+
186+
// Convert to 16 bit and shift.
187+
let y = vreinterpretq_s16_u16(vshll_n_u8(y, SHIFT));
188+
let cb = vreinterpretq_s16_u16(vshll_n_u8(cb, SHIFT));
189+
let cr = vreinterpretq_s16_u16(vshll_n_u8(cr, SHIFT));
190+
191+
// Add offsets
192+
let y = vqaddq_s16(y, vdupq_n_s16((1 << SHIFT) >> 1));
193+
let c128 = vdupq_n_s16(128 << SHIFT);
194+
let cb = vqsubq_s16(cb, c128);
195+
let cr = vqsubq_s16(cr, c128);
196+
197+
// Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
198+
let cr_140200 = vqaddq_s16(vqrdmulhq_n_s16(cr, 13173), cr);
199+
let cb_034414 = vqrdmulhq_n_s16(cb, 11276);
200+
let cr_071414 = vqrdmulhq_n_s16(cr, 23401);
201+
let cb_177200 = vqaddq_s16(vqrdmulhq_n_s16(cb, 25297), cb);
202+
203+
// Last conversion step.
204+
let r = vqaddq_s16(y, cr_140200);
205+
let g = vqsubq_s16(y, vqaddq_s16(cb_034414, cr_071414));
206+
let b = vqaddq_s16(y, cb_177200);
207+
208+
// Shift back and convert to u8.
209+
let r = vqshrun_n_s16(r, SHIFT);
210+
let g = vqshrun_n_s16(g, SHIFT);
211+
let b = vqshrun_n_s16(b, SHIFT);
212+
213+
// Shuffle + store.
214+
vst3_u8(
215+
output.as_mut_ptr().wrapping_add(24 * i),
216+
uint8x8x3_t(r, g, b),
217+
);
218+
}
219+
220+
num_vecs * 8
221+
}

src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@
2929
#![deny(missing_docs)]
3030
#![deny(unsafe_code)]
3131
#![cfg_attr(feature = "platform_independent", forbid(unsafe_code))]
32+
#![cfg_attr(
33+
all(feature = "nightly_aarch64_neon", target_arch = "aarch64"),
34+
feature(aarch64_target_feature)
35+
)]
3236

3337
extern crate alloc;
3438
extern crate core;

0 commit comments

Comments
 (0)