Skip to content

Commit 80cc0be

Browse files
committed
Fix ssse3 code on beta+.
1 parent 5db115c commit 80cc0be

File tree

1 file changed

+19
-18
lines changed

1 file changed

+19
-18
lines changed

src/arch/ssse3.rs

Lines changed: 19 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -181,20 +181,25 @@ pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &
181181
assert!(num <= y.len());
182182
assert!(num <= cb.len());
183183
assert!(num <= cr.len());
184-
let num_vecs = num / 8;
184+
// _mm_loadu_si64 seems to be confused between stable and beta on whether it loads into the low
185+
// or the upper half of the vector. To circumvent the issue, we use _mm_loadu_si128, and skip
186+
// one vector to avoid loads from outside accessible memory.
187+
let num_vecs = (num / 8).saturating_sub(1);
185188

186189
for i in 0..num_vecs {
187190
const SHIFT: i32 = 6;
188191
// Load.
189-
let y = _mm_loadu_si64(y.as_ptr().wrapping_add(i * 8) as *const _);
190-
let cb = _mm_loadu_si64(cb.as_ptr().wrapping_add(i * 8) as *const _);
191-
let cr = _mm_loadu_si64(cr.as_ptr().wrapping_add(i * 8) as *const _);
192+
let y = _mm_loadu_si128(y.as_ptr().wrapping_add(i * 8) as *const _);
193+
let cb = _mm_loadu_si128(cb.as_ptr().wrapping_add(i * 8) as *const _);
194+
let cr = _mm_loadu_si128(cr.as_ptr().wrapping_add(i * 8) as *const _);
192195

193196
// Convert to 16 bit.
194-
let zero = _mm_setzero_si128();
195-
let y = _mm_slli_epi16(_mm_unpackhi_epi8(y, zero), SHIFT);
196-
let cb = _mm_slli_epi16(_mm_unpackhi_epi8(cb, zero), SHIFT);
197-
let cr = _mm_slli_epi16(_mm_unpackhi_epi8(cr, zero), SHIFT);
197+
let shuf16 = _mm_setr_epi8(
198+
0, -0x7F, 1, -0x7F, 2, -0x7F, 3, -0x7F, 4, -0x7F, 5, -0x7F, 6, -0x7F, 7, -0x7F,
199+
);
200+
let y = _mm_slli_epi16(_mm_shuffle_epi8(y, shuf16), SHIFT);
201+
let cb = _mm_slli_epi16(_mm_shuffle_epi8(cb, shuf16), SHIFT);
202+
let cr = _mm_slli_epi16(_mm_shuffle_epi8(cr, shuf16), SHIFT);
198203

199204
// Add offsets
200205
let c128 = _mm_set1_epi16(128 << SHIFT);
@@ -214,22 +219,18 @@ pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &
214219
let b = _mm_adds_epi16(y, cb_177200);
215220

216221
// Shift back and convert to u8.
222+
let zero = _mm_setzero_si128();
217223
let r = _mm_packus_epi16(_mm_srai_epi16(r, SHIFT), zero);
218224
let g = _mm_packus_epi16(_mm_srai_epi16(g, SHIFT), zero);
219225
let b = _mm_packus_epi16(_mm_srai_epi16(b, SHIFT), zero);
220226

221227
// Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...
222-
let shufr = _mm_loadu_si128(
223-
[
224-
0u8, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80, 5,
225-
]
226-
.as_ptr() as *const _,
228+
let shufr = _mm_setr_epi8(
229+
0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F, -0x7F, 5,
227230
);
228-
let shufg = _mm_loadu_si128(
229-
[
230-
0x80u8, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80,
231-
]
232-
.as_ptr() as *const _,
231+
let shufg = _mm_setr_epi8(
232+
-0x7F, 0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F,
233+
-0x7F,
233234
);
234235
let shufb = _mm_alignr_epi8(shufg, shufg, 15);
235236

0 commit comments

Comments
 (0)