@@ -181,20 +181,25 @@ pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &
181
181
assert ! ( num <= y. len( ) ) ;
182
182
assert ! ( num <= cb. len( ) ) ;
183
183
assert ! ( num <= cr. len( ) ) ;
184
- let num_vecs = num / 8 ;
184
+ // _mm_loadu_si64 behaves inconsistently between stable and beta on whether it loads into the
185
+ // low or the upper half of the vector. To work around this, we use _mm_loadu_si128 and skip
186
+ // the last vector so that no 16-byte load reads past the end of the input slices.
187
+ let num_vecs = ( num / 8 ) . saturating_sub ( 1 ) ;
185
188
186
189
for i in 0 ..num_vecs {
187
190
const SHIFT : i32 = 6 ;
188
191
// Load.
189
- let y = _mm_loadu_si64 ( y. as_ptr ( ) . wrapping_add ( i * 8 ) as * const _ ) ;
190
- let cb = _mm_loadu_si64 ( cb. as_ptr ( ) . wrapping_add ( i * 8 ) as * const _ ) ;
191
- let cr = _mm_loadu_si64 ( cr. as_ptr ( ) . wrapping_add ( i * 8 ) as * const _ ) ;
192
+ let y = _mm_loadu_si128 ( y. as_ptr ( ) . wrapping_add ( i * 8 ) as * const _ ) ;
193
+ let cb = _mm_loadu_si128 ( cb. as_ptr ( ) . wrapping_add ( i * 8 ) as * const _ ) ;
194
+ let cr = _mm_loadu_si128 ( cr. as_ptr ( ) . wrapping_add ( i * 8 ) as * const _ ) ;
192
195
193
196
// Convert to 16 bit.
194
- let zero = _mm_setzero_si128 ( ) ;
195
- let y = _mm_slli_epi16 ( _mm_unpackhi_epi8 ( y, zero) , SHIFT ) ;
196
- let cb = _mm_slli_epi16 ( _mm_unpackhi_epi8 ( cb, zero) , SHIFT ) ;
197
- let cr = _mm_slli_epi16 ( _mm_unpackhi_epi8 ( cr, zero) , SHIFT ) ;
197
+ let shuf16 = _mm_setr_epi8 (
198
+ 0 , -0x7F , 1 , -0x7F , 2 , -0x7F , 3 , -0x7F , 4 , -0x7F , 5 , -0x7F , 6 , -0x7F , 7 , -0x7F ,
199
+ ) ;
200
+ let y = _mm_slli_epi16 ( _mm_shuffle_epi8 ( y, shuf16) , SHIFT ) ;
201
+ let cb = _mm_slli_epi16 ( _mm_shuffle_epi8 ( cb, shuf16) , SHIFT ) ;
202
+ let cr = _mm_slli_epi16 ( _mm_shuffle_epi8 ( cr, shuf16) , SHIFT ) ;
198
203
199
204
// Add offsets
200
205
let c128 = _mm_set1_epi16 ( 128 << SHIFT ) ;
@@ -214,22 +219,18 @@ pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &
214
219
let b = _mm_adds_epi16 ( y, cb_177200) ;
215
220
216
221
// Shift back and convert to u8.
222
+ let zero = _mm_setzero_si128 ( ) ;
217
223
let r = _mm_packus_epi16 ( _mm_srai_epi16 ( r, SHIFT ) , zero) ;
218
224
let g = _mm_packus_epi16 ( _mm_srai_epi16 ( g, SHIFT ) , zero) ;
219
225
let b = _mm_packus_epi16 ( _mm_srai_epi16 ( b, SHIFT ) , zero) ;
220
226
221
227
// Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...
222
- let shufr = _mm_loadu_si128 (
223
- [
224
- 0u8 , 0x80 , 0x80 , 1 , 0x80 , 0x80 , 2 , 0x80 , 0x80 , 3 , 0x80 , 0x80 , 4 , 0x80 , 0x80 , 5 ,
225
- ]
226
- . as_ptr ( ) as * const _ ,
228
+ let shufr = _mm_setr_epi8 (
229
+ 0 , -0x7F , -0x7F , 1 , -0x7F , -0x7F , 2 , -0x7F , -0x7F , 3 , -0x7F , -0x7F , 4 , -0x7F , -0x7F , 5 ,
227
230
) ;
228
- let shufg = _mm_loadu_si128 (
229
- [
230
- 0x80u8 , 0 , 0x80 , 0x80 , 1 , 0x80 , 0x80 , 2 , 0x80 , 0x80 , 3 , 0x80 , 0x80 , 4 , 0x80 , 0x80 ,
231
- ]
232
- . as_ptr ( ) as * const _ ,
231
+ let shufg = _mm_setr_epi8 (
232
+ -0x7F , 0 , -0x7F , -0x7F , 1 , -0x7F , -0x7F , 2 , -0x7F , -0x7F , 3 , -0x7F , -0x7F , 4 , -0x7F ,
233
+ -0x7F ,
233
234
) ;
234
235
let shufb = _mm_alignr_epi8 ( shufg, shufg, 15 ) ;
235
236
0 commit comments