@@ -16,6 +16,7 @@ use parser::{
16
16
} ;
17
17
use std:: convert:: TryInto ;
18
18
use std:: io:: Read ;
19
+ use std:: is_x86_feature_detected;
19
20
use upsampler:: Upsampler ;
20
21
use worker:: { PlatformWorker , RowData , Worker } ;
21
22
@@ -1253,15 +1254,124 @@ fn color_convert_line_rgb(data: &[Vec<u8>], output: &mut [u8]) {
1253
1254
}
1254
1255
}
1255
1256
1257
/// Converts the leading YCbCr pixels of a line to interleaved RGB using SSSE3.
///
/// Pixels are processed in groups of eight with 16-bit fixed-point arithmetic
/// (six fractional bits). Only whole groups are converted; the trailing
/// `num % 8` pixels are left untouched so the caller can finish them with the
/// scalar path.
///
/// * `y`, `cb`, `cr` - one sample per pixel for each component; each must hold
///   at least `output.len() / 3` samples.
/// * `output` - destination for interleaved `r, g, b` bytes; its length must be
///   a multiple of three.
///
/// Returns the number of pixels that were converted (a multiple of eight).
///
/// # Panics
///
/// Panics if `output.len()` is not a multiple of three or if any component
/// slice is shorter than `output.len() / 3`.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSSE3, e.g. by checking
/// `is_x86_feature_detected!("ssse3")` first.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "ssse3")]
#[allow(unsafe_code)] // SIMD intrinsics are only reachable through `unsafe`.
unsafe fn color_convert_line_ycbcr_ssse3(
    y: &[u8],
    cb: &[u8],
    cr: &[u8],
    output: &mut [u8],
) -> usize {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    /// Number of fractional bits used by the fixed-point arithmetic.
    const SHIFT: i32 = 6;
    /// Pixels handled per loop iteration.
    const PIXELS: usize = 8;
    /// `pshufb` selector with the high bit set: writes a zero byte.
    const Z: i8 = -128;

    assert!(output.len() % 3 == 0);
    let num = output.len() / 3;
    assert!(num <= y.len());
    assert!(num <= cb.len());
    assert!(num <= cr.len());
    let num_vecs = num / PIXELS;

    // Loop-invariant constants.
    let zero = _mm_setzero_si128();
    let c128 = _mm_set1_epi16(128 << SHIFT);
    let rounding = _mm_set1_epi16((1 << SHIFT) >> 1);
    // Q15 multipliers; factors above 1.0 are expressed as `x + x * frac`.
    let k_cr_r = _mm_set1_epi16(13173); // 0.402   (1.402 = 1 + 0.402)
    let k_cb_g = _mm_set1_epi16(11276); // 0.34414
    let k_cr_g = _mm_set1_epi16(23401); // 0.71414
    let k_cb_b = _mm_set1_epi16(25297); // 0.772   (1.772 = 1 + 0.772)

    // Shuffle masks turning rrrrrrrr / gggggggg / bbbbbbbb into rgbrgbrgb...
    // The first three masks build output bytes 0..16, the `*1` masks build
    // output bytes 16..32 (of which only the first eight are used).
    let shufr = _mm_setr_epi8(0, Z, Z, 1, Z, Z, 2, Z, Z, 3, Z, Z, 4, Z, Z, 5);
    let shufg = _mm_setr_epi8(Z, 0, Z, Z, 1, Z, Z, 2, Z, Z, 3, Z, Z, 4, Z, Z);
    // Rotate the green mask by one byte to obtain the blue mask.
    let shufb = _mm_alignr_epi8(shufg, shufg, 15);
    // Adding a small offset keeps the high "zero" bit set on the `Z` lanes.
    let shufr1 = _mm_add_epi8(shufb, _mm_set1_epi8(6));
    let shufg1 = _mm_add_epi8(shufr, _mm_set1_epi8(5));
    let shufb1 = _mm_add_epi8(shufg, _mm_set1_epi8(5));

    for i in 0..num_vecs {
        let start = i * PIXELS;

        // Load eight samples of each component into the LOW half of a vector.
        // SAFETY: each sub-slice is bounds-checked to be exactly eight bytes
        // long, and `_mm_loadl_epi64` reads eight bytes with no alignment
        // requirement.
        let y_v = _mm_loadl_epi64(y[start..start + PIXELS].as_ptr() as *const __m128i);
        let cb_v = _mm_loadl_epi64(cb[start..start + PIXELS].as_ptr() as *const __m128i);
        let cr_v = _mm_loadl_epi64(cr[start..start + PIXELS].as_ptr() as *const __m128i);

        // Widen to 16 bit. The samples live in the low 64 bits, so the low
        // unpack must be used; the high unpack would only see zeros.
        let y_v = _mm_slli_epi16(_mm_unpacklo_epi8(y_v, zero), SHIFT);
        let cb_v = _mm_slli_epi16(_mm_unpacklo_epi8(cb_v, zero), SHIFT);
        let cr_v = _mm_slli_epi16(_mm_unpacklo_epi8(cr_v, zero), SHIFT);

        // Add the rounding term to luma and centre chroma around zero.
        let y_v = _mm_adds_epi16(y_v, rounding);
        let cb_v = _mm_subs_epi16(cb_v, c128);
        let cr_v = _mm_subs_epi16(cr_v, c128);

        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772.
        let cr_140200 = _mm_adds_epi16(_mm_mulhrs_epi16(cr_v, k_cr_r), cr_v);
        let cb_034414 = _mm_mulhrs_epi16(cb_v, k_cb_g);
        let cr_071414 = _mm_mulhrs_epi16(cr_v, k_cr_g);
        let cb_177200 = _mm_adds_epi16(_mm_mulhrs_epi16(cb_v, k_cb_b), cb_v);

        // Last conversion step.
        let r = _mm_adds_epi16(y_v, cr_140200);
        let g = _mm_subs_epi16(y_v, _mm_adds_epi16(cb_034414, cr_071414));
        let b = _mm_adds_epi16(y_v, cb_177200);

        // Shift back and saturate to u8; results occupy the low eight bytes.
        let r = _mm_packus_epi16(_mm_srai_epi16(r, SHIFT), zero);
        let g = _mm_packus_epi16(_mm_srai_epi16(g, SHIFT), zero);
        let b = _mm_packus_epi16(_mm_srai_epi16(b, SHIFT), zero);

        // Interleave the three planes.
        let rgb_low = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg), _mm_shuffle_epi8(b, shufb)),
        );
        let rgb_hi = _mm_or_si128(
            _mm_shuffle_epi8(r, shufr1),
            _mm_or_si128(_mm_shuffle_epi8(g, shufg1), _mm_shuffle_epi8(b, shufb1)),
        );

        // Spill both vectors to a scratch buffer and copy the 24 meaningful
        // bytes so that nothing is written past the current pixel group.
        let mut data = [0u8; 32];
        // SAFETY: `data` is 32 bytes long, so both unaligned 16-byte stores
        // (at offsets 0 and 16) stay inside the buffer.
        _mm_storeu_si128(data.as_mut_ptr() as *mut __m128i, rgb_low);
        _mm_storeu_si128(data.as_mut_ptr().add(16) as *mut __m128i, rgb_hi);
        output[24 * i..24 * i + 24].copy_from_slice(&data[..24]);
    }

    num_vecs * PIXELS
}
1354
+
1256
1355
fn color_convert_line_ycbcr ( data : & [ Vec < u8 > ] , output : & mut [ u8 ] ) {
1257
1356
assert ! ( data. len( ) == 3 , "wrong number of components for ycbcr" ) ;
1258
1357
let [ y, cb, cr] : & [ _ ; 3 ] = data. try_into ( ) . unwrap ( ) ;
1259
1358
1359
+ let mut skip = 0usize ;
1360
+
1361
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
1362
+ #[ allow( unsafe_code) ]
1363
+ {
1364
+ if is_x86_feature_detected ! ( "ssse3" ) {
1365
+ skip = unsafe { color_convert_line_ycbcr_ssse3 ( y, cb, cr, output) } ;
1366
+ }
1367
+ }
1368
+
1260
1369
for ( ( ( chunk, y) , cb) , cr) in output
1261
1370
. chunks_exact_mut ( 3 )
1262
1371
. zip ( y. iter ( ) )
1263
1372
. zip ( cb. iter ( ) )
1264
1373
. zip ( cr. iter ( ) )
1374
+ . skip ( skip)
1265
1375
{
1266
1376
let ( r, g, b) = ycbcr_to_rgb ( * y, * cb, * cr) ;
1267
1377
chunk[ 0 ] = r;
0 commit comments