10
10
#![ allow( clippy:: identity_op) ]
11
11
use crate :: parser:: Dimensions ;
12
12
use core:: { convert:: TryFrom , num:: Wrapping } ;
13
+ use std:: is_x86_feature_detected;
13
14
14
15
pub ( crate ) fn choose_idct_size ( full_size : Dimensions , requested_size : Dimensions ) -> usize {
15
16
fn scaled ( len : u16 , scale : usize ) -> u16 {
@@ -238,12 +239,183 @@ pub(crate) fn dequantize_and_idct_block(
238
239
}
239
240
}
240
241
242
// One-dimensional 8-point inverse DCT applied to eight `__m128i` registers at once.
//
// `$data` must be an indexable, assignable expression holding eight `__m128i`
// values (it is evaluated once per access); element `k` carries the k-th input
// coefficient for eight independent 16-bit lanes, and is overwritten with the
// k-th output sample.
//
// All arithmetic is 16-bit saturating. `_mm_mulhrs_epi16(v, c)` multiplies by
// `c / 32768` with rounding, so every constant below is a Q15 fraction (the
// trailing comment gives its decimal value). Factors with magnitude above one
// are built by adding or subtracting whole copies of the operand around the
// fractional product. The operation order is significant because saturating
// addition is not associative.
//
// The macro expands to statements that call SSE2/SSSE3 intrinsics by their bare
// names, so it has to be invoked where those intrinsics are in scope and the
// `ssse3` target feature is enabled.
macro_rules! idct8_ssse3 {
    ($data:expr) => {
        // ---- Even half: inputs 0, 2, 4, 6 ----
        let in2 = $data[2];
        let in6 = $data[6];
        let rot26 = _mm_mulhrs_epi16(_mm_adds_epi16(in2, in6), _mm_set1_epi16(17734)); // 0.5411961
        let even_hi = _mm_subs_epi16(
            _mm_subs_epi16(rot26, in6),
            _mm_mulhrs_epi16(in6, _mm_set1_epi16(27779)), // 0.847759065
        );
        let even_lo = _mm_adds_epi16(rot26, _mm_mulhrs_epi16(in2, _mm_set1_epi16(25079))); // 0.765366865

        let in0 = $data[0];
        let in4 = $data[4];
        let sum04 = _mm_adds_epi16(in0, in4);
        let diff04 = _mm_subs_epi16(in0, in4);

        let x0 = _mm_adds_epi16(sum04, even_lo);
        let x1 = _mm_adds_epi16(diff04, even_hi);
        let x2 = _mm_subs_epi16(diff04, even_hi);
        let x3 = _mm_subs_epi16(sum04, even_lo);

        // ---- Odd half: inputs 1, 3, 5, 7 ----
        let in7 = $data[7];
        let in5 = $data[5];
        let in3 = $data[3];
        let in1 = $data[1];

        let sum73 = _mm_adds_epi16(in7, in3);
        let sum51 = _mm_adds_epi16(in5, in1);
        let sum71 = _mm_adds_epi16(in7, in1);
        let sum53 = _mm_adds_epi16(in5, in3);
        let sum_odd = _mm_adds_epi16(sum73, sum51);
        // sum_odd * 1.175875602
        let z5 = _mm_adds_epi16(sum_odd, _mm_mulhrs_epi16(sum_odd, _mm_set1_epi16(5763))); // 0.175875602

        let m7 = _mm_mulhrs_epi16(in7, _mm_set1_epi16(9786)); // 0.298631336
        // in5 * 2.053119869
        let m5 = _mm_adds_epi16(
            _mm_adds_epi16(in5, in5),
            _mm_mulhrs_epi16(in5, _mm_set1_epi16(1741)), // 0.053119869
        );
        // in3 * 3.072711026
        let m3 = _mm_adds_epi16(
            _mm_adds_epi16(in3, _mm_adds_epi16(in3, in3)),
            _mm_mulhrs_epi16(in3, _mm_set1_epi16(2383)), // 0.072711026
        );
        // in1 * 1.501321110
        let m1 = _mm_adds_epi16(in1, _mm_mulhrs_epi16(in1, _mm_set1_epi16(16427))); // 0.501321110

        let z71 = _mm_subs_epi16(z5, _mm_mulhrs_epi16(sum71, _mm_set1_epi16(29490))); // 0.899976223
        // z5 - sum53 * 2.562915447
        let z53 = _mm_subs_epi16(
            _mm_subs_epi16(_mm_subs_epi16(z5, sum53), sum53),
            _mm_mulhrs_epi16(sum53, _mm_set1_epi16(18446)), // 0.562915447
        );
        // sum73 * -1.961570560
        let z73 = _mm_subs_epi16(
            _mm_mulhrs_epi16(sum73, _mm_set1_epi16(-31509)), // -0.961570560
            sum73,
        );
        let z51 = _mm_mulhrs_epi16(sum51, _mm_set1_epi16(-12785)); // -0.390180644

        let odd0 = _mm_adds_epi16(_mm_adds_epi16(z71, z51), m1);
        let odd1 = _mm_adds_epi16(_mm_adds_epi16(z53, z73), m3);
        let odd2 = _mm_adds_epi16(_mm_adds_epi16(z53, z51), m5);
        let odd3 = _mm_adds_epi16(_mm_adds_epi16(z71, z73), m7);

        // ---- Butterfly: outputs k and 7 - k share one even/odd pair ----
        $data[0] = _mm_adds_epi16(x0, odd0);
        $data[1] = _mm_adds_epi16(x1, odd1);
        $data[2] = _mm_adds_epi16(x2, odd2);
        $data[3] = _mm_adds_epi16(x3, odd3);
        $data[4] = _mm_subs_epi16(x3, odd3);
        $data[5] = _mm_subs_epi16(x2, odd2);
        $data[6] = _mm_subs_epi16(x1, odd1);
        $data[7] = _mm_subs_epi16(x0, odd0);
    };
}
313
// In-place transpose of an 8x8 matrix of 16-bit lanes stored as eight `__m128i`
// rows in `$data` (an indexable, assignable expression, evaluated once per
// access).
//
// The transpose is done in three interleaving stages: 16-bit lanes of adjacent
// row pairs, then 32-bit groups of adjacent pairs-of-pairs, then 64-bit halves.
// After the last stage `$data[c]` holds what used to be column `c`.
//
// The macro expands to statements that call SSE2 intrinsics by their bare
// names, so those intrinsics must be in scope at the invocation site.
macro_rules! transpose8_ssse3 {
    ($data:expr) => {
        // Stage 1: interleave 16-bit lanes of rows (0,1), (2,3), (4,5), (6,7).
        let rows01_lo = _mm_unpacklo_epi16($data[0], $data[1]);
        let rows01_hi = _mm_unpackhi_epi16($data[0], $data[1]);
        let rows23_lo = _mm_unpacklo_epi16($data[2], $data[3]);
        let rows23_hi = _mm_unpackhi_epi16($data[2], $data[3]);
        let rows45_lo = _mm_unpacklo_epi16($data[4], $data[5]);
        let rows45_hi = _mm_unpackhi_epi16($data[4], $data[5]);
        let rows67_lo = _mm_unpacklo_epi16($data[6], $data[7]);
        let rows67_hi = _mm_unpackhi_epi16($data[6], $data[7]);

        // Stage 2: interleave 32-bit groups, giving four rows' worth of two columns each.
        let top_cols01 = _mm_unpacklo_epi32(rows01_lo, rows23_lo);
        let top_cols23 = _mm_unpackhi_epi32(rows01_lo, rows23_lo);
        let top_cols45 = _mm_unpacklo_epi32(rows01_hi, rows23_hi);
        let top_cols67 = _mm_unpackhi_epi32(rows01_hi, rows23_hi);
        let bottom_cols01 = _mm_unpacklo_epi32(rows45_lo, rows67_lo);
        let bottom_cols23 = _mm_unpackhi_epi32(rows45_lo, rows67_lo);
        let bottom_cols45 = _mm_unpacklo_epi32(rows45_hi, rows67_hi);
        let bottom_cols67 = _mm_unpackhi_epi32(rows45_hi, rows67_hi);

        // Stage 3: join the 64-bit halves from rows 0-3 and rows 4-7 into full columns.
        $data[0] = _mm_unpacklo_epi64(top_cols01, bottom_cols01);
        $data[1] = _mm_unpackhi_epi64(top_cols01, bottom_cols01);
        $data[2] = _mm_unpacklo_epi64(top_cols23, bottom_cols23);
        $data[3] = _mm_unpackhi_epi64(top_cols23, bottom_cols23);
        $data[4] = _mm_unpacklo_epi64(top_cols45, bottom_cols45);
        $data[5] = _mm_unpackhi_epi64(top_cols45, bottom_cols45);
        $data[6] = _mm_unpacklo_epi64(top_cols67, bottom_cols67);
        $data[7] = _mm_unpackhi_epi64(top_cols67, bottom_cols67);
    };
}
341
+
342
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
343
+ #[ target_feature( enable = "ssse3" ) ]
344
+ #[ allow( unsafe_code) ]
345
+ pub unsafe fn dequantize_and_idct_block_8x8_ssse3 (
346
+ coefficients : & [ i16 ] ,
347
+ quantization_table : & [ u16 ; 64 ] ,
348
+ output_linestride : usize ,
349
+ output : & mut [ u8 ] ,
350
+ ) {
351
+ assert ! ( coefficients. len( ) >= 64 ) ;
352
+ assert ! ( output. len( ) >= output_linestride * 7 + 8 ) ;
353
+
354
+ #[ cfg( target_arch = "x86" ) ]
355
+ use std:: arch:: x86:: * ;
356
+ #[ cfg( target_arch = "x86_64" ) ]
357
+ use std:: arch:: x86_64:: * ;
358
+
359
+ const SHIFT : i32 = 3 ;
360
+
361
+ let mut data = [ _mm_setzero_si128 ( ) ; 8 ] ;
362
+ for i in 0 ..8 {
363
+ data[ i] = _mm_slli_epi16 (
364
+ _mm_mullo_epi16 (
365
+ _mm_loadu_si128 ( coefficients. as_ptr ( ) . wrapping_add ( i * 8 ) as * const _ ) ,
366
+ _mm_loadu_si128 ( quantization_table. as_ptr ( ) . wrapping_add ( i * 8 ) as * const _ ) ,
367
+ ) ,
368
+ SHIFT ,
369
+ ) ;
370
+ }
371
+
372
+ idct8_ssse3 ! ( data) ;
373
+ transpose8_ssse3 ! ( data) ;
374
+ idct8_ssse3 ! ( data) ;
375
+ transpose8_ssse3 ! ( data) ;
376
+
377
+ for i in 0 ..8 {
378
+ let mut buf = [ 0u8 ; 16 ] ;
379
+ _mm_storeu_si128 (
380
+ buf. as_mut_ptr ( ) as * mut _ ,
381
+ _mm_packus_epi16 (
382
+ _mm_srai_epi16 (
383
+ _mm_adds_epi16 ( data[ i] , _mm_set1_epi16 ( 257 << ( SHIFT + 2 ) ) ) ,
384
+ SHIFT + 3 ,
385
+ ) ,
386
+ _mm_setzero_si128 ( ) ,
387
+ ) ,
388
+ ) ;
389
+ std:: ptr:: copy_nonoverlapping :: < u8 > (
390
+ buf. as_ptr ( ) ,
391
+ output. as_mut_ptr ( ) . wrapping_add ( output_linestride * i) as * mut _ ,
392
+ 8 ,
393
+ ) ;
394
+ }
395
+ }
396
+
241
397
pub fn dequantize_and_idct_block_8x8 (
242
398
coefficients : & [ i16 ] ,
243
399
quantization_table : & [ u16 ; 64 ] ,
244
400
output_linestride : usize ,
245
401
output : & mut [ u8 ] ,
246
402
) {
403
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
404
+ #[ allow( unsafe_code) ]
405
+ {
406
+ if is_x86_feature_detected ! ( "ssse3" ) {
407
+ unsafe {
408
+ dequantize_and_idct_block_8x8_ssse3 (
409
+ coefficients,
410
+ quantization_table,
411
+ output_linestride,
412
+ output,
413
+ )
414
+ } ;
415
+ return ;
416
+ }
417
+ }
418
+
247
419
let output = output. chunks_mut ( output_linestride) ;
248
420
dequantize_and_idct_block_8x8_inner ( coefficients, quantization_table, output)
249
421
}
@@ -596,7 +768,9 @@ fn test_dequantize_and_idct_block_8x8() {
596
768
105 , 64 , 59 , 59 , 63 , 94 , 183 , 201 , 35 , 27 , 28 , 37 , 72 , 121 , 203 , 204 , 37 , 45 , 41 , 47 , 98 ,
597
769
154 , 223 , 208 ,
598
770
] ;
599
- assert_eq ! ( & output[ ..] , & expected_output[ ..] ) ;
771
+ for i in 0 ..64 {
772
+ assert ! ( ( output[ i] as i16 - expected_output[ i] as i16 ) . abs( ) <= 1 ) ;
773
+ }
600
774
}
601
775
602
776
#[ test]
@@ -608,6 +782,12 @@ fn test_dequantize_and_idct_block_8x8_all_zero() {
608
782
609
783
#[ test]
610
784
fn test_dequantize_and_idct_block_8x8_saturated ( ) {
785
+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
786
+ {
787
+ if is_x86_feature_detected ! ( "ssse3" ) {
788
+ return ;
789
+ }
790
+ }
611
791
let mut output = [ 0u8 ; 8 * 8 ] ;
612
792
dequantize_and_idct_block_8x8 ( & [ i16:: MAX ; 8 * 8 ] , & [ u16:: MAX ; 8 * 8 ] , 8 , & mut output) ;
613
793
let expected = [
0 commit comments