@@ -11,6 +11,9 @@ unsafe fn idct8(data: &mut [__m128i; 8]) {
11
11
// _mm_mulhrs_epi16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
12
12
// slight differences in rounding).
13
13
14
+ // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
15
+ // doesn't apply any further scaling and fixed point constants have a different precision.
16
+
14
17
let p2 = data[ 2 ] ;
15
18
let p3 = data[ 6 ] ;
16
19
let p1 = _mm_mulhrs_epi16 ( _mm_adds_epi16 ( p2, p3) , _mm_set1_epi16 ( 17734 ) ) ; // 0.5411961
@@ -83,6 +86,11 @@ unsafe fn idct8(data: &mut [__m128i; 8]) {
83
86
#[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
84
87
#[ target_feature( enable = "ssse3" ) ]
85
88
unsafe fn transpose8 ( data : & mut [ __m128i ; 8 ] ) {
89
+ // Transpose an 8x8 matrix with a sequence of interleaving operations.
90
+ // Naming: dABl contains elements from the *l*ower halves of vectors A and B, interleaved, i.e.
91
+ // A0 B0 A1 B1 ...
92
+ // dABCDll contains elements from the lower quarter (ll) of vectors A, B, C, D, interleaved -
93
+ // A0 B0 C0 D0 A1 B1 C1 D1 ...
86
94
let d01l = _mm_unpacklo_epi16 ( data[ 0 ] , data[ 1 ] ) ;
87
95
let d23l = _mm_unpacklo_epi16 ( data[ 2 ] , data[ 3 ] ) ;
88
96
let d45l = _mm_unpacklo_epi16 ( data[ 4 ] , data[ 5 ] ) ;
@@ -91,6 +99,7 @@ unsafe fn transpose8(data: &mut [__m128i; 8]) {
91
99
let d23h = _mm_unpackhi_epi16 ( data[ 2 ] , data[ 3 ] ) ;
92
100
let d45h = _mm_unpackhi_epi16 ( data[ 4 ] , data[ 5 ] ) ;
93
101
let d67h = _mm_unpackhi_epi16 ( data[ 6 ] , data[ 7 ] ) ;
102
+ // Operating on 32-bit elements interleaves *consecutive pairs* of 16-bit integers.
94
103
let d0123ll = _mm_unpacklo_epi32 ( d01l, d23l) ;
95
104
let d0123lh = _mm_unpackhi_epi32 ( d01l, d23l) ;
96
105
let d4567ll = _mm_unpacklo_epi32 ( d45l, d67l) ;
@@ -99,6 +108,7 @@ unsafe fn transpose8(data: &mut [__m128i; 8]) {
99
108
let d0123hh = _mm_unpackhi_epi32 ( d01h, d23h) ;
100
109
let d4567hl = _mm_unpacklo_epi32 ( d45h, d67h) ;
101
110
let d4567hh = _mm_unpackhi_epi32 ( d45h, d67h) ;
111
+ // Operating on 64-bit elements interleaves *consecutive quadruples* of 16-bit integers.
102
112
data[ 0 ] = _mm_unpacklo_epi64 ( d0123ll, d4567ll) ;
103
113
data[ 1 ] = _mm_unpackhi_epi64 ( d0123ll, d4567ll) ;
104
114
data[ 2 ] = _mm_unpacklo_epi64 ( d0123lh, d4567lh) ;
@@ -137,6 +147,7 @@ pub unsafe fn dequantize_and_idct_block_8x8(
137
147
138
148
const SHIFT : i32 = 3 ;
139
149
150
+ // Read the DCT coefficients, scale them up and dequantize them.
140
151
let mut data = [ _mm_setzero_si128 ( ) ; 8 ] ;
141
152
for i in 0 ..8 {
142
153
data[ i] = _mm_slli_epi16 (
@@ -148,20 +159,28 @@ pub unsafe fn dequantize_and_idct_block_8x8(
148
159
) ;
149
160
}
150
161
162
+ // Usual column IDCT - transpose - column IDCT - transpose approach.
151
163
idct8 ( & mut data) ;
152
164
transpose8 ( & mut data) ;
153
165
idct8 ( & mut data) ;
154
166
transpose8 ( & mut data) ;
155
167
156
168
for i in 0 ..8 {
157
169
let mut buf = [ 0u8 ; 16 ] ;
170
+ // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
171
+ // increased by 3.
172
+ // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
173
+ // We add 128 with the appropriate shift for that purpose.
174
+ const OFFSET : i16 = 128 << ( SHIFT + 3 ) ;
175
+ // We want a rounding right shift, so we add half of the divisor, i.e. (1 << (SHIFT + 3)) / 2, before shifting.
176
+ const ROUNDING_BIAS : i16 = ( 1 << ( SHIFT + 3 ) ) >> 1 ;
177
+
178
+ let data_with_offset = _mm_adds_epi16 ( data[ i] , _mm_set1_epi16 ( OFFSET + ROUNDING_BIAS ) ) ;
179
+
158
180
_mm_storeu_si128 (
159
181
buf. as_mut_ptr ( ) as * mut _ ,
160
182
_mm_packus_epi16 (
161
- _mm_srai_epi16 (
162
- _mm_adds_epi16 ( data[ i] , _mm_set1_epi16 ( 257 << ( SHIFT + 2 ) ) ) ,
163
- SHIFT + 3 ,
164
- ) ,
183
+ _mm_srai_epi16 ( data_with_offset, SHIFT + 3 ) ,
165
184
_mm_setzero_si128 ( ) ,
166
185
) ,
167
186
) ;
@@ -226,6 +245,10 @@ pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &
226
245
let b = _mm_packus_epi16 ( _mm_srai_epi16 ( b, SHIFT ) , zero) ;
227
246
228
247
// Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...
248
+
249
+ // Control vectors for _mm_shuffle_epi8. -0x7F is selected so that the resulting position
250
+ // after _mm_shuffle_epi8 will be filled with 0, so that the r, g, and b vectors can then
251
+ // be OR-ed together.
229
252
let shufr = _mm_setr_epi8 (
230
253
0 , -0x7F , -0x7F , 1 , -0x7F , -0x7F , 2 , -0x7F , -0x7F , 3 , -0x7F , -0x7F , 4 , -0x7F , -0x7F , 5 ,
231
254
) ;
@@ -240,6 +263,9 @@ pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &
240
263
_mm_or_si128 ( _mm_shuffle_epi8 ( g, shufg) , _mm_shuffle_epi8 ( b, shufb) ) ,
241
264
) ;
242
265
266
+ // For the next part of the rgb vectors, we need to select R values from 6 up, G and B from
267
+ // 5 up. The highest bit of -0x7F + 6 is still set, so the corresponding location will
268
+ // still be 0.
243
269
let shufr1 = _mm_add_epi8 ( shufb, _mm_set1_epi8 ( 6 ) ) ;
244
270
let shufg1 = _mm_add_epi8 ( shufr, _mm_set1_epi8 ( 5 ) ) ;
245
271
let shufb1 = _mm_add_epi8 ( shufg, _mm_set1_epi8 ( 5 ) ) ;
0 commit comments