Skip to content

Commit 3aa8a4f

Browse files
committed
Add more comments.
1 parent 5a425fe commit 3aa8a4f

File tree

1 file changed

+30
-4
lines changed

1 file changed

+30
-4
lines changed

src/arch/ssse3.rs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ unsafe fn idct8(data: &mut [__m128i; 8]) {
1111
// _mm_mulhrs_epi16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
1212
// slight differences in rounding).
1313

14+
// The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
15+
// doesn't apply any further scaling and fixed point constants have a different precision.
16+
1417
let p2 = data[2];
1518
let p3 = data[6];
1619
let p1 = _mm_mulhrs_epi16(_mm_adds_epi16(p2, p3), _mm_set1_epi16(17734)); // 0.5411961
@@ -83,6 +86,11 @@ unsafe fn idct8(data: &mut [__m128i; 8]) {
8386
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
8487
#[target_feature(enable = "ssse3")]
8588
unsafe fn transpose8(data: &mut [__m128i; 8]) {
89+
// Transpose an 8x8 matrix with a sequence of interleaving operations.
90+
// Naming: dABl contains elements from the *l*ower halves of vectors A and B, interleaved, i.e.
91+
// A0 B0 A1 B1 ...
92+
// dABCDll contains elements from the lower quarter (ll) of vectors A, B, C, D, interleaved -
93+
// A0 B0 C0 D0 A1 B1 C1 D1 ...
8694
let d01l = _mm_unpacklo_epi16(data[0], data[1]);
8795
let d23l = _mm_unpacklo_epi16(data[2], data[3]);
8896
let d45l = _mm_unpacklo_epi16(data[4], data[5]);
@@ -91,6 +99,7 @@ unsafe fn transpose8(data: &mut [__m128i; 8]) {
9199
let d23h = _mm_unpackhi_epi16(data[2], data[3]);
92100
let d45h = _mm_unpackhi_epi16(data[4], data[5]);
93101
let d67h = _mm_unpackhi_epi16(data[6], data[7]);
102+
// Operating on 32-bits will interleave *consecutive pairs* of 16-bit integers.
94103
let d0123ll = _mm_unpacklo_epi32(d01l, d23l);
95104
let d0123lh = _mm_unpackhi_epi32(d01l, d23l);
96105
let d4567ll = _mm_unpacklo_epi32(d45l, d67l);
@@ -99,6 +108,7 @@ unsafe fn transpose8(data: &mut [__m128i; 8]) {
99108
let d0123hh = _mm_unpackhi_epi32(d01h, d23h);
100109
let d4567hl = _mm_unpacklo_epi32(d45h, d67h);
101110
let d4567hh = _mm_unpackhi_epi32(d45h, d67h);
111+
// Operating on 64-bits will interleave *consecutive quadruples* of 16-bit integers.
102112
data[0] = _mm_unpacklo_epi64(d0123ll, d4567ll);
103113
data[1] = _mm_unpackhi_epi64(d0123ll, d4567ll);
104114
data[2] = _mm_unpacklo_epi64(d0123lh, d4567lh);
@@ -137,6 +147,7 @@ pub unsafe fn dequantize_and_idct_block_8x8(
137147

138148
const SHIFT: i32 = 3;
139149

150+
// Read the DCT coefficients, scale them up and dequantize them.
140151
let mut data = [_mm_setzero_si128(); 8];
141152
for i in 0..8 {
142153
data[i] = _mm_slli_epi16(
@@ -148,20 +159,28 @@ pub unsafe fn dequantize_and_idct_block_8x8(
148159
);
149160
}
150161

162+
// Usual column IDCT - transpose - column IDCT - transpose approach.
151163
idct8(&mut data);
152164
transpose8(&mut data);
153165
idct8(&mut data);
154166
transpose8(&mut data);
155167

156168
for i in 0..8 {
157169
let mut buf = [0u8; 16];
170+
// The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
171+
// increased by 3.
172+
// As values will be stored in a u8, they need to be 128-centered and not 0-centered.
173+
// We add 128 with the appropriate shift for that purpose.
174+
const OFFSET: i16 = 128 << (SHIFT + 3);
175+
// We want a rounding right shift, so we add (1/2) << (SHIFT+3) before shifting.
176+
const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;
177+
178+
let data_with_offset = _mm_adds_epi16(data[i], _mm_set1_epi16(OFFSET + ROUNDING_BIAS));
179+
158180
_mm_storeu_si128(
159181
buf.as_mut_ptr() as *mut _,
160182
_mm_packus_epi16(
161-
_mm_srai_epi16(
162-
_mm_adds_epi16(data[i], _mm_set1_epi16(257 << (SHIFT + 2))),
163-
SHIFT + 3,
164-
),
183+
_mm_srai_epi16(data_with_offset, SHIFT + 3),
165184
_mm_setzero_si128(),
166185
),
167186
);
@@ -226,6 +245,10 @@ pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &
226245
let b = _mm_packus_epi16(_mm_srai_epi16(b, SHIFT), zero);
227246

228247
// Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...
248+
249+
// Control vectors for _mm_shuffle_epi8. -0x7F is selected so that the resulting position
250+
// after _mm_shuffle_epi8 will be filled with 0, so that the r, g, and b vectors can then
251+
// be OR-ed together.
229252
let shufr = _mm_setr_epi8(
230253
0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F, -0x7F, 5,
231254
);
@@ -240,6 +263,9 @@ pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &
240263
_mm_or_si128(_mm_shuffle_epi8(g, shufg), _mm_shuffle_epi8(b, shufb)),
241264
);
242265

266+
// For the next part of the rgb vectors, we need to select R values from 6 up, G and B from
267+
// 5 up. The highest bit of -0x7F + 6 is still set, so the corresponding location will
268+
// still be 0.
243269
let shufr1 = _mm_add_epi8(shufb, _mm_set1_epi8(6));
244270
let shufg1 = _mm_add_epi8(shufr, _mm_set1_epi8(5));
245271
let shufb1 = _mm_add_epi8(shufg, _mm_set1_epi8(5));

0 commit comments

Comments
 (0)