FOLD_CONST_L	.req	q10l
FOLD_CONST_H	.req	q10h

+/*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ *   { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant. The resulting 80-bit vectors are XOR'ed together.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ *  0            w0*x0 ^              |        y0*z0 ^
+ *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
+ *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
+ *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
+ *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
+ *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
+ *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
+ *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
+ *  8            w1*x7     << 64      |        y1*z7     << 64
+ *
+ * The inputs can be reorganized into
+ *
+ *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and XORed together to produce the
+ * final 80-bit result.
+ */
+	.macro		pmull16x64_p8, v16, v64
+	vext.8		q11, \v64, \v64, #1
+	vld1.64		{q12}, [r4, :128]
+	vuzp.8		q11, \v64
+	vtbl.8		d24, {\v16\()_L-\v16\()_H}, d24
+	vtbl.8		d25, {\v16\()_L-\v16\()_H}, d25
+	bl		__pmull16x64_p8
+	veor		\v64, q12, q14
+	.endm
+
+__pmull16x64_p8:
+	vmull.p8	q13, d23, d24
+	vmull.p8	q14, d23, d25
+	vmull.p8	q15, d22, d24
+	vmull.p8	q12, d22, d25
+
+	veor		q14, q14, q15
+	veor		d24, d24, d25
+	veor		d26, d26, d27
+	veor		d28, d28, d29
+	vmov.i32	d25, #0
+	vmov.i32	d29, #0
+	vext.8		q12, q12, q12, #14
+	vext.8		q14, q14, q14, #15
+	veor		d24, d24, d26
+	bx		lr
+ENDPROC(__pmull16x64_p8)
+
	.macro		pmull16x64_p64, v16, v64
	vmull.p64	q11, \v64\()l, \v16\()_L
	vmull.p64	\v64, \v64\()h, \v16\()_H

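The comment above defines the 16x64 carryless product purely in terms of 8x8 partial products and ranks. As a rough standalone cross-check of that decomposition (not part of the patch; function names and test values below are illustrative only), the same 80-bit product can be built in plain C from 8x8 partial products and compared against a bit-at-a-time reference:

#include <stdint.h>
#include <stdio.h>

/* 8x8 -> 16 bit carryless multiply: the per-lane primitive that vmull.p8 provides. */
static uint16_t clmul8x8(uint8_t a, uint8_t b)
{
	uint16_t r = 0;

	for (int i = 0; i < 8; i++)
		if (b & (1 << i))
			r ^= (uint16_t)a << i;
	return r;
}

/* Reference: 16x64 -> 80 bit carryless multiply, one bit of 'w' at a time.
 * The 80-bit result is returned as 64 low bits plus 16 high bits. */
static void clmul_16x64_ref(uint16_t w, uint64_t x, uint64_t *lo, uint16_t *hi)
{
	*lo = 0;
	*hi = 0;
	for (int i = 0; i < 16; i++) {
		if (!(w & (1u << i)))
			continue;
		*lo ^= x << i;
		if (i)
			*hi ^= (uint16_t)(x >> (64 - i));
	}
}

/* Same product assembled from 8x8 partial products, following the rank
 * table in the comment: w0*x_i contributes at rank i, w1*x_i at rank i+1. */
static void clmul_16x64_by_bytes(uint16_t w, uint64_t x, uint64_t *lo, uint16_t *hi)
{
	uint8_t w0 = (uint8_t)w, w1 = (uint8_t)(w >> 8);

	*lo = 0;
	*hi = 0;
	for (int i = 0; i < 8; i++) {
		uint8_t xi = (uint8_t)(x >> (8 * i));
		/* 24-bit term covering ranks i and i+1 */
		uint32_t t = clmul8x8(w0, xi) ^ ((uint32_t)clmul8x8(w1, xi) << 8);

		*lo ^= (uint64_t)t << (8 * i);
		if (8 * i + 24 > 64)
			*hi ^= (uint16_t)((uint64_t)t >> (64 - 8 * i));
	}
}

int main(void)
{
	uint64_t lo_a, lo_b;
	uint16_t hi_a, hi_b;

	clmul_16x64_ref(0x8bb7, 0x0123456789abcdefULL, &lo_a, &hi_a);
	clmul_16x64_by_bytes(0x8bb7, 0x0123456789abcdefULL, &lo_b, &hi_b);

	printf("ref:   %04x%016llx\n", (unsigned)hi_a, (unsigned long long)lo_a);
	printf("bytes: %04x%016llx\n", (unsigned)hi_b, (unsigned long long)lo_b);
	return hi_a != hi_b || lo_a != lo_b;
}

In the macro above, the vext.8/vuzp.8 pair performs the even/odd byte split of the 64-bit operand and the vtbl.8 lookups (driven by .L16x64perm) perform the byte broadcasts, so both columns of the rank table share a single set of vmull.p8 instructions.
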
@@ -249,9 +325,9 @@ CPU_LE(	vrev64.8	q0, q0	)
	vswp		q0l, q0h

	// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
-	mov_l		r3, .Lbyteshift_table + 16
-	sub		r3, r3, len
-	vld1.8		{q2}, [r3]
+	mov_l		r1, .Lbyteshift_table + 16
+	sub		r1, r1, len
+	vld1.8		{q2}, [r1]
	vtbl.8		q1l, {q7l-q7h}, q2l
	vtbl.8		q1h, {q7l-q7h}, q2h

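The rewritten lines keep using the existing .Lbyteshift_table trick: the 16 index bytes loaded from .Lbyteshift_table + 16 - len turn the following vtbl.8 pair into a left shift of q7 by len bytes, because indices outside the 16-byte table make vtbl return zero. A minimal C sketch of that lookup, assuming len is between 1 and 15 (the out-of-range sentinel bytes below are illustrative; any out-of-range value behaves the same):

#include <stdint.h>
#include <stdio.h>

/* vtbl.8 over a 16-byte table: indices 0-15 select a byte, anything else yields zero. */
static void vtbl16(const uint8_t tbl[16], const uint8_t idx[16], uint8_t out[16])
{
	for (int i = 0; i < 16; i++)
		out[i] = idx[i] < 16 ? tbl[idx[i]] : 0;
}

/* Sliding window of indices: the first half is deliberately out of range,
 * the second half counts 0..15. */
static const uint8_t byteshift_table[32] = {
	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
};

/* out[j] = in[j - len] for j >= len, zero below: the 128-bit register
 * shifted left by 8*len bits when viewed little-endian (1 <= len <= 15). */
static void shl_bytes(const uint8_t in[16], int len, uint8_t out[16])
{
	vtbl16(in, &byteshift_table[16 - len], out);
}

int main(void)
{
	uint8_t in[16], out[16];

	for (int i = 0; i < 16; i++)
		in[i] = (uint8_t)(0xa0 + i);
	shl_bytes(in, 5, out);
	for (int i = 0; i < 16; i++)
		printf("%02x ", out[i]);
	printf("\n");
	return 0;
}

shl_bytes(in, 5, out) leaves the low five output bytes zero and moves in[0..10] up by five positions, matching the "q7 left-shifted by 'len' bytes" comment.
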
@@ -341,9 +417,20 @@ ENTRY(crc_t10dif_pmull64)

	vmov.u16	r0, q0l[0]
	bx		lr
-
ENDPROC(crc_t10dif_pmull64)

+ENTRY(crc_t10dif_pmull8)
+	push		{r4, lr}
+	mov_l		r4, .L16x64perm
+
+	crct10dif	p8
+
+CPU_LE(	vrev64.8	q7, q7	)
+	vswp		q7l, q7h
+	vst1.64		{q7}, [r3, :128]
+	pop		{r4, pc}
+ENDPROC(crc_t10dif_pmull8)
+
	.section	".rodata", "a"
	.align		4

@@ -376,3 +463,6 @@ ENDPROC(crc_t10dif_pmull64)
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
+
+.L16x64perm:
+	.quad		0x808080800000000, 0x909090901010101
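
Read byte-wise, as the vld1.64 in pmull16x64_p8 sees it, .L16x64perm is the vtbl index pattern that broadcasts bytes 0 and 8 of { FOLD_CONST_L, FOLD_CONST_H } into d24 and bytes 1 and 9 into d25, i.e. the { w0, w0, w0, w0, y0, ... } / { w1, w1, w1, w1, y1, ... } layout from the big comment. A tiny sketch, assuming a little-endian host so the memory order matches what the NEON load sees:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	const uint64_t perm[2] = { 0x808080800000000ULL, 0x909090901010101ULL };
	uint8_t idx[16];

	memcpy(idx, perm, sizeof(idx));	/* byte order as stored in memory */

	/* Expected output on a little-endian host:
	 *   0 0 0 0 8 8 8 8   -> w0 w0 w0 w0 y0 y0 y0 y0
	 *   1 1 1 1 9 9 9 9   -> w1 w1 w1 w1 y1 y1 y1 y1 */
	for (int i = 0; i < 16; i++)
		printf("%d%c", idx[i], (i % 8 == 7) ? '\n' : ' ');
	return 0;
}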