 FOLD_CONST_L	.req	q10l
 FOLD_CONST_H	.req	q10h
 
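+	// Fold the 128-bit value in \v64 with the fold constants \v16_L and
+	// \v16_H: the low half of \v64 is multiplied by \v16_L, the high half
+	// by \v16_H, and the two 128-bit products are XORed together.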
+	.macro		pmull16x64_p64, v16, v64
+	vmull.p64	q11, \v64\()l, \v16\()_L
+	vmull.p64	\v64, \v64\()h, \v16\()_H
+	veor		\v64, \v64, q11
+	.endm
+
 	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
 	// into reg1, reg2.
-	.macro		fold_32_bytes, reg1, reg2
-	vld1.64		{q11-q12}, [buf]!
+	.macro		fold_32_bytes, reg1, reg2, p
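+	// \p is the suffix of the pmull16x64_\p macro that performs the
+	// carryless multiplies.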
+	vld1.64		{q8-q9}, [buf]!
 
-	vmull.p64	q8, \reg1\()h, FOLD_CONST_H
-	vmull.p64	\reg1, \reg1\()l, FOLD_CONST_L
-	vmull.p64	q9, \reg2\()h, FOLD_CONST_H
-	vmull.p64	\reg2, \reg2\()l, FOLD_CONST_L
+	pmull16x64_\p	FOLD_CONST, \reg1
+	pmull16x64_\p	FOLD_CONST, \reg2
 
-CPU_LE(	vrev64.8	q11, q11	)
-CPU_LE(	vrev64.8	q12, q12	)
-	vswp		q11l, q11h
-	vswp		q12l, q12h
+CPU_LE(	vrev64.8	q8, q8	)
+CPU_LE(	vrev64.8	q9, q9	)
+	vswp		q8l, q8h
+	vswp		q9l, q9h
 
 	veor.8		\reg1, \reg1, q8
 	veor.8		\reg2, \reg2, q9
-	veor.8		\reg1, \reg1, q11
-	veor.8		\reg2, \reg2, q12
 	.endm
 
 	// Fold src_reg into dst_reg, optionally loading the next fold constants
-	.macro		fold_16_bytes, src_reg, dst_reg, load_next_consts
-	vmull.p64	q8, \src_reg\()l, FOLD_CONST_L
-	vmull.p64	\src_reg, \src_reg\()h, FOLD_CONST_H
+	.macro		fold_16_bytes, src_reg, dst_reg, p, load_next_consts
+	pmull16x64_\p	FOLD_CONST, \src_reg
 	.ifnb		\load_next_consts
 	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
 	.endif
-	veor.8		\dst_reg, \dst_reg, q8
 	veor.8		\dst_reg, \dst_reg, \src_reg
 	.endm
 
-	//
-	// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
-	//
-	// Assumes len >= 16.
-	//
-ENTRY(crc_t10dif_pmull)
-
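+	// Emit the body of the CRC-T10DIF computation, parameterised by \p,
+	// the suffix of the pmull16x64_\p macro that performs the carryless
+	// multiplies.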
+	.macro		crct10dif, p
 	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
 	cmp		len, #256
-	blt		.Lless_than_256_bytes
+	blt		.Lless_than_256_bytes\@
 
 	mov_l		fold_consts_ptr, .Lfold_across_128_bytes_consts
 
@@ -194,27 +188,27 @@ CPU_LE( vrev64.8 q7, q7 )
 
 	// While >= 128 data bytes remain (not counting q0-q7), fold the 128
 	// bytes q0-q7 into them, storing the result back into q0-q7.
-.Lfold_128_bytes_loop:
-	fold_32_bytes	q0, q1
-	fold_32_bytes	q2, q3
-	fold_32_bytes	q4, q5
-	fold_32_bytes	q6, q7
+.Lfold_128_bytes_loop\@:
+	fold_32_bytes	q0, q1, \p
+	fold_32_bytes	q2, q3, \p
+	fold_32_bytes	q4, q5, \p
+	fold_32_bytes	q6, q7, \p
 	subs		len, len, #128
-	bge		.Lfold_128_bytes_loop
+	bge		.Lfold_128_bytes_loop\@
 
 	// Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
 
 	// Fold across 64 bytes.
 	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
-	fold_16_bytes	q0, q4
-	fold_16_bytes	q1, q5
-	fold_16_bytes	q2, q6
-	fold_16_bytes	q3, q7, 1
+	fold_16_bytes	q0, q4, \p
+	fold_16_bytes	q1, q5, \p
+	fold_16_bytes	q2, q6, \p
+	fold_16_bytes	q3, q7, \p, 1
 	// Fold across 32 bytes.
-	fold_16_bytes	q4, q6
-	fold_16_bytes	q5, q7, 1
+	fold_16_bytes	q4, q6, \p
+	fold_16_bytes	q5, q7, \p, 1
 	// Fold across 16 bytes.
-	fold_16_bytes	q6, q7
+	fold_16_bytes	q6, q7, \p
 
 	// Add 128 to get the correct number of data bytes remaining in 0...127
 	// (not counting q7), following the previous extra subtraction by 128.
@@ -224,25 +218,23 @@ CPU_LE( vrev64.8 q7, q7 )
 
 	// While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
 	// into them, storing the result back into q7.
-	blt		.Lfold_16_bytes_loop_done
-.Lfold_16_bytes_loop:
-	vmull.p64	q8, q7l, FOLD_CONST_L
-	vmull.p64	q7, q7h, FOLD_CONST_H
-	veor.8		q7, q7, q8
+	blt		.Lfold_16_bytes_loop_done\@
+.Lfold_16_bytes_loop\@:
+	pmull16x64_\p	FOLD_CONST, q7
 	vld1.64		{q0}, [buf]!
 CPU_LE(	vrev64.8	q0, q0	)
 	vswp		q0l, q0h
 	veor.8		q7, q7, q0
 	subs		len, len, #16
-	bge		.Lfold_16_bytes_loop
+	bge		.Lfold_16_bytes_loop\@
 
-.Lfold_16_bytes_loop_done:
+.Lfold_16_bytes_loop_done\@:
 	// Add 16 to get the correct number of data bytes remaining in 0...15
 	// (not counting q7), following the previous extra subtraction by 16.
 	adds		len, len, #16
-	beq		.Lreduce_final_16_bytes
+	beq		.Lreduce_final_16_bytes\@
 
-.Lhandle_partial_segment:
+.Lhandle_partial_segment\@:
 	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
 	// 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
 	// do this without needing a fold constant for each possible 'len',
@@ -277,12 +269,46 @@ CPU_LE( vrev64.8 q0, q0 )
 	vbsl.8		q2, q1, q0
 
 	// Fold the first chunk into the second chunk, storing the result in q7.
-	vmull.p64	q0, q3l, FOLD_CONST_L
-	vmull.p64	q7, q3h, FOLD_CONST_H
-	veor.8		q7, q7, q0
-	veor.8		q7, q7, q2
+	pmull16x64_\p	FOLD_CONST, q3
+	veor.8		q7, q3, q2
+	b		.Lreduce_final_16_bytes\@
+
+.Lless_than_256_bytes\@:
+	// Checksumming a buffer of length 16...255 bytes
+
+	mov_l		fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+	// Load the first 16 data bytes.
+	vld1.64		{q7}, [buf]!
+CPU_LE(	vrev64.8	q7, q7	)
+	vswp		q7l, q7h
+
+	// XOR the first 16 data *bits* with the initial CRC value.
+	vmov.i8		q0h, #0
+	vmov.u16	q0h[3], init_crc
+	veor.8		q7h, q7h, q0h
+
+	// Load the fold-across-16-bytes constants.
+	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+	cmp		len, #16
+	beq		.Lreduce_final_16_bytes\@	// len == 16
+	subs		len, len, #32
+	addlt		len, len, #16
+	blt		.Lhandle_partial_segment\@	// 17 <= len <= 31
+	b		.Lfold_16_bytes_loop\@		// 32 <= len <= 255
+
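+	// The final reduction is not part of the macro body: the label below
+	// marks the end of the expansion, and the code that follows the macro
+	// invocation reduces the 128-bit value in q7 to the final 16-bit CRC.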
+.Lreduce_final_16_bytes\@:
+	.endm
+
+	//
+	// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+	//
+	// Assumes len >= 16.
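+	// Uses the 64x64 bit vmull.p64 instruction for the carryless multiplies.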
+	//
+ENTRY(crc_t10dif_pmull64)
+	crct10dif	p64
 
-.Lreduce_final_16_bytes:
 	// Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
 
 	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
@@ -316,31 +342,7 @@ CPU_LE( vrev64.8 q0, q0 )
 	vmov.u16	r0, q0l[0]
 	bx		lr
 
-.Lless_than_256_bytes:
-	// Checksumming a buffer of length 16...255 bytes
-
-	mov_l		fold_consts_ptr, .Lfold_across_16_bytes_consts
-
-	// Load the first 16 data bytes.
-	vld1.64		{q7}, [buf]!
-CPU_LE(	vrev64.8	q7, q7	)
-	vswp		q7l, q7h
-
-	// XOR the first 16 data *bits* with the initial CRC value.
-	vmov.i8		q0h, #0
-	vmov.u16	q0h[3], init_crc
-	veor.8		q7h, q7h, q0h
-
-	// Load the fold-across-16-bytes constants.
-	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
-	cmp		len, #16
-	beq		.Lreduce_final_16_bytes	// len == 16
-	subs		len, len, #32
-	addlt		len, len, #16
-	blt		.Lhandle_partial_segment	// 17 <= len <= 31
-	b		.Lfold_16_bytes_loop	// 32 <= len <= 255
-ENDPROC(crc_t10dif_pmull)
+ENDPROC(crc_t10dif_pmull64)
 
 	.section	".rodata", "a"
 	.align		4