Skip to content

Commit a6478d6

Browse files
ardbiesheuvel authored and ctmarinas committed
arm64/crc32: Implement 4-way interleave using PMULL
Now that kernel mode NEON no longer disables preemption, using FP/SIMD in library code which is not obviously part of the crypto subsystem is no longer problematic, as it will no longer incur unexpected latencies.

So accelerate the CRC-32 library code on arm64 to use a 4-way interleave, using PMULL instructions to implement the folding.

On Apple M2, this results in a speedup of 2 - 2.8x when using input sizes of 1k - 8k. For smaller sizes, the overhead of preserving and restoring the FP/SIMD register file may not be worth it, so 1k is used as a threshold for choosing this code path.

The coefficient tables were generated using code provided by Eric. [0]

[0] https://github.com/ebiggers/libdeflate/blob/master/scripts/gen_crc32_multipliers.c

Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20241018075347.2821102-8-ardb+git@google.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
1 parent b98b23e commit a6478d6

File tree

2 files changed

+276
-3
lines changed

2 files changed

+276
-3
lines changed

arch/arm64/lib/crc32-glue.c

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,40 @@
44
#include <linux/linkage.h>
55

66
#include <asm/alternative.h>
7+
#include <asm/cpufeature.h>
8+
#include <asm/neon.h>
9+
#include <asm/simd.h>
10+
11+
#include <crypto/internal/simd.h>
12+
13+
// The minimum input length to consider the 4-way interleaved code path
14+
static const size_t min_len = 1024;
715

816
asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len);
917
asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len);
1018
asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
1119

20+
asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
21+
asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
22+
asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
23+
1224
/*
 * crc32_le() - arm64 entry point for updating a little-endian CRC-32.
 * @crc: CRC value to continue from
 * @p:   pointer to the input bytes
 * @len: number of input bytes at @p
 *
 * Picks one of three implementations: crc32_le_base() when the CPU lacks
 * the ARM64_HAS_CRC32 capability, the PMULL based 4-way interleaved routine
 * for inputs of at least min_len bytes, and crc32_le_arm64() for everything
 * else (including the tail left over by the 4-way routine).
 *
 * Return: the updated CRC value.
 */
u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
1325
{
1426
/* No CRC32 capability on this CPU: hand the whole input to crc32_le_base() */
if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
1527
return crc32_le_base(crc, p, len);
1628

29+
/*
 * The 4-way path is only taken when the input is at least min_len bytes,
 * the PMULL feature is present and crypto_simd_usable() reports that SIMD
 * may be used in the current context.
 */
if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
30+
/* The 4-way routine uses FP/SIMD registers, so bracket the call */
kernel_neon_begin();
31+
crc = crc32_le_arm64_4way(crc, p, len);
32+
kernel_neon_end();
33+
34+
/*
 * The assembly routine consumes only whole 64-byte blocks (it works on
 * len >> 6 blocks), so skip what it processed and keep the remainder.
 */
p += round_down(len, 64);
35+
len %= 64;
36+
37+
/* Input was a multiple of 64 bytes: nothing left for the scalar code */
if (!len)
38+
return crc;
39+
}
40+
1741
/* Short inputs, or the sub-64-byte tail after the 4-way pass */
return crc32_le_arm64(crc, p, len);
1842
}
1943

@@ -22,6 +46,18 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
2246
if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
2347
return __crc32c_le_base(crc, p, len);
2448

49+
if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
50+
kernel_neon_begin();
51+
crc = crc32c_le_arm64_4way(crc, p, len);
52+
kernel_neon_end();
53+
54+
p += round_down(len, 64);
55+
len %= 64;
56+
57+
if (!len)
58+
return crc;
59+
}
60+
2561
return crc32c_le_arm64(crc, p, len);
2662
}
2763

@@ -30,5 +66,17 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
3066
if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
3167
return crc32_be_base(crc, p, len);
3268

69+
if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
70+
kernel_neon_begin();
71+
crc = crc32_be_arm64_4way(crc, p, len);
72+
kernel_neon_end();
73+
74+
p += round_down(len, 64);
75+
len %= 64;
76+
77+
if (!len)
78+
return crc;
79+
}
80+
3381
return crc32_be_arm64(crc, p, len);
3482
}

arch/arm64/lib/crc32.S

Lines changed: 228 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
/* SPDX-License-Identifier: GPL-2.0-only */
22
/*
3-
* Accelerated CRC32(C) using AArch64 CRC instructions
3+
* Accelerated CRC32(C) using AArch64 CRC and PMULL instructions
44
*
5-
* Copyright (C) 2016 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
5+
* Copyright (C) 2016 - 2018 Linaro Ltd.
6+
* Copyright (C) 2024 Google LLC
7+
*
8+
* Author: Ard Biesheuvel <ardb@kernel.org>
69
*/
710

811
#include <linux/linkage.h>
912
#include <asm/assembler.h>
1013

11-
.arch armv8-a+crc
14+
.cpu generic+crc+crypto
1215

1316
.macro bitle, reg
1417
.endm
@@ -135,3 +138,225 @@ SYM_FUNC_END(crc32c_le_arm64)
135138
SYM_FUNC_START(crc32_be_arm64)
136139
__crc32 order=be
137140
SYM_FUNC_END(crc32_be_arm64)
141+
142+
in .req x1
143+
len .req x2
144+
145+
/*
146+
* w0: input CRC at entry, output CRC at exit
147+
* x1: pointer to input buffer
148+
* x2: length of input in bytes
149+
*/
150+
/*
 * crc4way - body of a 4-way interleaved CRC routine.
 *
 * \insn:  the CRC instruction applied to each 64-bit doubleword
 * \table: table of folding coefficients, three 32-bit words per entry,
 *         indexed by the number of 64-byte blocks in the current chunk
 *         (the entry for 0 blocks is not stored)
 * \order: le (default) or be; selects the bit\order and \order helper
 *         macros used for bit/byte order handling
 *
 * Only whole 64-byte blocks are processed; the caller handles the
 * remaining len % 64 bytes. Each chunk of up to 64 blocks is split into
 * four contiguous streams of 16 * x3 bytes, which are run through \insn
 * independently (accumulators w0, w4, w5, w6) and then merged using PMULL.
 */
.macro crc4way, insn, table, order=le
151+
// Bit order handling for the incoming CRC (the bitle macro in this file is empty)
bit\order w0
152+
lsr len, len, #6 // len := # of 64-byte blocks
153+
154+
/* Process up to 64 blocks of 64 bytes at a time */
155+
.La\@: mov x3, #64
156+
cmp len, #64
157+
csel x3, x3, len, hi // x3 := min(len, 64)
158+
sub len, len, x3
159+
160+
/* Divide the input into 4 contiguous blocks */
// Stream pointers: in, x7, x8 and x9, each stream being 16 * x3 bytes long
161+
add x4, x3, x3, lsl #1 // x4 := 3 * x3
162+
add x7, in, x3, lsl #4 // x7 := in + 16 * x3
163+
add x8, in, x3, lsl #5 // x8 := in + 32 * x3
164+
add x9, in, x4, lsl #4 // x9 := in + 16 * x4
165+
166+
/* Load the folding coefficients from the lookup table */
// Each table entry is 12 bytes (3 words); x3 is at least 1 here, hence the -12 bias
167+
adr_l x5, \table - 12 // entry 0 omitted
168+
add x5, x5, x4, lsl #2 // x5 += 12 * x3
169+
// s0, s1 and s2 receive the coefficients later multiplied with w0, w4 and w5
ldp s0, s1, [x5]
170+
ldr s2, [x5, #8]
171+
172+
/* Zero init partial CRCs for this iteration */
// w0 keeps the running CRC; x17 is cleared so the first deferred fold below uses zero data
173+
mov w4, wzr
174+
mov w5, wzr
175+
mov w6, wzr
176+
mov x17, xzr
177+
178+
// Inner loop: one pass per 64-byte block, consuming 16 bytes from each stream
.Lb\@: sub x3, x3, #1
179+
// Deferred fold: feeds stream 4's second doubleword from the previous pass into w6
\insn w6, w6, x17
180+
ldp x10, x11, [in], #16
181+
ldp x12, x13, [x7], #16
182+
ldp x14, x15, [x8], #16
183+
ldp x16, x17, [x9], #16
184+
185+
// Byte order handling of the loaded data via the le/be macro (defined outside this view)
\order x10, x11, x12, x13, x14, x15, x16, x17
186+
187+
/* Apply the CRC transform to 4 16-byte blocks in parallel */
// Note that x17 is not consumed here; see the deferred fold at the loop head
188+
\insn w0, w0, x10
189+
\insn w4, w4, x12
190+
\insn w5, w5, x14
191+
\insn w6, w6, x16
192+
\insn w0, w0, x11
193+
\insn w4, w4, x13
194+
\insn w5, w5, x15
195+
cbnz x3, .Lb\@
196+
197+
/* Combine the 4 partial results into w0 */
// Multiply the partial CRCs of streams 1-3 by their coefficients and XOR the products
198+
mov v3.d[0], x0
199+
mov v4.d[0], x4
200+
mov v5.d[0], x5
201+
pmull v0.1q, v0.1d, v3.1d
202+
pmull v1.1q, v1.1d, v4.1d
203+
pmull v2.1q, v2.1d, v5.1d
204+
eor v0.8b, v0.8b, v1.8b
205+
eor v0.8b, v0.8b, v2.8b
206+
mov x5, v0.d[0]
207+
// XOR in the still pending last doubleword of stream 4, then run one final
// \insn step starting from stream 4's partial CRC to obtain the chunk result
eor x5, x5, x17
208+
\insn w0, w6, x5
209+
210+
// x9 has been post-incremented to the end of stream 4, i.e. the end of this chunk
mov in, x9
211+
cbnz len, .La\@
212+
213+
// Bit order handling for the outgoing CRC, mirroring the one at entry
bit\order w0
214+
ret
215+
.endm
216+
217+
/*
 * u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len)
 *
 * Expands crc4way with the crc32cx instruction and coefficient table .L0,
 * using the macro's default order (le). Processes len / 64 whole blocks.
 */
.align 5
218+
SYM_FUNC_START(crc32c_le_arm64_4way)
219+
crc4way crc32cx, .L0
220+
SYM_FUNC_END(crc32c_le_arm64_4way)
221+
222+
/*
 * u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len)
 *
 * Expands crc4way with the crc32x instruction and coefficient table .L1,
 * using the macro's default order (le). Processes len / 64 whole blocks.
 */
.align 5
223+
SYM_FUNC_START(crc32_le_arm64_4way)
224+
crc4way crc32x, .L1
225+
SYM_FUNC_END(crc32_le_arm64_4way)
226+
227+
/*
 * u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len)
 *
 * Expands crc4way with the crc32x instruction and coefficient table .L1
 * (the same instruction and table as the le variant), but with order=be so
 * that the bitbe/be helper macros are used. Processes len / 64 whole blocks.
 */
.align 5
228+
SYM_FUNC_START(crc32_be_arm64_4way)
229+
crc4way crc32x, .L1, be
230+
SYM_FUNC_END(crc32_be_arm64_4way)
231+
232+
.section .rodata, "a", %progbits
233+
.align 6
234+
.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27
235+
.long 0x0715ce53, 0x9e4addf8, 0xba4fc28e
236+
.long 0xc96cfdc0, 0x0715ce53, 0xddc0152b
237+
.long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8
238+
.long 0x299847d5, 0x878a92a7, 0x39d3b296
239+
.long 0xb6dd949b, 0xab7aff2a, 0x0715ce53
240+
.long 0xa60ce07b, 0x83348832, 0x47db8317
241+
.long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092
242+
.long 0x65863b64, 0xb6dd949b, 0xc96cfdc0
243+
.long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7
244+
.long 0xf285651c, 0xce7f39f4, 0xdaece73e
245+
.long 0x271d9844, 0xd270f1a2, 0xab7aff2a
246+
.long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385
247+
.long 0xcec3662e, 0x1b03397f, 0x83348832
248+
.long 0x8227bb8a, 0xb3e32c28, 0x299847d5
249+
.long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86
250+
.long 0xf6076544, 0x10746f3c, 0x18b33a4e
251+
.long 0x98d8d9cb, 0x271d9844, 0xb6dd949b
252+
.long 0x57a3d037, 0x93a5f730, 0x78d9ccb7
253+
.long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b
254+
.long 0xe0ac139e, 0xcec3662e, 0xa60ce07b
255+
.long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4
256+
.long 0xa2b73df1, 0xb0cd4768, 0x61d82e56
257+
.long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2
258+
.long 0xa90fd27a, 0x0167d312, 0xc619809d
259+
.long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d
260+
.long 0x4597456a, 0x98d8d9cb, 0x65863b64
261+
.long 0xc9c8b782, 0x68bce87a, 0x1b03397f
262+
.long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd
263+
.long 0x2342001e, 0x3771e98f, 0xb3e32c28
264+
.long 0xe8b6368b, 0x2178513a, 0x064f7f26
265+
.long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c
266+
.long 0x0b0bf8ca, 0x6f345e45, 0xf285651c
267+
.long 0x02ee03b2, 0xff0dba97, 0x10746f3c
268+
.long 0x135c83fd, 0xf872e54c, 0xc7a68855
269+
.long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844
270+
.long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c
271+
.long 0xded288f8, 0xb3af077a, 0x93a5f730
272+
.long 0x37170390, 0xca6ef3ac, 0x6cb08e5c
273+
.long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2
274+
.long 0xb25b29f2, 0xe9e28eb4, 0x1393e203
275+
.long 0x45cddf4e, 0xc9c8b782, 0xcec3662e
276+
.long 0xdfd94fb2, 0x93e106a4, 0x96c515bb
277+
.long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a
278+
.long 0x8e1450f7, 0x2342001e, 0x8227bb8a
279+
.long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768
280+
.long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35
281+
.long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c
282+
.long 0x0cd1526a, 0xf2271e60, 0x0ab3844b
283+
.long 0xd6c3a807, 0x2664fd8b, 0x0167d312
284+
.long 0x1d31175f, 0x02ee03b2, 0xf6076544
285+
.long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a
286+
.long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf
287+
.long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb
288+
.long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c
289+
.long 0x889774e1, 0xaa7c7ad5, 0x68bce87a
290+
.long 0x8a074012, 0xded288f8, 0x57a3d037
291+
.long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b
292+
.long 0x3be3c09b, 0x6353c1cc, 0x42d98888
293+
.long 0x465a4eee, 0xf48642e9, 0x3771e98f
294+
.long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9
295+
.long 0xa52f58ec, 0x9a5ede41, 0x2178513a
296+
.long 0x47972100, 0x45cddf4e, 0xe0ac139e
297+
.long 0x359674f7, 0xa51b6135, 0x170076fa
298+
299+
.L1: .long 0xaf449247, 0x81256527, 0xccaa009e
300+
.long 0x57c54819, 0x1d9513d7, 0x81256527
301+
.long 0x3f41287a, 0x57c54819, 0xaf449247
302+
.long 0xf5e48c85, 0x910eeec1, 0x1d9513d7
303+
.long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394
304+
.long 0x71d54a59, 0xf5e48c85, 0x57c54819
305+
.long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed
306+
.long 0xd31343ea, 0xe95c1271, 0x910eeec1
307+
.long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a
308+
.long 0x9ee62949, 0xcec97417, 0x9026d5b1
309+
.long 0xa55d1514, 0xf183c71b, 0xd1df2327
310+
.long 0x21aa2b26, 0xd31343ea, 0xf5e48c85
311+
.long 0x9d842b80, 0xeea395c4, 0x3c656ced
312+
.long 0xd8110ff1, 0xcd669a40, 0xfe807bbd
313+
.long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd
314+
.long 0x1d6708a0, 0x0c30f51d, 0xe95c1271
315+
.long 0xef82aa68, 0xdb3935ea, 0xb918a347
316+
.long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59
317+
.long 0x99cce860, 0x356d209f, 0xff6f2fc2
318+
.long 0xd8af8e46, 0xc352f6de, 0xcec97417
319+
.long 0xf1996890, 0xd8110ff1, 0x1c63267b
320+
.long 0x631bc508, 0xe95c7216, 0xf183c71b
321+
.long 0x8511c306, 0x8e031a19, 0x9b9bdbd0
322+
.long 0xdb3839f3, 0x1d6708a0, 0xd31343ea
323+
.long 0x7a92fffb, 0xf7003835, 0x4470ac44
324+
.long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4
325+
.long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee
326+
.long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40
327+
.long 0x60290934, 0x81b6f443, 0x6d40f445
328+
.long 0x8e976a7d, 0xd8af8e46, 0x9ee62949
329+
.long 0xdcf5088a, 0x9dbdc100, 0x145575d5
330+
.long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d
331+
.long 0x255b139e, 0x631bc508, 0xa55d1514
332+
.long 0xd784eaa8, 0xce26786c, 0xdb3935ea
333+
.long 0x6d2c864a, 0x8068c345, 0x2586d334
334+
.long 0x02072e24, 0xdb3839f3, 0x21aa2b26
335+
.long 0x06689b0a, 0x5efd72f5, 0xe0575528
336+
.long 0x1e52f5ea, 0x4117915b, 0x356d209f
337+
.long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80
338+
.long 0x3796455c, 0xb8e0e4a8, 0xc352f6de
339+
.long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c
340+
.long 0x28ae0976, 0xb46f7cff, 0xd8110ff1
341+
.long 0x9764bc8d, 0xd7e7a22c, 0x712510f0
342+
.long 0x13a13e18, 0x3e9a43cd, 0xe95c7216
343+
.long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356
344+
.long 0x0c540e7b, 0x753c81ff, 0x8e031a19
345+
.long 0x9924c781, 0xb9220208, 0x3edcde65
346+
.long 0x3954de39, 0x1753ab84, 0x1d6708a0
347+
.long 0xf32238b5, 0xbec81497, 0x9e70b943
348+
.long 0xbbd2cd2c, 0x0925d861, 0xf7003835
349+
.long 0xcc401304, 0xd784eaa8, 0xef82aa68
350+
.long 0x4987e684, 0x6044fbb0, 0x00eba0c8
351+
.long 0x3aa11427, 0x18fe3b4a, 0x87441142
352+
.long 0x297aad60, 0x02072e24, 0xd14bcc9b
353+
.long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a
354+
.long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8
355+
.long 0x25b8822a, 0x1e52f5ea, 0x99cce860
356+
.long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443
357+
.long 0x5690aa32, 0xa91fdefb, 0x688a110e
358+
.long 0x1357a093, 0x3796455c, 0xd8af8e46
359+
.long 0x798fdd33, 0xaaa18a37, 0x357b9517
360+
.long 0xc2815395, 0x54d42691, 0x9dbdc100
361+
.long 0x21cfc0f7, 0x28ae0976, 0xf1996890
362+
.long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6

0 commit comments

Comments
 (0)