Skip to content

Commit e7c1d1c

Browse files
ardbiesheuvel authored and herbertx committed
crypto: arm/crct10dif - Implement plain NEON variant
The CRC-T10DIF algorithm produces a 16-bit CRC, and this is reflected in the folding coefficients, which are also only 16 bits wide.

This means that the polynomial multiplications involving these coefficients can be performed using 8-bit long polynomial multiplication (8x8 -> 16) in only a few steps, and this is an instruction that is part of the base NEON ISA, which is all that most real ARMv7 cores implement. (The 64-bit PMULL instruction is part of the crypto extensions, which are only implemented by 64-bit cores.)

The final reduction is a bit more involved, but we can delegate that to the generic CRC-T10DIF implementation after folding the entire input into a 16-byte vector.

This results in a speedup of around 6.6x on Cortex-A72 running in 32-bit mode. On Cortex-A8 (BeagleBone White), the results are substantially better than that, but not sufficiently reproducible (with tcrypt) to quote a number here.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
1 parent 802d8d1 commit e7c1d1c

File tree

2 files changed

+134
-9
lines changed

2 files changed

+134
-9
lines changed

arch/arm/crypto/crct10dif-ce-core.S

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,82 @@
112112
FOLD_CONST_L .req q10l
113113
FOLD_CONST_H .req q10h
114114

115+
/*
116+
* Pairwise long polynomial multiplication of two 16-bit values
117+
*
118+
* { w0, w1 }, { y0, y1 }
119+
*
120+
* by two 64-bit values
121+
*
122+
* { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
123+
*
124+
* where each vector element is a byte, ordered from least to most
125+
* significant. The resulting 80-bit vectors are XOR'ed together.
126+
*
127+
* This can be implemented using 8x8 long polynomial multiplication, by
128+
* reorganizing the input so that each pairwise 8x8 multiplication
129+
* produces one of the terms from the decomposition below, and
130+
* combining the results of each rank and shifting them into place.
131+
*
132+
* Rank
133+
* 0 w0*x0 ^ | y0*z0 ^
134+
* 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
135+
* 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
136+
* 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
137+
* 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
138+
* 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
139+
* 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
140+
* 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
141+
* 8 w1*x7 << 64 | y1*z7 << 64
142+
*
143+
* The inputs can be reorganized into
144+
*
145+
* { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
146+
* { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
147+
*
148+
* and after performing 8x8->16 bit long polynomial multiplication of
149+
* each of the halves of the first vector with those of the second one,
150+
* we obtain the following four vectors of 16-bit elements:
151+
*
152+
* a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
153+
* b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
154+
* c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
155+
* d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
156+
*
157+
* Results b and c can be XORed together, as the vector elements have
158+
* matching ranks. Then, the final XOR can be pulled forward, and
159+
* applied between the halves of each of the remaining three vectors,
160+
* which are then shifted into place, and XORed together to produce the
161+
* final 80-bit result.
162+
*/
163+
// pmull16x64_p8 v16, v64
//
// Multiply the pair of 16-bit values selected from \v16 by the two 64-bit
// halves of \v64 using only 8x8->16 polynomial multiplies, and leave the
// XOR of the two 80-bit products in \v64.
//
// Expects r4 to hold the address of .L16x64perm (16-byte aligned).
// Uses q11-q15 as scratch and overwrites lr via 'bl'.
.macro pmull16x64_p8, v16, v64
164+
// q11 = \v64 rotated by one byte, so that the unzip below separates the
// odd-numbered bytes from the even-numbered ones.
vext.8 q11, \v64, \v64, #1
165+
// q12 = byte permutation indices { 0,0,0,0,8,8,8,8 }, { 1,1,1,1,9,9,9,9 }
vld1.64 {q12}, [r4, :128]
166+
// After the unzip, d22 holds the odd bytes { x1, x3, x5, x7, z1, ... } and
// d23 the even bytes { x0, x2, x4, x6, z0, ... } of the original \v64.
vuzp.8 q11, \v64
167+
// d24 = { w0, w0, w0, w0, y0, y0, y0, y0 }: low byte of each 16-bit value
vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
168+
// d25 = { w1, w1, w1, w1, y1, y1, y1, y1 }: high byte of each 16-bit value
vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
169+
// Shared out-of-line helper performs the four multiplies and the shifts.
bl __pmull16x64_p8
170+
// Combine the two partial results returned in q12 and q14.
veor \v64, q12, q14
171+
.endm
172+
173+
// Out-of-line body of the pmull16x64_p8 macro.
//
// Input:  d22 = odd bytes, d23 = even bytes of the 64-bit operands,
//         d24 = replicated low bytes, d25 = replicated high bytes of the
//         16-bit operands (as set up by the pmull16x64_p8 macro).
// Output: q12 and q14, which the caller XORs together.
// Clobbers q13 and q15. Vector names a..d refer to the description that
// precedes the pmull16x64_p8 macro.
__pmull16x64_p8:
174+
// q13 = a (even bytes x low byte)
vmull.p8 q13, d23, d24
175+
// q14 = c (even bytes x high byte)
vmull.p8 q14, d23, d25
176+
// q15 = b (odd bytes x low byte)
vmull.p8 q15, d22, d24
177+
// q12 = d (odd bytes x high byte); overwrites the d24/d25 inputs last
vmull.p8 q12, d22, d25
178+
179+
// b and c have matching ranks, so merge them first.
veor q14, q14, q15
180+
// Pull the final XOR forward: fold the upper half of each vector (the
// y*z products) into the lower half (the w*x products).
veor d24, d24, d25
181+
veor d26, d26, d27
182+
veor d28, d28, d29
183+
// Clear the upper halves so the byte rotations below shift in zeroes.
vmov.i32 d25, #0
184+
vmov.i32 d29, #0
185+
// d: move up by two bytes (its terms start at rank 2, i.e. << 16).
vext.8 q12, q12, q12, #14
186+
// b ^ c: move up by one byte (its terms start at rank 1, i.e. << 8).
vext.8 q14, q14, q14, #15
187+
// a needs no shift; merge it into the low half of the shifted d.
veor d24, d24, d26
188+
bx lr
189+
ENDPROC(__pmull16x64_p8)
190+
115191
.macro pmull16x64_p64, v16, v64
116192
vmull.p64 q11, \v64\()l, \v16\()_L
117193
vmull.p64 \v64, \v64\()h, \v16\()_H
@@ -249,9 +325,9 @@ CPU_LE( vrev64.8 q0, q0 )
249325
vswp q0l, q0h
250326

251327
// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
252-
mov_l r3, .Lbyteshift_table + 16
253-
sub r3, r3, len
254-
vld1.8 {q2}, [r3]
328+
mov_l r1, .Lbyteshift_table + 16
329+
sub r1, r1, len
330+
vld1.8 {q2}, [r1]
255331
vtbl.8 q1l, {q7l-q7h}, q2l
256332
vtbl.8 q1h, {q7l-q7h}, q2h
257333

@@ -341,9 +417,20 @@ ENTRY(crc_t10dif_pmull64)
341417

342418
vmov.u16 r0, q0l[0]
343419
bx lr
344-
345420
ENDPROC(crc_t10dif_pmull64)
346421

422+
// void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
//			  u8 out[16])
//
// Plain NEON variant: folds the input using 8-bit polynomial multiplies
// and stores a 16-byte vector to 'out' instead of returning a CRC; the
// caller finishes the reduction.
// NOTE(review): r3 is presumably the 'out' argument per the C prototype's
// fourth parameter - confirm against the calling convention in use.
ENTRY(crc_t10dif_pmull8)
423+
// r4 is used by the pmull16x64_p8 macro; lr is overwritten by its 'bl'.
push {r4, lr}
424+
// Address of the byte permutation table loaded by pmull16x64_p8.
mov_l r4, .L16x64perm
425+
426+
// Expand the shared folding code with the p8 multiply variant.
crct10dif p8
427+
428+
// Reverse the byte order of q7 before storing it (the per-doubleword
// reversal is applied on little-endian builds only).
CPU_LE( vrev64.8 q7, q7 )
429+
vswp q7l, q7h
430+
// The :128 qualifier requires the output buffer to be 16-byte aligned.
vst1.64 {q7}, [r3, :128]
431+
pop {r4, pc}
432+
ENDPROC(crc_t10dif_pmull8)
433+
347434
.section ".rodata", "a"
348435
.align 4
349436

@@ -376,3 +463,6 @@ ENDPROC(crc_t10dif_pmull64)
376463
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
377464
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
378465
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
466+
467+
.L16x64perm:
468+
.quad 0x808080800000000, 0x909090901010101

arch/arm/crypto/crct10dif-ce-glue.c

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
2121

2222
asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
23+
asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
24+
u8 out[16]);
2325

2426
static int crct10dif_init(struct shash_desc *desc)
2527
{
@@ -45,6 +47,27 @@ static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
4547
return 0;
4648
}
4749

50+
/*
 * shash .update handler for the plain NEON driver.
 *
 * @desc:   hash descriptor whose context holds the running 16-bit CRC
 * @data:   input bytes
 * @length: number of input bytes
 *
 * Inputs longer than CRC_T10DIF_PMULL_CHUNK_SIZE are first folded into a
 * 16-byte buffer by crc_t10dif_pmull8(), provided crypto_simd_usable()
 * returns true; the generic implementation then reduces whatever is left.
 *
 * Always returns 0.
 */
static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data,
51+
unsigned int length)
52+
{
53+
u16 *crcp = shash_desc_ctx(desc);
54+
/* 16-byte aligned: the assembly stores to it with a :128 qualifier */
u8 buf[16] __aligned(16);
55+
u16 crc = *crcp;
56+
57+
if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
58+
kernel_neon_begin();
59+
crc_t10dif_pmull8(crc, data, length, buf);
60+
kernel_neon_end();
61+
62+
/*
 * The running CRC was passed to the folding code above, so the
 * generic pass over the folded buffer starts from zero.
 */
crc = 0;
63+
data = buf;
64+
length = sizeof(buf);
65+
}
66+
67+
/* Final reduction, or the whole computation when NEON was not used */
*crcp = crc_t10dif_generic(crc, data, length);
68+
return 0;
69+
}
70+
4871
static int crct10dif_final(struct shash_desc *desc, u8 *out)
4972
{
5073
u16 *crc = shash_desc_ctx(desc);
@@ -53,7 +76,19 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
5376
return 0;
5477
}
5578

56-
static struct shash_alg crc_t10dif_alg = {
79+
static struct shash_alg algs[] = {{
80+
.digestsize = CRC_T10DIF_DIGEST_SIZE,
81+
.init = crct10dif_init,
82+
.update = crct10dif_update_neon,
83+
.final = crct10dif_final,
84+
.descsize = CRC_T10DIF_DIGEST_SIZE,
85+
86+
.base.cra_name = "crct10dif",
87+
.base.cra_driver_name = "crct10dif-arm-neon",
88+
.base.cra_priority = 150,
89+
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
90+
.base.cra_module = THIS_MODULE,
91+
}, {
5792
.digestsize = CRC_T10DIF_DIGEST_SIZE,
5893
.init = crct10dif_init,
5994
.update = crct10dif_update_ce,
@@ -65,19 +100,19 @@ static struct shash_alg crc_t10dif_alg = {
65100
.base.cra_priority = 200,
66101
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
67102
.base.cra_module = THIS_MODULE,
68-
};
103+
}};
69104

70105
/*
 * Module init. After this change the module loads whenever NEON is
 * present: algs[0] (the plain NEON driver) is always registered, and
 * algs[1] (the PMULL driver) only when HWCAP2_PMULL is set.
 * Returns -ENODEV without NEON, otherwise the registration result.
 */
static int __init crc_t10dif_mod_init(void)
71106
{
72-
if (!(elf_hwcap2 & HWCAP2_PMULL))
107+
if (!(elf_hwcap & HWCAP_NEON))
73108
return -ENODEV;
74109

75-
return crypto_register_shash(&crc_t10dif_alg);
110+
return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
76111
}
77112

78113
/*
 * Module exit. Unregisters the same number of algs[] entries that
 * crc_t10dif_mod_init() registered: one, plus one more when
 * HWCAP2_PMULL is set.
 */
static void __exit crc_t10dif_mod_exit(void)
79114
{
80-
crypto_unregister_shash(&crc_t10dif_alg);
115+
crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
81116
}
82117

83118
module_init(crc_t10dif_mod_init);

0 commit comments

Comments
 (0)