
Commit a04c192

charlie-rivos authored and palmer-dabbelt committed
riscv: Add checksum library
Provide a 32 and 64 bit version of do_csum. When compiled for 32-bit, it will load from the buffer in groups of 32 bits; when compiled for 64-bit, it will load in groups of 64 bits. Additionally, provide a riscv-optimized implementation of csum_ipv6_magic.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Xiao Wang <xiao.w.wang@intel.com>
Link: https://lore.kernel.org/r/20240108-optimize_checksum-v15-4-1c50de5f2167@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
1 parent e11e367 commit a04c192
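As a rough illustration of the word-at-a-time approach the commit message describes, here is a minimal userspace sketch (not the kernel code; the name wide_sum is made up for the example): the buffer is accumulated one unsigned long at a time with the carries tracked separately, so the same loop works whether unsigned long is 32 or 64 bits wide. The kernel's do_csum_common() below follows the same idea.

    #include <stddef.h>

    /* Hypothetical reference model of a word-at-a-time ones' complement
     * accumulator. */
    static unsigned long wide_sum(const unsigned long *p, size_t nwords)
    {
    	unsigned long sum = 0, carry = 0;
    	size_t i;

    	for (i = 0; i < nwords; i++) {
    		sum += p[i];
    		carry += sum < p[i];	/* detect unsigned wrap-around */
    	}
    	sum += carry;
    	sum += sum < carry;		/* fold in the carry of the carry add */
    	return sum;
    }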

File tree

3 files changed: +338 −0 lines changed

arch/riscv/include/asm/checksum.h
arch/riscv/lib/Makefile
arch/riscv/lib/csum.c


arch/riscv/include/asm/checksum.h

Lines changed: 11 additions & 0 deletions
@@ -12,6 +12,17 @@
 
 #define ip_fast_csum ip_fast_csum
 
+extern unsigned int do_csum(const unsigned char *buff, int len);
+#define do_csum do_csum
+
+/* Default version is sufficient for 32 bit */
+#ifndef CONFIG_32BIT
+#define _HAVE_ARCH_IPV6_CSUM
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum sum);
+#endif
+
 /* Define riscv versions of functions before importing asm-generic/checksum.h */
 #include <asm-generic/checksum.h>
 
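The `#define do_csum do_csum` line is what lets the generic checksum code see that an architecture override exists: the portable fallback is only compiled when the macro is not defined (the generic do_csum in lib/checksum.c is guarded this way). A schematic sketch of the pattern, with hypothetical names rather than the real kernel macros:

    /* Architecture header: declare the optimized helper and mark it present. */
    extern unsigned int my_csum(const unsigned char *buff, int len);
    #define my_csum my_csum

    /* Generic side: the portable fallback is skipped when the arch
     * already provided one. */
    #ifndef my_csum
    static inline unsigned int my_csum(const unsigned char *buff, int len)
    {
    	/* ... portable byte-at-a-time implementation ... */
    	return 0;
    }
    #endif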

arch/riscv/lib/Makefile

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ lib-y += memmove.o
 lib-y		+= strcmp.o
 lib-y		+= strlen.o
 lib-y		+= strncmp.o
+lib-y		+= csum.o
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ)	+= clear_page.o

arch/riscv/lib/csum.c

Lines changed: 326 additions & 0 deletions
@@ -0,0 +1,326 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checksum library
+ *
+ * Influenced by arch/arm64/lib/csum.c
+ * Copyright (C) 2023 Rivos Inc.
+ */
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/jump_label.h>
+#include <linux/kasan-checks.h>
+#include <linux/kernel.h>
+
+#include <asm/cpufeature.h>
+
+#include <net/checksum.h>
+
+/* Default version is sufficient for 32 bit */
+#ifndef CONFIG_32BIT
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum csum)
+{
+	unsigned int ulen, uproto;
+	unsigned long sum = (__force unsigned long)csum;
+
+	sum += (__force unsigned long)saddr->s6_addr32[0];
+	sum += (__force unsigned long)saddr->s6_addr32[1];
+	sum += (__force unsigned long)saddr->s6_addr32[2];
+	sum += (__force unsigned long)saddr->s6_addr32[3];
+
+	sum += (__force unsigned long)daddr->s6_addr32[0];
+	sum += (__force unsigned long)daddr->s6_addr32[1];
+	sum += (__force unsigned long)daddr->s6_addr32[2];
+	sum += (__force unsigned long)daddr->s6_addr32[3];
+
+	ulen = (__force unsigned int)htonl((unsigned int)len);
+	sum += ulen;
+
+	uproto = (__force unsigned int)htonl(proto);
+	sum += uproto;
+
+	/*
+	 * Zbb support saves 4 instructions, so not worth checking without
+	 * alternatives if supported
+	 */
+	if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+	    IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		unsigned long fold_temp;
+
+		/*
+		 * Zbb is likely available when the kernel is compiled with Zbb
+		 * support, so nop when Zbb is available and jump when Zbb is
+		 * not available.
+		 */
+		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+					      RISCV_ISA_EXT_ZBB, 1)
+				  :
+				  :
+				  :
+				  : no_zbb);
+		asm(".option push					\n\
+		.option arch,+zbb					\n\
+			rori	%[fold_temp], %[sum], 32		\n\
+			add	%[sum], %[fold_temp], %[sum]		\n\
+			srli	%[sum], %[sum], 32			\n\
+			not	%[fold_temp], %[sum]			\n\
+			roriw	%[sum], %[sum], 16			\n\
+			subw	%[sum], %[fold_temp], %[sum]		\n\
+		.option pop"
+		: [sum] "+r" (sum), [fold_temp] "=&r" (fold_temp));
+		return (__force __sum16)(sum >> 16);
+	}
+no_zbb:
+	sum += ror64(sum, 32);
+	sum >>= 32;
+	return csum_fold((__force __wsum)sum);
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
+#endif /* !CONFIG_32BIT */
+
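For reference, the Zbb sequence and the no_zbb fallback above compute the same fold. A plain C sketch of that arithmetic (a model for illustration only, not kernel code): fold 64 bits to 32 with an end-around carry (the rori 32 / add / srli 32 steps), then fold 32 bits to 16 using the ~x - ror32(x, 16) identity that the not / roriw / subw steps implement.

    #include <stdint.h>

    /* Fold a 64-bit ones' complement accumulator to the final 16-bit value. */
    static uint16_t fold64_to_16(uint64_t sum)
    {
    	uint32_t s;

    	sum += (sum >> 32) | (sum << 32);	/* 64-bit rotate + add */
    	s = sum >> 32;				/* carries land in the top half */
    	return (uint16_t)((~s - ((s >> 16) | (s << 16))) >> 16);
    }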
+#ifdef CONFIG_32BIT
+#define OFFSET_MASK 3
+#elif CONFIG_64BIT
+#define OFFSET_MASK 7
+#endif
+
+static inline __no_sanitize_address unsigned long
+do_csum_common(const unsigned long *ptr, const unsigned long *end,
+	       unsigned long data)
+{
+	unsigned int shift;
+	unsigned long csum = 0, carry = 0;
+
+	/*
+	 * Do 32-bit reads on RV32 and 64-bit reads otherwise. This should be
+	 * faster than doing 32-bit reads on architectures that support larger
+	 * reads.
+	 */
+	while (ptr < end) {
+		csum += data;
+		carry += csum < data;
+		data = *(ptr++);
+	}
+
+	/*
+	 * Perform alignment (and over-read) bytes on the tail if any bytes
+	 * leftover.
+	 */
+	shift = ((long)ptr - (long)end) * 8;
+#ifdef __LITTLE_ENDIAN
+	data = (data << shift) >> shift;
+#else
+	data = (data >> shift) << shift;
+#endif
+	csum += data;
+	carry += csum < data;
+	csum += carry;
+	csum += csum < carry;
+
+	return csum;
+}
+
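To make the tail handling above concrete, a small worked example with hypothetical values, assuming a little-endian 64-bit machine: if the final 8-byte load reached three bytes past end, those bytes occupy the most significant byte positions of data, and the shift pair clears exactly those.

    #include <stdio.h>

    int main(void)
    {
    	/* Hypothetical final load that read 3 bytes past 'end'; the
    	 * shift pair below clears them, matching do_csum_common(). */
    	unsigned long data = 0x1122334455667788UL;
    	unsigned int shift = 3 * 8;

    	data = (data << shift) >> shift;
    	printf("0x%lx\n", data);	/* prints 0x4455667788 */
    	return 0;
    }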
+/*
+ * Algorithm accounts for buff being misaligned.
+ * If buff is not aligned, will over-read bytes but not use the bytes that it
+ * shouldn't. The same thing will occur on the tail-end of the read.
+ */
+static inline __no_sanitize_address unsigned int
+do_csum_with_alignment(const unsigned char *buff, int len)
+{
+	unsigned int offset, shift;
+	unsigned long csum, data;
+	const unsigned long *ptr, *end;
+
+	/*
+	 * Align address to closest word (double word on rv64) that comes before
+	 * buff. This should always be in the same page and cache line.
+	 * Directly call KASAN with the alignment we will be using.
+	 */
+	offset = (unsigned long)buff & OFFSET_MASK;
+	kasan_check_read(buff, len);
+	ptr = (const unsigned long *)(buff - offset);
+
+	/*
+	 * Clear the most significant bytes that were over-read if buff was not
+	 * aligned.
+	 */
+	shift = offset * 8;
+	data = *(ptr++);
+#ifdef __LITTLE_ENDIAN
+	data = (data >> shift) << shift;
+#else
+	data = (data << shift) >> shift;
+#endif
+	end = (const unsigned long *)(buff + len);
+	csum = do_csum_common(ptr, end, data);
+
+	/*
+	 * Zbb support saves 6 instructions, so not worth checking without
+	 * alternatives if supported
+	 */
+	if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+	    IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		unsigned long fold_temp;
+
+		/*
+		 * Zbb is likely available when the kernel is compiled with Zbb
+		 * support, so nop when Zbb is available and jump when Zbb is
+		 * not available.
+		 */
+		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+					      RISCV_ISA_EXT_ZBB, 1)
+				  :
+				  :
+				  :
+				  : no_zbb);
+
+#ifdef CONFIG_32BIT
+		asm_volatile_goto(".option push			\n\
+		.option arch,+zbb				\n\
+			rori	%[fold_temp], %[csum], 16	\n\
+			andi	%[offset], %[offset], 1		\n\
+			add	%[csum], %[fold_temp], %[csum]	\n\
+			beq	%[offset], zero, %l[end]	\n\
+			rev8	%[csum], %[csum]		\n\
+		.option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+			: [offset] "r" (offset)
+			:
+			: end);
+
+		return (unsigned short)csum;
+#else /* !CONFIG_32BIT */
+		asm_volatile_goto(".option push			\n\
+		.option arch,+zbb				\n\
+			rori	%[fold_temp], %[csum], 32	\n\
+			add	%[csum], %[fold_temp], %[csum]	\n\
+			srli	%[csum], %[csum], 32		\n\
+			roriw	%[fold_temp], %[csum], 16	\n\
+			addw	%[csum], %[fold_temp], %[csum]	\n\
+			andi	%[offset], %[offset], 1		\n\
+			beq	%[offset], zero, %l[end]	\n\
+			rev8	%[csum], %[csum]		\n\
+		.option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+			: [offset] "r" (offset)
+			:
+			: end);
+
+		return (csum << 16) >> 48;
+#endif /* !CONFIG_32BIT */
+end:
+		return csum >> 16;
+	}
+no_zbb:
+#ifndef CONFIG_32BIT
+	csum += ror64(csum, 32);
+	csum >>= 32;
+#endif
+	csum = (u32)csum + ror32((u32)csum, 16);
+	if (offset & 1)
+		return (u16)swab32(csum);
+	return csum >> 16;
+}
+
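One subtlety in do_csum_with_alignment() is the offset & 1 case: when buff starts on an odd address, summing from the aligned word below it places every byte in the opposite half of its 16-bit lane, so the folded result comes out byte-swapped; the rev8/swab32 step undoes that. A sketch of what the fallback's fixup extracts (assuming, as in the code above, that csum already holds the 16-bit fold in its upper half; (u16)swab32(csum) is equivalent to this swab16 of csum >> 16):

    /* Byte-swap the folded 16-bit result for an odd starting offset. */
    static unsigned short odd_offset_fixup(unsigned int csum)
    {
    	unsigned short folded = csum >> 16;

    	return (unsigned short)((folded << 8) | (folded >> 8));
    }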
+/*
+ * Does not perform alignment, should only be used if machine has fast
+ * misaligned accesses, or when buff is known to be aligned.
+ */
+static inline __no_sanitize_address unsigned int
+do_csum_no_alignment(const unsigned char *buff, int len)
+{
+	unsigned long csum, data;
+	const unsigned long *ptr, *end;
+
+	ptr = (const unsigned long *)(buff);
+	data = *(ptr++);
+
+	kasan_check_read(buff, len);
+
+	end = (const unsigned long *)(buff + len);
+	csum = do_csum_common(ptr, end, data);
+
+	/*
+	 * Zbb support saves 6 instructions, so not worth checking without
+	 * alternatives if supported
+	 */
+	if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+	    IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		unsigned long fold_temp;
+
+		/*
+		 * Zbb is likely available when the kernel is compiled with Zbb
+		 * support, so nop when Zbb is available and jump when Zbb is
+		 * not available.
+		 */
+		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+					      RISCV_ISA_EXT_ZBB, 1)
+				  :
+				  :
+				  :
+				  : no_zbb);
+
+#ifdef CONFIG_32BIT
+		asm (".option push				\n\
+		.option arch,+zbb				\n\
+			rori	%[fold_temp], %[csum], 16	\n\
+			add	%[csum], %[fold_temp], %[csum]	\n\
+		.option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+			:
+			: );
+
+#else /* !CONFIG_32BIT */
+		asm (".option push				\n\
+		.option arch,+zbb				\n\
+			rori	%[fold_temp], %[csum], 32	\n\
+			add	%[csum], %[fold_temp], %[csum]	\n\
+			srli	%[csum], %[csum], 32		\n\
+			roriw	%[fold_temp], %[csum], 16	\n\
+			addw	%[csum], %[fold_temp], %[csum]	\n\
+		.option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)
+			:
+			: );
+#endif /* !CONFIG_32BIT */
+		return csum >> 16;
+	}
+no_zbb:
+#ifndef CONFIG_32BIT
+	csum += ror64(csum, 32);
+	csum >>= 32;
+#endif
+	csum = (u32)csum + ror32((u32)csum, 16);
+	return csum >> 16;
+}
+
+/*
+ * Perform a checksum on an arbitrary memory address.
+ * Will do a light-weight address alignment if buff is misaligned, unless
+ * cpu supports fast misaligned accesses.
+ */
+unsigned int do_csum(const unsigned char *buff, int len)
+{
+	if (unlikely(len <= 0))
+		return 0;
+
+	/*
+	 * Significant performance gains can be seen by not doing alignment
+	 * on machines with fast misaligned accesses.
+	 *
+	 * There is some duplicate code between the "with_alignment" and
+	 * "no_alignment" implementations, but the overlap is too awkward to be
+	 * able to fit in one function without introducing multiple static
+	 * branches. The largest chunk of overlap was delegated into the
+	 * do_csum_common function.
+	 */
+	if (static_branch_likely(&fast_misaligned_access_speed_key))
+		return do_csum_no_alignment(buff, len);
+
+	if (((unsigned long)buff & OFFSET_MASK) == 0)
+		return do_csum_no_alignment(buff, len);
+
+	return do_csum_with_alignment(buff, len);
+}
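For testing a do_csum()-style routine, a byte-pair reference such as the following userspace sketch (RFC 1071 style; not kernel code) can be handy. Note that the kernel's do_csum() accumulates in host byte order and leaves the final one's complement to its callers, so a comparison harness has to account for that.

    #include <stddef.h>
    #include <stdint.h>

    /* Naive reference Internet checksum over big-endian 16-bit words,
     * with end-around carry folding and the final complement. */
    static uint16_t ref_inet_csum(const uint8_t *buf, size_t len)
    {
    	uint32_t sum = 0;
    	size_t i;

    	for (i = 0; i + 1 < len; i += 2)
    		sum += ((uint32_t)buf[i] << 8) | buf[i + 1];
    	if (len & 1)
    		sum += (uint32_t)buf[len - 1] << 8;	/* pad the odd byte */
    	while (sum >> 16)
    		sum = (sum & 0xffff) + (sum >> 16);	/* end-around carry */
    	return (uint16_t)~sum;
    }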
