Commit c640868

Merge patch series "riscv: Add fine-tuned checksum functions"
Charlie Jenkins <charlie@rivosinc.com> says:

Each architecture generally implements fine-tuned checksum functions to
leverage the instruction set. This patch adds the main checksum functions
that are used in networking.

Tested on QEMU, this series allows the CHECKSUM_KUNIT tests to complete an
average of 50.9% faster.

This patch makes heavy use of the Zbb extension, applied through
alternatives patching.

To test this patch, enable the configs for KUNIT, then CHECKSUM_KUNIT.

I have attempted to make these functions as optimal as possible, but I have
not run anything on actual riscv hardware. My performance testing has been
limited to inspecting the assembly, running the algorithms on x86 hardware,
and running in QEMU.

ip_fast_csum is a relatively small function, so even though it is possible
to read 64 bits at a time on compatible hardware, the bottleneck becomes
the cleanup and setup code; loading 32 bits at a time is actually faster.

* b4-shazam-merge:
  kunit: Add tests for csum_ipv6_magic and ip_fast_csum
  riscv: Add checksum library
  riscv: Add checksum header
  riscv: Add static key for misaligned accesses
  asm-generic: Improve csum_fold

Link: https://lore.kernel.org/r/20240108-optimize_checksum-v15-0-1c50de5f2167@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
2 parents 0de6528 + 6f4c45c commit c640868
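As a quick aside on the test instructions above: the named options map to a
small KUnit config fragment. A minimal sketch, assuming the in-tree
CONFIG_KUNIT and CONFIG_CHECKSUM_KUNIT symbols (the fragment itself is not
part of this commit):

CONFIG_KUNIT=y
CONFIG_CHECKSUM_KUNIT=y

Such a fragment can be merged into a riscv defconfig, or handed to the KUnit
tooling (tools/testing/kunit/kunit.py) as a kunitconfig, before booting the
resulting kernel under QEMU as the cover letter describes.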

7 files changed, 796 insertions(+), 8 deletions(-)

arch/riscv/include/asm/checksum.h

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Checksum routines
+ *
+ * Copyright (C) 2023 Rivos Inc.
+ */
+#ifndef __ASM_RISCV_CHECKSUM_H
+#define __ASM_RISCV_CHECKSUM_H
+
+#include <linux/in6.h>
+#include <linux/uaccess.h>
+
+#define ip_fast_csum ip_fast_csum
+
+extern unsigned int do_csum(const unsigned char *buff, int len);
+#define do_csum do_csum
+
+/* Default version is sufficient for 32 bit */
+#ifndef CONFIG_32BIT
+#define _HAVE_ARCH_IPV6_CSUM
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum sum);
+#endif
+
+/* Define riscv versions of functions before importing asm-generic/checksum.h */
+#include <asm-generic/checksum.h>
+
+/**
+ * Quickly compute an IP checksum with the assumption that IPv4 headers will
+ * always be in multiples of 32-bits, and have an ihl of at least 5.
+ *
+ * @ihl: the number of 32 bit segments and must be greater than or equal to 5.
+ * @iph: assumed to be word aligned given that NET_IP_ALIGN is set to 2 on
+ *	 riscv, defining IP headers to be aligned.
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+	unsigned long csum = 0;
+	int pos = 0;
+
+	do {
+		csum += ((const unsigned int *)iph)[pos];
+		if (IS_ENABLED(CONFIG_32BIT))
+			csum += csum < ((const unsigned int *)iph)[pos];
+	} while (++pos < ihl);
+
+	/*
+	 * ZBB only saves three instructions on 32-bit and five on 64-bit so not
+	 * worth checking if supported without Alternatives.
+	 */
+	if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) &&
+	    IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		unsigned long fold_temp;
+
+		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+					      RISCV_ISA_EXT_ZBB, 1)
+				  :
+				  :
+				  :
+				  : no_zbb);
+
+		if (IS_ENABLED(CONFIG_32BIT)) {
+			asm(".option push				\n\
+			.option arch,+zbb				\n\
+				not	%[fold_temp], %[csum]		\n\
+				rori	%[csum], %[csum], 16		\n\
+				sub	%[csum], %[fold_temp], %[csum]	\n\
+			.option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp));
+		} else {
+			asm(".option push				\n\
+			.option arch,+zbb				\n\
+				rori	%[fold_temp], %[csum], 32	\n\
+				add	%[csum], %[fold_temp], %[csum]	\n\
+				srli	%[csum], %[csum], 32		\n\
+				not	%[fold_temp], %[csum]		\n\
+				roriw	%[csum], %[csum], 16		\n\
+				subw	%[csum], %[fold_temp], %[csum]	\n\
+			.option pop"
+			: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp));
+		}
+		return (__force __sum16)(csum >> 16);
+	}
+no_zbb:
+#ifndef CONFIG_32BIT
+	csum += ror64(csum, 32);
+	csum >>= 32;
+#endif
+	return csum_fold((__force __wsum)csum);
+}
+
+#endif /* __ASM_RISCV_CHECKSUM_H */
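A brief aside on the arithmetic in ip_fast_csum() above: the loop accumulates
32-bit words with an end-around carry (on 32-bit kernels), and the Zbb
not/rori/sub sequence is just a shorter way of folding and complementing the
result. A host-testable sketch of the same arithmetic in plain C follows; it
is an illustrative model only, not kernel code, and all names are invented
here:

#include <stdint.h>

/* Rotate right by r (1..31), as the rori instruction does above. */
static uint32_t model_ror32(uint32_t x, unsigned int r)
{
	return (x >> r) | (x << (32 - r));
}

/* End-around-carry sum of ihl 32-bit words, mirroring the do/while loop. */
static uint32_t model_sum32(const uint32_t *words, unsigned int ihl)
{
	uint32_t sum = 0;

	for (unsigned int i = 0; i < ihl; i++) {
		sum += words[i];
		sum += (sum < words[i]);	/* overflow: fold the carry back in */
	}
	return sum;
}

/*
 * Equivalent of the 32-bit Zbb sequence: ~sum - ror32(sum, 16) leaves the
 * complemented ones'-complement fold of the two 16-bit halves in bits 31:16,
 * so shifting right by 16 yields the final checksum.
 */
static uint16_t model_fold(uint32_t sum)
{
	return (uint16_t)((~sum - model_ror32(sum, 16)) >> 16);
}

The 64-bit path does the same thing one level up: rori/add/srli first folds
the 64-bit accumulator down to 32 bits (the same ror64-and-shift step the
non-Zbb fallback uses), then roriw/subw performs the 16-bit fold shown in the
model.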

arch/riscv/include/asm/cpufeature.h

Lines changed: 2 additions & 0 deletions
@@ -135,4 +135,6 @@ static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsi
 	return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
 }
 
+DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
+
 #endif
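For context, here is a hedged sketch of how a consumer of this declaration
might look; no such caller is part of this hunk, and the function name is
invented. Any file that includes <asm/cpufeature.h> can branch on the key,
which the cpufeature.c changes below enable only once every CPU has been
probed as fast:

#include <linux/jump_label.h>
#include <asm/cpufeature.h>

/* Hypothetical caller: take the misalignment-tolerant fast path only when
 * the key is enabled, i.e. every online CPU reported "fast" probing. */
static bool example_misaligned_fast_path_ok(void)
{
	/* static_branch_unlikely() would also work; the choice only biases
	 * code layout for the expected case. */
	return static_branch_likely(&fast_misaligned_access_speed_key);
}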

arch/riscv/kernel/cpufeature.c

Lines changed: 87 additions & 3 deletions
@@ -8,8 +8,10 @@
 
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
+#include <linux/cpu.h>
 #include <linux/cpuhotplug.h>
 #include <linux/ctype.h>
+#include <linux/jump_label.h>
 #include <linux/log2.h>
 #include <linux/memory.h>
 #include <linux/module.h>
@@ -44,6 +46,8 @@ struct riscv_isainfo hart_isa[NR_CPUS];
 /* Performance information */
 DEFINE_PER_CPU(long, misaligned_access_speed);
 
+static cpumask_t fast_misaligned_access;
+
 /**
  * riscv_isa_extension_base() - Get base extension word
  *
@@ -784,6 +788,16 @@ static int check_unaligned_access(void *param)
 		(speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
 
 	per_cpu(misaligned_access_speed, cpu) = speed;
+
+	/*
+	 * Set the value of fast_misaligned_access of a CPU. These operations
+	 * are atomic to avoid race conditions.
+	 */
+	if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
+		cpumask_set_cpu(cpu, &fast_misaligned_access);
+	else
+		cpumask_clear_cpu(cpu, &fast_misaligned_access);
+
 	return 0;
 }
@@ -796,13 +810,69 @@ static void check_unaligned_access_nonboot_cpu(void *param)
 		check_unaligned_access(pages[cpu]);
 }
 
+DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
+
+static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
+{
+	if (cpumask_weight(mask) == weight)
+		static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
+	else
+		static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
+}
+
+static void set_unaligned_access_static_branches_except_cpu(int cpu)
+{
+	/*
+	 * Same as set_unaligned_access_static_branches, except excludes the
+	 * given CPU from the result. When a CPU is hotplugged into an offline
+	 * state, this function is called before the CPU is set to offline in
+	 * the cpumask, and thus the CPU needs to be explicitly excluded.
+	 */
+
+	cpumask_t fast_except_me;
+
+	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
+	cpumask_clear_cpu(cpu, &fast_except_me);
+
+	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
+}
+
+static void set_unaligned_access_static_branches(void)
+{
+	/*
+	 * This will be called after check_unaligned_access_all_cpus so the
+	 * result of unaligned access speed for all CPUs will be available.
+	 *
+	 * To avoid the number of online cpus changing between reading
+	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
+	 * held before calling this function.
+	 */
+
+	cpumask_t fast_and_online;
+
+	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
+
+	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
+}
+
+static int lock_and_set_unaligned_access_static_branch(void)
+{
+	cpus_read_lock();
+	set_unaligned_access_static_branches();
+	cpus_read_unlock();
+
+	return 0;
+}
+
+arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
+
 static int riscv_online_cpu(unsigned int cpu)
 {
 	static struct page *buf;
 
 	/* We are already set since the last check */
 	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
-		return 0;
+		goto exit;
 
 	buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
 	if (!buf) {
@@ -812,6 +882,17 @@ static int riscv_online_cpu(unsigned int cpu)
 
 	check_unaligned_access(buf);
 	__free_pages(buf, MISALIGNED_BUFFER_ORDER);
+
+exit:
+	set_unaligned_access_static_branches();
+
+	return 0;
+}
+
+static int riscv_offline_cpu(unsigned int cpu)
+{
+	set_unaligned_access_static_branches_except_cpu(cpu);
+
 	return 0;
 }
 
@@ -846,9 +927,12 @@ static int check_unaligned_access_all_cpus(void)
 	/* Check core 0. */
 	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
 
-	/* Setup hotplug callback for any new CPUs that come online. */
+	/*
+	 * Setup hotplug callbacks for any new CPUs that come online or go
+	 * offline.
+	 */
 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
-				  riscv_online_cpu, NULL);
+				  riscv_online_cpu, riscv_offline_cpu);
 
 out:
 	unaligned_emulation_finish();
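The policy implemented above is all-or-nothing, and the offline callback has
to account for the outgoing CPU still being present in cpu_online_mask when
it runs. A minimal host-side model of that decision (plain C, not kernel
code, names invented here):

#include <stdbool.h>

/* Model of modify_unaligned_access_branches(): enable the key only when
 * every CPU that should be counted is in the "fast" set. */
static bool model_key_enabled(unsigned int fast_cpus, unsigned int expected)
{
	return fast_cpus == expected;
}

/* Model of the offline path: drop the dying CPU from the fast set and
 * expect one fewer CPU, since cpu_online_mask still contains it. */
static bool model_key_after_offline(unsigned int fast_online_minus_dying,
				    unsigned int num_online_before_removal)
{
	return model_key_enabled(fast_online_minus_dying,
				 num_online_before_removal - 1);
}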

arch/riscv/lib/Makefile

Lines changed: 2 additions & 1 deletion
@@ -6,10 +6,11 @@ lib-y += memmove.o
 lib-y	+= strcmp.o
 lib-y	+= strlen.o
 lib-y	+= strncmp.o
+lib-y	+= csum.o
 ifeq ($(CONFIG_MMU), y)
-lib-y	+= uaccess.o
 lib-$(CONFIG_RISCV_ISA_V)	+= uaccess_vector.o
 endif
+lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ)	+= clear_page.o
