Commit 580253b
Merge patch series "RISC-V: Probe for misaligned access speed"
Evan Green <evan@rivosinc.com> says:

The current setting for the hwprobe bit indicating misaligned access speed
is controlled by a vendor-specific feature probe function. This is
essentially a per-SoC table we have to maintain on behalf of each vendor
going forward. Let's convert that instead to something we detect at
runtime.

We have two assembly routines at the heart of our probe: one that does a
bunch of word-sized accesses (without aligning its input buffer), and the
other that does byte accesses. If we can move a larger number of bytes
using misaligned word accesses than we can with the same amount of time
doing byte accesses, then we can declare misaligned accesses as "fast".

The tradeoff of reducing this maintenance burden is boot time. We spend
4-6 jiffies per core doing this measurement (0-2 on jiffie edge alignment,
and 4 on measurement). The timing loop was based on raid6_choose_gen(),
which uses (16+1)*N jiffies (where N is the number of algorithms). By
taking only the fastest iteration out of all attempts for use in the
comparison, variance between runs is very low. On my THead C906, it looks
like this:

[    0.047563] cpu0: Ratio of byte access time to unaligned word access is 4.34, unaligned accesses are fast

Several others have chimed in with results on slow machines with the older
algorithm, which took all runs into account, including noise like
interrupts. Even with this variation, results indicate that in all cases
(fast, slow, and emulated) the measured numbers are nowhere near each
other (always multiple factors away).

* b4-shazam-merge:
  RISC-V: alternative: Remove feature_probe_func
  RISC-V: Probe for unaligned access speed

Link: https://lore.kernel.org/r/20230818194136.4084400-1-evan@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
2 parents e0152e7 + f2d14bc commit 580253b
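The measurement pattern described in the cover letter boils down to "time a fixed window, keep the fastest run of each routine, compare". The sketch below restates that pattern as self-contained user-space C for illustration only; cycles_now() and best_time_in_window() are stand-ins invented here, not the kernel helpers (the real code in check_unaligned_access() below uses get_cycles64(), jiffies and preempt_disable()).

#include <stdint.h>
#include <time.h>

/* Illustrative stand-in for the kernel's get_cycles64(). */
static uint64_t cycles_now(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Run fn() repeatedly for a fixed window and keep its fastest run. */
static uint64_t best_time_in_window(void (*fn)(void), uint64_t window_ns)
{
	uint64_t best = UINT64_MAX;
	uint64_t deadline = cycles_now() + window_ns;

	while (cycles_now() < deadline) {
		uint64_t start = cycles_now();
		uint64_t elapsed;

		fn();
		elapsed = cycles_now() - start;
		if (elapsed < best)
			best = elapsed;
	}
	return best;
}

Misaligned accesses are then reported as "fast" only when the best run of the word-based copy beats the best run of the byte-based copy; ties and losses stay "slow".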

10 files changed, +198 -39 lines

Documentation/riscv/hwprobe.rst

Lines changed: 5 additions & 6 deletions
@@ -87,13 +87,12 @@ The following keys are defined:
   emulated via software, either in or below the kernel. These accesses are
   always extremely slow.

-* :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned accesses are supported
-  in hardware, but are slower than the corresponding aligned accesses
-  sequences.
+* :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned accesses are slower
+  than equivalent byte accesses. Misaligned accesses may be supported
+  directly in hardware, or trapped and emulated by software.

-* :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned accesses are supported
-  in hardware and are faster than the corresponding aligned accesses
-  sequences.
+* :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned accesses are faster
+  than equivalent byte accesses.

 * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNSUPPORTED`: Misaligned accesses are
   not supported at all and will generate a misaligned address fault.
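The documentation above describes values reported through the riscv_hwprobe() syscall. As a rough illustration (assuming the uapi constants from <asm/hwprobe.h> and the __NR_riscv_hwprobe syscall number available since Linux 6.4, and assuming the 0/NULL cpu-set form queries all online CPUs), a user-space query might look like the sketch below; it is not part of this series.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/hwprobe.h>

int main(void)
{
	struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };

	/* 0/NULL cpu set: ask about all online CPUs at once. */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) != 0) {
		perror("riscv_hwprobe");
		return 1;
	}

	switch (pair.value & RISCV_HWPROBE_MISALIGNED_MASK) {
	case RISCV_HWPROBE_MISALIGNED_FAST:
		puts("misaligned accesses: fast");
		break;
	case RISCV_HWPROBE_MISALIGNED_SLOW:
		puts("misaligned accesses: slow");
		break;
	case RISCV_HWPROBE_MISALIGNED_EMULATED:
		puts("misaligned accesses: emulated");
		break;
	case RISCV_HWPROBE_MISALIGNED_UNSUPPORTED:
		puts("misaligned accesses: unsupported");
		break;
	default:
		puts("misaligned accesses: unknown");
	}
	return 0;
}

On the THead C906 mentioned in the cover letter, this should report "fast" once the runtime probe has run.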

arch/riscv/errata/thead/errata.c

Lines changed: 0 additions & 8 deletions
@@ -120,11 +120,3 @@ void thead_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
 	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
 		local_flush_icache_all();
 }
-
-void thead_feature_probe_func(unsigned int cpu,
-			      unsigned long archid,
-			      unsigned long impid)
-{
-	if ((archid == 0) && (impid == 0))
-		per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST;
-}

arch/riscv/include/asm/alternative.h

Lines changed: 0 additions & 5 deletions
@@ -30,7 +30,6 @@
 #define ALT_OLD_PTR(a)			__ALT_PTR(a, old_offset)
 #define ALT_ALT_PTR(a)			__ALT_PTR(a, alt_offset)

-void probe_vendor_features(unsigned int cpu);
 void __init apply_boot_alternatives(void);
 void __init apply_early_boot_alternatives(void);
 void apply_module_alternatives(void *start, size_t length);
@@ -53,15 +52,11 @@ void thead_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
 			      unsigned long archid, unsigned long impid,
 			      unsigned int stage);

-void thead_feature_probe_func(unsigned int cpu, unsigned long archid,
-			      unsigned long impid);
-
 void riscv_cpufeature_patch_func(struct alt_entry *begin, struct alt_entry *end,
 				 unsigned int stage);

 #else /* CONFIG_RISCV_ALTERNATIVE */

-static inline void probe_vendor_features(unsigned int cpu) { }
 static inline void apply_boot_alternatives(void) { }
 static inline void apply_early_boot_alternatives(void) { }
 static inline void apply_module_alternatives(void *start, size_t length) { }

arch/riscv/include/asm/cpufeature.h

Lines changed: 2 additions & 0 deletions
@@ -30,4 +30,6 @@ DECLARE_PER_CPU(long, misaligned_access_speed);
 /* Per-cpu ISA extensions. */
 extern struct riscv_isainfo hart_isa[NR_CPUS];

+void check_unaligned_access(int cpu);
+
 #endif

arch/riscv/kernel/Makefile

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ extra-y += vmlinux.lds
 obj-y	+= head.o
 obj-y	+= soc.o
 obj-$(CONFIG_RISCV_ALTERNATIVE) += alternative.o
+obj-y	+= copy-unaligned.o
 obj-y	+= cpu.o
 obj-y	+= cpufeature.o
 obj-y	+= entry.o

arch/riscv/kernel/alternative.c

Lines changed: 0 additions & 19 deletions
@@ -27,8 +27,6 @@ struct cpu_manufacturer_info_t {
 	void (*patch_func)(struct alt_entry *begin, struct alt_entry *end,
 			   unsigned long archid, unsigned long impid,
 			   unsigned int stage);
-	void (*feature_probe_func)(unsigned int cpu, unsigned long archid,
-				   unsigned long impid);
 };

 static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info)
@@ -43,7 +41,6 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info)
 	cpu_mfr_info->imp_id = sbi_get_mimpid();
 #endif

-	cpu_mfr_info->feature_probe_func = NULL;
 	switch (cpu_mfr_info->vendor_id) {
 #ifdef CONFIG_ERRATA_SIFIVE
 	case SIFIVE_VENDOR_ID:
@@ -53,7 +50,6 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info)
 #ifdef CONFIG_ERRATA_THEAD
 	case THEAD_VENDOR_ID:
 		cpu_mfr_info->patch_func = thead_errata_patch_func;
-		cpu_mfr_info->feature_probe_func = thead_feature_probe_func;
 		break;
 #endif
 	default:
@@ -143,20 +139,6 @@ void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len,
 	}
 }

-/* Called on each CPU as it starts */
-void probe_vendor_features(unsigned int cpu)
-{
-	struct cpu_manufacturer_info_t cpu_mfr_info;
-
-	riscv_fill_cpu_mfr_info(&cpu_mfr_info);
-	if (!cpu_mfr_info.feature_probe_func)
-		return;
-
-	cpu_mfr_info.feature_probe_func(cpu,
-					cpu_mfr_info.arch_id,
-					cpu_mfr_info.imp_id);
-}
-
 /*
  * This is called very early in the boot process (directly after we run
  * a feature detect on the boot CPU). No need to worry about other CPUs
@@ -211,7 +193,6 @@ void __init apply_boot_alternatives(void)
 	/* If called on non-boot cpu things could go wrong */
 	WARN_ON(smp_processor_id() != 0);

-	probe_vendor_features(0);
 	_apply_alternatives((struct alt_entry *)__alt_start,
 			    (struct alt_entry *)__alt_end,
 			    RISCV_ALTERNATIVES_BOOT);

arch/riscv/kernel/copy-unaligned.S

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2023 Rivos Inc. */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+	.text
+
+/* void __riscv_copy_words_unaligned(void *, const void *, size_t) */
+/* Performs a memcpy without aligning buffers, using word loads and stores. */
+/* Note: The size is truncated to a multiple of 8 * SZREG */
+ENTRY(__riscv_copy_words_unaligned)
+	andi  a4, a2, ~((8*SZREG)-1)
+	beqz  a4, 2f
+	add   a3, a1, a4
+1:
+	REG_L a4,       0(a1)
+	REG_L a5,   SZREG(a1)
+	REG_L a6, 2*SZREG(a1)
+	REG_L a7, 3*SZREG(a1)
+	REG_L t0, 4*SZREG(a1)
+	REG_L t1, 5*SZREG(a1)
+	REG_L t2, 6*SZREG(a1)
+	REG_L t3, 7*SZREG(a1)
+	REG_S a4,       0(a0)
+	REG_S a5,   SZREG(a0)
+	REG_S a6, 2*SZREG(a0)
+	REG_S a7, 3*SZREG(a0)
+	REG_S t0, 4*SZREG(a0)
+	REG_S t1, 5*SZREG(a0)
+	REG_S t2, 6*SZREG(a0)
+	REG_S t3, 7*SZREG(a0)
+	addi  a0, a0, 8*SZREG
+	addi  a1, a1, 8*SZREG
+	bltu  a1, a3, 1b
+
+2:
+	ret
+END(__riscv_copy_words_unaligned)
+
+/* void __riscv_copy_bytes_unaligned(void *, const void *, size_t) */
+/* Performs a memcpy without aligning buffers, using only byte accesses. */
+/* Note: The size is truncated to a multiple of 8 */
+ENTRY(__riscv_copy_bytes_unaligned)
+	andi a4, a2, ~(8-1)
+	beqz a4, 2f
+	add  a3, a1, a4
+1:
+	lb   a4, 0(a1)
+	lb   a5, 1(a1)
+	lb   a6, 2(a1)
+	lb   a7, 3(a1)
+	lb   t0, 4(a1)
+	lb   t1, 5(a1)
+	lb   t2, 6(a1)
+	lb   t3, 7(a1)
+	sb   a4, 0(a0)
+	sb   a5, 1(a0)
+	sb   a6, 2(a0)
+	sb   a7, 3(a0)
+	sb   t0, 4(a0)
+	sb   t1, 5(a0)
+	sb   t2, 6(a0)
+	sb   t3, 7(a0)
+	addi a0, a0, 8
+	addi a1, a1, 8
+	bltu a1, a3, 1b
+
+2:
+	ret
+END(__riscv_copy_bytes_unaligned)
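For reference, a rough user-space C model of __riscv_copy_words_unaligned() is sketched below. It only conveys the semantics (size truncated to a multiple of 8 * SZREG, data moved in 8-word blocks of loads then stores); plain C cannot portably force genuine misaligned word loads and stores the way the REG_L/REG_S sequence above does, so the word accesses are expressed with memcpy(). This is an illustration, not part of the series.

#include <stddef.h>
#include <string.h>

/* Rough C model of __riscv_copy_words_unaligned(); sizeof(long)
 * plays the role of SZREG (XLEN / 8). */
static void copy_words_unaligned_model(void *dst, const void *src, size_t size)
{
	const size_t block = 8 * sizeof(long);
	size_t n = size & ~(block - 1);	/* truncate, as the assembly does */

	for (size_t off = 0; off < n; off += block) {
		long tmp[8];
		int i;

		/* One block: eight word loads, then eight word stores. */
		for (i = 0; i < 8; i++)
			memcpy(&tmp[i], (const char *)src + off + i * sizeof(long), sizeof(long));
		for (i = 0; i < 8; i++)
			memcpy((char *)dst + off + i * sizeof(long), &tmp[i], sizeof(long));
	}
}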

arch/riscv/kernel/copy-unaligned.h

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Rivos, Inc.
+ */
+#ifndef __RISCV_KERNEL_COPY_UNALIGNED_H
+#define __RISCV_KERNEL_COPY_UNALIGNED_H
+
+#include <linux/types.h>
+
+void __riscv_copy_words_unaligned(void *dst, const void *src, size_t size);
+void __riscv_copy_bytes_unaligned(void *dst, const void *src, size_t size);
+
+#endif /* __RISCV_KERNEL_COPY_UNALIGNED_H */

arch/riscv/kernel/cpufeature.c

Lines changed: 104 additions & 0 deletions
@@ -18,12 +18,19 @@
 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
 #include <asm/hwcap.h>
+#include <asm/hwprobe.h>
 #include <asm/patch.h>
 #include <asm/processor.h>
 #include <asm/vector.h>

+#include "copy-unaligned.h"
+
 #define NUM_ALPHA_EXTS ('z' - 'a' + 1)

+#define MISALIGNED_ACCESS_JIFFIES_LG2 1
+#define MISALIGNED_BUFFER_SIZE 0x4000
+#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
+
 unsigned long elf_hwcap __read_mostly;

 /* Host ISA bitmap */
@@ -549,6 +556,103 @@ unsigned long riscv_get_elf_hwcap(void)
 	return hwcap;
 }

+void check_unaligned_access(int cpu)
+{
+	u64 start_cycles, end_cycles;
+	u64 word_cycles;
+	u64 byte_cycles;
+	int ratio;
+	unsigned long start_jiffies, now;
+	struct page *page;
+	void *dst;
+	void *src;
+	long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
+
+	page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE));
+	if (!page) {
+		pr_warn("Can't alloc pages to measure memcpy performance");
+		return;
+	}
+
+	/* Make an unaligned destination buffer. */
+	dst = (void *)((unsigned long)page_address(page) | 0x1);
+	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
+	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
+	src += 2;
+	word_cycles = -1ULL;
+	/* Do a warmup. */
+	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+	preempt_disable();
+	start_jiffies = jiffies;
+	while ((now = jiffies) == start_jiffies)
+		cpu_relax();
+
+	/*
+	 * For a fixed amount of time, repeatedly try the function, and take
+	 * the best time in cycles as the measurement.
+	 */
+	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+		start_cycles = get_cycles64();
+		/* Ensure the CSR read can't reorder WRT to the copy. */
+		mb();
+		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+		/* Ensure the copy ends before the end time is snapped. */
+		mb();
+		end_cycles = get_cycles64();
+		if ((end_cycles - start_cycles) < word_cycles)
+			word_cycles = end_cycles - start_cycles;
+	}
+
+	byte_cycles = -1ULL;
+	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+	start_jiffies = jiffies;
+	while ((now = jiffies) == start_jiffies)
+		cpu_relax();
+
+	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+		start_cycles = get_cycles64();
+		mb();
+		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+		mb();
+		end_cycles = get_cycles64();
+		if ((end_cycles - start_cycles) < byte_cycles)
+			byte_cycles = end_cycles - start_cycles;
+	}
+
+	preempt_enable();
+
+	/* Don't divide by zero. */
+	if (!word_cycles || !byte_cycles) {
+		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
+			cpu);
+
+		goto out;
+	}
+
+	if (word_cycles < byte_cycles)
+		speed = RISCV_HWPROBE_MISALIGNED_FAST;
+
+	ratio = div_u64((byte_cycles * 100), word_cycles);
+	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
+		cpu,
+		ratio / 100,
+		ratio % 100,
+		(speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
+
+	per_cpu(misaligned_access_speed, cpu) = speed;
+
+out:
+	__free_pages(page, get_order(MISALIGNED_BUFFER_SIZE));
+}
+
+static int check_unaligned_access_boot_cpu(void)
+{
+	check_unaligned_access(0);
+	return 0;
+}
+
+arch_initcall(check_unaligned_access_boot_cpu);
+
 #ifdef CONFIG_RISCV_ALTERNATIVE
 /*
  * Alternative patch sites consider 48 bits when determining when to patch
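A quick sanity check of the buffer layout chosen above: dst sits at page + 1, src at dst + BUFFER/2 + 2 (so dst is off by one byte within a word and src by three, hence the "1 + 2 = 3" comment), both copies move MISALIGNED_COPY_SIZE bytes, the two regions do not overlap, nothing runs past the 0x4000-byte allocation, and the copy size divides evenly into the word routine's 8-word blocks. The stand-alone program below merely re-derives that arithmetic; it is not kernel code.

#include <assert.h>

#define MISALIGNED_BUFFER_SIZE	0x4000
#define MISALIGNED_COPY_SIZE	((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

int main(void)
{
	unsigned long dst_off = 0x1;				/* page_address(page) | 0x1 */
	unsigned long src_off = dst_off + MISALIGNED_BUFFER_SIZE / 2 + 2;

	assert(dst_off + MISALIGNED_COPY_SIZE <= src_off);			/* dst and src never overlap */
	assert(src_off + MISALIGNED_COPY_SIZE <= MISALIGNED_BUFFER_SIZE);	/* src stays in the allocation */
	assert(MISALIGNED_COPY_SIZE % (8 * sizeof(unsigned long)) == 0);	/* whole 8-word blocks */
	return 0;
}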

arch/riscv/kernel/smpboot.c

Lines changed: 2 additions & 1 deletion
@@ -26,6 +26,7 @@
 #include <linux/sched/task_stack.h>
 #include <linux/sched/mm.h>
 #include <asm/cpu_ops.h>
+#include <asm/cpufeature.h>
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/numa.h>
@@ -245,7 +246,7 @@ asmlinkage __visible void smp_callin(void)

 	numa_add_cpu(curr_cpuid);
 	set_cpu_online(curr_cpuid, 1);
-	probe_vendor_features(curr_cpuid);
+	check_unaligned_access(curr_cpuid);

 	if (has_vector()) {
 		if (riscv_v_setup_vsize())
