
Commit ca96b16

mjguzik authored and torvalds committed
x86: bring back rep movsq for user access on CPUs without ERMS
Intel CPUs have shipped with ERMS for over a decade, but this is not true for AMD. In particular one reasonably recent uarch (EPYC 7R13) does not have it (or at least the bit is inactive when running on the Amazon EC2 cloud -- I found rather conflicting information about AMD CPUs vs the extension).

Hand-rolled mov loops executing in this case are quite pessimal compared to rep movsq for bigger sizes. While the upper limit depends on uarch, everyone is well south of 1KB AFAICS and sizes bigger than that are common.

While technically ancient CPUs may suffer from rep usage, gcc has been emitting it for years all over kernel code, so I don't think this is a legitimate concern.

Sample result from read1_processes from will-it-scale (4KB reads/s):

before: 1507021
after:  1721828 (+14%)

Note that the cutoff point for rep usage is set to 64 bytes, which is way too conservative but I'm sticking to what was done in 47ee3f1 ("x86: re-introduce support for ERMS copies for user space accesses"). That is to say *some* copies will now go slower, which is fixable but beyond the scope of this patch.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 8724596 commit ca96b16
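
For context, here is a rough, hypothetical user-space C sketch of the copy strategy the commit message describes: below the 64-byte cutoff (or when ERMS makes rep movsb fast) fall back to a plain byte copy, otherwise split the count into quadwords for rep movsq plus a small byte tail. All names here (have_erms, copy_bytes, copy_quads_then_tail, copy_user_sketch) are invented for illustration; the real routine is rep_movs_alternative in arch/x86/lib/copy_user_64.S and additionally has to recover from faults on user addresses.

/* x86-64, gcc/clang only: illustrative sketch, not kernel code */
#include <stdio.h>
#include <string.h>
#include <stddef.h>

static int have_erms;			/* would come from CPUID in real code */

static void copy_bytes(void *to, const void *from, size_t len)
{
	asm volatile("rep movsb"
		     : "+D" (to), "+S" (from), "+c" (len)
		     : : "memory");
}

static void copy_quads_then_tail(void *to, const void *from, size_t len)
{
	size_t quads = len >> 3;	/* whole 8-byte words */
	size_t tail  = len & 7;		/* 0..7 leftover bytes */

	asm volatile("rep movsq"
		     : "+D" (to), "+S" (from), "+c" (quads)
		     : : "memory");
	/* rep movsq already advanced both pointers past the copied part */
	copy_bytes(to, from, tail);
}

static void copy_user_sketch(void *to, const void *from, size_t len)
{
	if (len < 64 || have_erms)	/* same 64-byte cutoff as the patch */
		copy_bytes(to, from, len);
	else
		copy_quads_then_tail(to, from, len);
}

int main(void)
{
	char src[200], dst[200];

	memset(src, 'x', sizeof(src));
	copy_user_sketch(dst, src, sizeof(src));
	printf("copies match: %d\n", memcmp(dst, src, sizeof(dst)) == 0);
	return 0;
}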

2 files changed: +14, -45 lines

arch/x86/include/asm/uaccess_64.h

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ copy_user_generic(void *to, const void *from, unsigned long len)
 		"2:\n"
 		_ASM_EXTABLE_UA(1b, 2b)
 		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
-		: : "memory", "rax", "r8", "r9", "r10", "r11");
+		: : "memory", "rax");
 	clac();
 	return len;
 }
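
The only change on the C side is this clobber list. Since the rewritten rep_movs_alternative (next file) no longer touches r8-r11, the inline asm no longer has to declare them clobbered, and the compiler can keep live values in those registers across the call; rax stays on the list because the routine still uses it internally. A minimal stand-alone sketch of the idea, not the kernel code, with the constraint shape merely mirroring the hunk above:

/* Sketch only: any register named in the clobber list cannot hold a
 * live value across the asm statement, so trimming "rax, r8-r11" down
 * to "rax" hands four extra registers back to the surrounding code. */
static inline unsigned long copy_chunk(void *to, const void *from,
				       unsigned long len)
{
	asm volatile("rep movsb"	/* stand-in for the real call */
		     : "+c" (len), "+D" (to), "+S" (from)
		     : : "memory", "rax");
	return len;			/* rcx has counted down to 0 */
}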

arch/x86/lib/copy_user_64.S

Lines changed: 13 additions & 44 deletions
@@ -27,7 +27,7 @@
  * NOTE! The calling convention is very intentionally the same as
  * for 'rep movs', so that we can rewrite the function call with
  * just a plain 'rep movs' on machines that have FSRM. But to make
- * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
+ * it simpler for us, we can clobber rsi/rdi and rax freely.
  */
 SYM_FUNC_START(rep_movs_alternative)
 	cmpq $64,%rcx
@@ -68,55 +68,24 @@ SYM_FUNC_START(rep_movs_alternative)
 	_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
 
 .Llarge:
-0:	ALTERNATIVE "jmp .Lunrolled", "rep movsb", X86_FEATURE_ERMS
+0:	ALTERNATIVE "jmp .Llarge_movsq", "rep movsb", X86_FEATURE_ERMS
 1:	RET
 
-	_ASM_EXTABLE_UA( 0b, 1b)
+	_ASM_EXTABLE_UA( 0b, 1b)
 
-	.p2align 4
-.Lunrolled:
-10:	movq (%rsi),%r8
-11:	movq 8(%rsi),%r9
-12:	movq 16(%rsi),%r10
-13:	movq 24(%rsi),%r11
-14:	movq %r8,(%rdi)
-15:	movq %r9,8(%rdi)
-16:	movq %r10,16(%rdi)
-17:	movq %r11,24(%rdi)
-20:	movq 32(%rsi),%r8
-21:	movq 40(%rsi),%r9
-22:	movq 48(%rsi),%r10
-23:	movq 56(%rsi),%r11
-24:	movq %r8,32(%rdi)
-25:	movq %r9,40(%rdi)
-26:	movq %r10,48(%rdi)
-27:	movq %r11,56(%rdi)
-	addq $64,%rsi
-	addq $64,%rdi
-	subq $64,%rcx
-	cmpq $64,%rcx
-	jae .Lunrolled
-	cmpl $8,%ecx
-	jae .Lword
+.Llarge_movsq:
+	movq %rcx,%rax
+	shrq $3,%rcx
+	andl $7,%eax
+0:	rep movsq
+	movl %eax,%ecx
 	testl %ecx,%ecx
 	jne .Lcopy_user_tail
 	RET
 
-	_ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
-	_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
+1:	leaq (%rax,%rcx,8),%rcx
+	jmp .Lcopy_user_tail
+
+	_ASM_EXTABLE_UA( 0b, 1b)
 SYM_FUNC_END(rep_movs_alternative)
 EXPORT_SYMBOL(rep_movs_alternative)
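
The least obvious part of the new .Llarge_movsq block is that the quotient/remainder split doubles as fault-recovery state. An annotated reading of the hunk above, with a worked example in C (my interpretation; the helper name is invented):

/*
 * On entry to .Llarge_movsq: rcx = byte count, rsi/rdi = src/dst.
 *
 *   movq %rcx,%rax      rax = total byte count
 *   shrq $3,%rcx        rcx = number of 8-byte quadwords
 *   andl $7,%eax        rax = 0..7 leftover bytes
 * 0: rep movsq          copy rcx quadwords, advancing rsi/rdi
 *   movl %eax,%ecx      rcx = leftover bytes for .Lcopy_user_tail
 *
 * If "rep movsq" faults, rcx holds the quadwords not yet copied and
 * rax still holds the byte remainder, so the fixup at label 1 rebuilds
 * the remaining byte count before jumping to the byte-wise tail.
 */
static unsigned long movsq_fixup_bytes(unsigned long tail_bytes,
				       unsigned long quads_left)
{
	/* what "leaq (%rax,%rcx,8),%rcx" computes */
	return tail_bytes + quads_left * 8;
}

/* Example: a 1000-byte copy is split into 125 quadwords + 0 tail bytes.
 * If the fault hits after 37 quadwords, 88 remain, so the tail loop is
 * entered with 0 + 88*8 = 704 bytes still to attempt. */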
