
Commit 3f0d6ec

Merge tag 'core-entry-2020-08-04' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull generic kernel entry/exit code from Thomas Gleixner:

 "Generic implementation of common syscall, interrupt and exception
  entry/exit functionality based on the recent X86 effort to ensure
  correctness of entry/exit vs RCU and instrumentation.

  As this functionality and the required entry/exit sequences are not
  architecture specific, sharing them allows other architectures to
  benefit instead of copying the same code over and over again.

  This branch was kept standalone to allow others to work on it. The
  conversion of x86 comes in a separate pull request which obviously is
  based on this branch"

* tag 'core-entry-2020-08-04' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  entry: Correct __secure_computing() stub
  entry: Correct 'noinstr' attributes
  entry: Provide infrastructure for work before transitioning to guest mode
  entry: Provide generic interrupt entry/exit code
  entry: Provide generic syscall exit function
  entry: Provide generic syscall entry functionality
  seccomp: Provide stub for __secure_computing()
2 parents 442489c + 3135f5b

File tree

10 files changed, +907 -0 lines changed


arch/Kconfig

Lines changed: 3 additions & 0 deletions
@@ -27,6 +27,9 @@ config HAVE_IMA_KEXEC
 config HOTPLUG_SMT
 	bool
 
+config GENERIC_ENTRY
+	bool
+
 config OPROFILE
 	tristate "OProfile system profiling"
 	depends on PROFILING
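
GENERIC_ENTRY is an invisible bool, so an architecture opts in by selecting it from its own Kconfig entry. A minimal sketch (MYARCH is a placeholder, not a real architecture; the actual x86 switch-over comes with the separate conversion pull request):

# Hypothetical architecture Kconfig fragment opting in to the
# generic entry/exit code.
config MYARCH
	bool
	select GENERIC_ENTRY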

include/linux/entry-common.h

Lines changed: 372 additions & 0 deletions
@@ -0,0 +1,372 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H

#include <linux/tracehook.h>
#include <linux/syscalls.h>
#include <linux/seccomp.h>
#include <linux/sched.h>

#include <asm/entry-common.h>

/*
 * Define dummy _TIF work flags if not defined by the architecture or for
 * disabled functionality.
 */
#ifndef _TIF_SYSCALL_EMU
# define _TIF_SYSCALL_EMU		(0)
#endif

#ifndef _TIF_SYSCALL_TRACEPOINT
# define _TIF_SYSCALL_TRACEPOINT	(0)
#endif

#ifndef _TIF_SECCOMP
# define _TIF_SECCOMP			(0)
#endif

#ifndef _TIF_SYSCALL_AUDIT
# define _TIF_SYSCALL_AUDIT		(0)
#endif

#ifndef _TIF_PATCH_PENDING
# define _TIF_PATCH_PENDING		(0)
#endif

#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif

/*
 * TIF flags handled in syscall_enter_from_user_mode()
 */
#ifndef ARCH_SYSCALL_ENTER_WORK
# define ARCH_SYSCALL_ENTER_WORK	(0)
#endif

#define SYSCALL_ENTER_WORK						\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP |	\
	 _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU |			\
	 ARCH_SYSCALL_ENTER_WORK)

/*
 * TIF flags handled in syscall_exit_to_user_mode()
 */
#ifndef ARCH_SYSCALL_EXIT_WORK
# define ARCH_SYSCALL_EXIT_WORK		(0)
#endif

#define SYSCALL_EXIT_WORK						\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |			\
	 _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK)

/*
 * TIF flags handled in exit_to_user_mode_loop()
 */
#ifndef ARCH_EXIT_TO_USER_MODE_WORK
# define ARCH_EXIT_TO_USER_MODE_WORK	(0)
#endif

#define EXIT_TO_USER_MODE_WORK						\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
	 _TIF_NEED_RESCHED | _TIF_PATCH_PENDING |			\
	 ARCH_EXIT_TO_USER_MODE_WORK)
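
An architecture feeds its own flags into these masks by defining the ARCH_* constants in its <asm/entry-common.h>, which this header pulls in before the masks are assembled. A minimal sketch, assuming a hypothetical _TIF_MYARCH_RESUME flag that is not part of this patch:

/* Hypothetical <asm/entry-common.h> fragment: fold an arch-private TIF
 * flag into the generic exit-to-user work mask. */
#define _TIF_MYARCH_RESUME		(1UL << 10)	/* assumed arch flag */
#define ARCH_EXIT_TO_USER_MODE_WORK	(_TIF_MYARCH_RESUME)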
/**
 * arch_check_user_regs - Architecture specific sanity check for user mode regs
 * @regs:	Pointer to current's pt_regs
 *
 * Defaults to an empty implementation. Can be replaced by architecture
 * specific code.
 *
 * Invoked from syscall_enter_from_user_mode() in the non-instrumentable
 * section. Use __always_inline so the compiler cannot push it out of line
 * and make it instrumentable.
 */
static __always_inline void arch_check_user_regs(struct pt_regs *regs);

#ifndef arch_check_user_regs
static __always_inline void arch_check_user_regs(struct pt_regs *regs) {}
#endif
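
The #ifndef pattern above is the override hook used throughout this header: an architecture that wants a real check supplies its own inline and defines a macro of the same name so the empty default is compiled out. A sketch of such an override (the check itself is illustrative, not taken from any architecture):

/* Hypothetical <asm/entry-common.h> override of the empty default. */
static __always_inline void arch_check_user_regs(struct pt_regs *regs)
{
	/* Illustrative sanity check: regs must describe a user mode context. */
	WARN_ON_ONCE(!user_mode(regs));
}
#define arch_check_user_regs arch_check_user_regs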
/**
 * arch_syscall_enter_tracehook - Wrapper around tracehook_report_syscall_entry()
 * @regs:	Pointer to current's pt_regs
 *
 * Returns: 0 on success or an error code to skip the syscall.
 *
 * Defaults to tracehook_report_syscall_entry(). Can be replaced by
 * architecture specific code.
 *
 * Invoked from syscall_enter_from_user_mode()
 */
static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs);

#ifndef arch_syscall_enter_tracehook
static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs)
{
	return tracehook_report_syscall_entry(regs);
}
#endif
/**
 * syscall_enter_from_user_mode - Check and handle work before invoking
 *				  a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct and the subsequent functions can be
 * instrumented.
 *
 * Returns: The original or a modified syscall number
 *
 * If the returned syscall number is -1 then the syscall should be
 * skipped. In this case the caller may invoke syscall_set_error() or
 * syscall_set_return_value() first. If neither of those are called and -1
 * is returned, then the syscall will fail with ENOSYS.
 *
 * The following functionality is handled here:
 *
 *  1) Establish state (lockdep, RCU (context tracking), tracing)
 *  2) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
 *     __secure_computing(), trace_sys_enter()
 *  3) Invocation of audit_syscall_entry()
 */
long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
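
A call site in architecture code would look roughly like the sketch below. The names arch_do_syscall and invoke_syscall are invented for illustration, and regs->orig_ax stands in for wherever the architecture keeps the syscall number; the real x86 call sites arrive with the separate conversion pull request:

/* Hypothetical noinstr syscall entry point, interrupts still disabled. */
__visible noinstr void arch_do_syscall(struct pt_regs *regs)
{
	long nr = syscall_enter_from_user_mode(regs, regs->orig_ax);

	instrumentation_begin();
	if (nr != -1L)
		invoke_syscall(regs, nr);	/* assumed arch dispatch helper */
	instrumentation_end();

	syscall_exit_to_user_mode(regs);
}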
/**
 * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Defaults to local_irq_enable(). Can be supplied by architecture specific
 * code.
 */
static inline void local_irq_enable_exit_to_user(unsigned long ti_work);

#ifndef local_irq_enable_exit_to_user
static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
{
	local_irq_enable();
}
#endif

/**
 * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable()
 *
 * Defaults to local_irq_disable(). Can be supplied by architecture specific
 * code.
 */
static inline void local_irq_disable_exit_to_user(void);

#ifndef local_irq_disable_exit_to_user
static inline void local_irq_disable_exit_to_user(void)
{
	local_irq_disable();
}
#endif
/**
 * arch_exit_to_user_mode_work - Architecture specific TIF work for exit
 *				 to user mode.
 * @regs:	Pointer to current's pt_regs
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Invoked from exit_to_user_mode_loop() with interrupts enabled
 *
 * Defaults to NOOP. Can be supplied by architecture specific code.
 */
static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
					       unsigned long ti_work);

#ifndef arch_exit_to_user_mode_work
static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
					       unsigned long ti_work)
{
}
#endif

/**
 * arch_exit_to_user_mode_prepare - Architecture specific preparation for
 *				    exit to user mode.
 * @regs:	Pointer to current's pt_regs
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Invoked from exit_to_user_mode_prepare() with interrupts disabled as the
 * last function before return. Defaults to NOOP.
 */
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
						  unsigned long ti_work);

#ifndef arch_exit_to_user_mode_prepare
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
						  unsigned long ti_work)
{
}
#endif
/**
 * arch_exit_to_user_mode - Architecture specific final work before
 *			    exit to user mode.
 *
 * Invoked from exit_to_user_mode() with interrupts disabled as the last
 * function before return. Defaults to NOOP.
 *
 * This needs to be __always_inline because it is non-instrumentable code
 * invoked after context tracking switched to user mode.
 *
 * An architecture implementation must not do anything complex, no locking
 * etc. The main purpose is for speculation mitigations.
 */
static __always_inline void arch_exit_to_user_mode(void);

#ifndef arch_exit_to_user_mode
static __always_inline void arch_exit_to_user_mode(void) { }
#endif
/**
 * arch_do_signal - Architecture specific signal delivery function
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked from exit_to_user_mode_loop().
 */
void arch_do_signal(struct pt_regs *regs);
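
There is no generic default for arch_do_signal(); every architecture using this code must provide it. The usual shape is sketched below, where get_signal() and restore_saved_sigmask() are the generic signal core APIs and handle_signal() stands in for the architecture's signal-frame setup:

/* Sketch of an architecture's signal delivery hook. */
void arch_do_signal(struct pt_regs *regs)
{
	struct ksignal ksig;

	if (get_signal(&ksig)) {
		/* A handler is pending: build the user space signal frame. */
		handle_signal(&ksig, regs);	/* arch specific (assumed) */
		return;
	}

	/* No handler to run: restore the sigmask saved by e.g. ppoll(). */
	restore_saved_sigmask();
}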
/**
 * arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit()
 * @regs:	Pointer to current's pt_regs
 * @step:	Indicator for single step
 *
 * Defaults to tracehook_report_syscall_exit(). Can be replaced by
 * architecture specific code.
 *
 * Invoked from syscall_exit_to_user_mode()
 */
static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step);

#ifndef arch_syscall_exit_tracehook
static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step)
{
	tracehook_report_syscall_exit(regs, step);
}
#endif
/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *	- audit
 *	- syscall tracing
 *	- tracehook (single stepping)
 *
 *  2) Preparatory work
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU). Invokes
 *     arch_exit_to_user_mode() to handle e.g. speculation mitigations
 */
void syscall_exit_to_user_mode(struct pt_regs *regs);
/**
 * irqentry_enter_from_user_mode - Establish state before invoking the irq handler
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked from architecture specific entry code with interrupts disabled.
 * Can only be called when the interrupt entry came from user mode. The
 * calling code must be non-instrumentable. When the function returns all
 * state is correct and the subsequent functions can be instrumented.
 *
 * The function establishes state (lockdep, RCU (context tracking), tracing)
 */
void irqentry_enter_from_user_mode(struct pt_regs *regs);
/**
 * irqentry_exit_to_user_mode - Interrupt exit work
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts disabled and fully valid regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific interrupt
 * handling code.
 *
 * The call order is #2 and #3 as described in syscall_exit_to_user_mode().
 * Interrupt exit is not invoking #1 which is the syscall specific one time
 * work.
 */
void irqentry_exit_to_user_mode(struct pt_regs *regs);
#ifndef irqentry_state
typedef struct irqentry_state {
	bool	exit_rcu;
} irqentry_state_t;
#endif
/**
 * irqentry_enter - Handle state tracking on ordinary interrupt entries
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * Invokes:
 *  - lockdep irqflag state tracking as low level ASM entry disabled
 *    interrupts.
 *
 *  - Context tracking if the exception hit user mode.
 *
 *  - The hardirq tracer to keep the state consistent as low level ASM
 *    entry disabled interrupts.
 *
 * As a precondition, this requires that the entry came from user mode,
 * idle, or a kernel context in which RCU is watching.
 *
 * For kernel mode entries RCU handling is done conditionally. If RCU is
 * watching then the only RCU requirement is to check whether the tick has
 * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
 * invoked on entry and rcu_irq_exit() on exit.
 *
 * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
 * solves the problem of kernel mode pagefaults which can schedule, which
 * is not possible after invoking rcu_irq_enter() without undoing it.
 *
 * For user mode entries irqentry_enter_from_user_mode() is invoked to
 * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
 * would not be possible.
 *
 * Returns: An opaque object that must be passed to irqentry_exit()
 */
irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
/**
 * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
 *
 * Conditional reschedule with additional sanity checks.
 */
void irqentry_exit_cond_resched(void);
/**
 * irqentry_exit - Handle return from exception that used irqentry_enter()
 * @regs:	Pointer to pt_regs (exception entry regs)
 * @state:	Return value from matching call to irqentry_enter()
 *
 * Depending on the return target (kernel/user) this runs the necessary
 * preemption and work checks if possible and required and returns to
 * the caller with interrupts disabled and no further work pending.
 *
 * This is the last action before returning to the low level ASM code which
 * just needs to return to the appropriate context.
 *
 * Counterpart to irqentry_enter().
 */
void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);

#endif
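
Putting the pair together, an architecture's interrupt entry brackets its instrumentable handler between the two calls. A sketch with invented names (arch_handle_irq, do_arch_irq), mirroring the pattern the kerneldoc above describes:

/* Hypothetical noinstr arch interrupt entry using the generic tracking. */
__visible noinstr void arch_handle_irq(struct pt_regs *regs, int vector)
{
	irqentry_state_t state = irqentry_enter(regs);

	instrumentation_begin();
	do_arch_irq(regs, vector);	/* assumed instrumentable handler */
	instrumentation_end();

	irqentry_exit(regs, state);
}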
