Skip to content

Commit d789309

Browse files
committed
x86/smp: Cure kexec() vs. mwait_play_dead() breakage
TLDR: It's a mess. When kexec() is executed on a system with offline CPUs which are parked in mwait_play_dead(), it can end up in a triple fault during the bootup of the kexec kernel or cause hard-to-diagnose data corruption. The reason is that kexec() eventually overwrites the previous kernel's text, page tables, data and stack. If it writes to the cache line which is monitored by a previously offlined CPU, MWAIT resumes execution and ends up executing the wrong text, dereferencing overwritten page tables or corrupting the kexec kernel's data. Cure this by bringing the offlined CPUs out of MWAIT into HLT. Write to the monitored cache line of each offline CPU, which makes MWAIT resume execution. The written control word tells the offlined CPUs to issue HLT, which does not have the MWAIT problem. That does not help if a stray NMI, MCE or SMI hits the offlined CPUs, as those make them come out of HLT. A follow-up change will put them into INIT, which protects at least against NMI and SMI. Fixes: ea53069 ("x86, hotplug: Use mwait to offline a processor, fix the legacy case") Reported-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Ashok Raj <ashok.raj@intel.com> Reviewed-by: Ashok Raj <ashok.raj@intel.com> Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615193330.492257119@linutronix.de
1 parent f9c9987 commit d789309

File tree

3 files changed

+66
-0
lines changed

3 files changed

+66
-0
lines changed

arch/x86/include/asm/smp.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ void wbinvd_on_cpu(int cpu);
132132
int wbinvd_on_all_cpus(void);
133133
void cond_wakeup_cpu0(void);
134134

135+
void smp_kick_mwait_play_dead(void);
136+
135137
void native_smp_send_reschedule(int cpu);
136138
void native_send_call_func_ipi(const struct cpumask *mask);
137139
void native_send_call_func_single_ipi(int cpu);

arch/x86/kernel/smp.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <linux/interrupt.h>
2222
#include <linux/cpu.h>
2323
#include <linux/gfp.h>
24+
#include <linux/kexec.h>
2425

2526
#include <asm/mtrr.h>
2627
#include <asm/tlbflush.h>
@@ -157,6 +158,10 @@ static void native_stop_other_cpus(int wait)
157158
if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
158159
return;
159160

161+
/* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
162+
if (kexec_in_progress)
163+
smp_kick_mwait_play_dead();
164+
160165
/*
161166
* 1) Send an IPI on the reboot vector to all other CPUs.
162167
*

arch/x86/kernel/smpboot.c

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
#include <linux/tboot.h>
5454
#include <linux/gfp.h>
5555
#include <linux/cpuidle.h>
56+
#include <linux/kexec.h>
5657
#include <linux/numa.h>
5758
#include <linux/pgtable.h>
5859
#include <linux/overflow.h>
@@ -106,6 +107,9 @@ struct mwait_cpu_dead {
106107
unsigned int status;
107108
};
108109

110+
#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
111+
#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
112+
109113
/*
110114
* Cache line aligned data for mwait_play_dead(). Separate on purpose so
111115
* that it's unlikely to be touched by other CPUs.
@@ -173,6 +177,10 @@ static void smp_callin(void)
173177
{
174178
int cpuid;
175179

180+
/* Mop up eventual mwait_play_dead() wreckage */
181+
this_cpu_write(mwait_cpu_dead.status, 0);
182+
this_cpu_write(mwait_cpu_dead.control, 0);
183+
176184
/*
177185
* If waken up by an INIT in an 82489DX configuration
178186
* cpu_callout_mask guarantees we don't get here before
@@ -1807,6 +1815,10 @@ static inline void mwait_play_dead(void)
18071815
(highest_subcstate - 1);
18081816
}
18091817

1818+
/* Set up state for the kexec() hack below */
1819+
md->status = CPUDEAD_MWAIT_WAIT;
1820+
md->control = CPUDEAD_MWAIT_WAIT;
1821+
18101822
wbinvd();
18111823

18121824
while (1) {
@@ -1824,10 +1836,57 @@ static inline void mwait_play_dead(void)
18241836
mb();
18251837
__mwait(eax, 0);
18261838

1839+
if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
1840+
/*
1841+
* Kexec is about to happen. Don't go back into mwait() as
1842+
* the kexec kernel might overwrite text and data including
1843+
* page tables and stack. So mwait() would resume when the
1844+
* monitor cache line is written to and then the CPU goes
1845+
* south due to overwritten text, page tables and stack.
1846+
*
1847+
* Note: This does _NOT_ protect against a stray MCE, NMI,
1848+
* SMI. They will resume execution at the instruction
1849+
* following the HLT instruction and run into the problem
1850+
* which this is trying to prevent.
1851+
*/
1852+
WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
1853+
while(1)
1854+
native_halt();
1855+
}
1856+
18271857
cond_wakeup_cpu0();
18281858
}
18291859
}
18301860

1861+
/*
1862+
* Kick all "offline" CPUs out of mwait on kexec(). See comment in
1863+
* mwait_play_dead().
1864+
*/
1865+
void smp_kick_mwait_play_dead(void)
1866+
{
1867+
u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
1868+
struct mwait_cpu_dead *md;
1869+
unsigned int cpu, i;
1870+
1871+
for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
1872+
md = per_cpu_ptr(&mwait_cpu_dead, cpu);
1873+
1874+
/* Does it sit in mwait_play_dead()? That function sets status to CPUDEAD_MWAIT_WAIT before parking; skip CPUs in any other state. */
1875+
if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
1876+
continue;
1877+
1878+
/* Wait up to 5ms (at most 1000 polls, 5us apart) for the CPU to acknowledge by setting status to newstate */
1879+
for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
1880+
/* Bring it out of mwait: the control word written here tells it to go into HLT instead */
1881+
WRITE_ONCE(md->control, newstate);
1882+
udelay(5);
1883+
}
1884+
1885+
if (READ_ONCE(md->status) != newstate)
1886+
pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
1887+
}
1888+
}
1889+
18311890
void __noreturn hlt_play_dead(void)
18321891
{
18331892
if (__this_cpu_read(cpu_info.x86) >= 4)

0 commit comments

Comments
 (0)