
Commit 1f5e7eb

x86/smp: Make stop_other_cpus() more robust
Tony reported intermittent lockups on poweroff. His analysis identified the
wbinvd() in stop_this_cpu() as the culprit. This was added to ensure that
on SME enabled machines a kexec() does not leave any stale data in the
caches when switching from encrypted to non-encrypted mode or vice versa.

That wbinvd() is conditional on the SME feature bit which is read directly
from CPUID. But that readout does not check whether the CPUID leaf is
available or not. If it's not available the CPU will return the value of
the highest supported leaf instead. Depending on the content the "SME" bit
might be set or not.

That's incorrect but harmless. Making the CPUID readout conditional makes
the observed hangs go away, but it does not fix the underlying problem:

  CPU0                                  CPU1

  stop_other_cpus()
    send_IPIs(REBOOT);                  stop_this_cpu()
    while (num_online_cpus() > 1);        set_online(false);
    proceed... -> hang
                                          wbinvd()

WBINVD is an expensive operation and if multiple CPUs issue it at the same
time the resulting delays are even larger. But CPU0 already observed
num_online_cpus() going down to 1 and proceeds, which causes the system to
hang.

This issue exists independent of WBINVD, but the delays caused by WBINVD
make it more prominent.

Make this more robust by adding a cpumask which is initialized to the
online CPU mask before sending the IPIs, and let CPUs clear their bit in
stop_this_cpu() after the WBINVD completed. Check for that cpumask to
become empty in stop_other_cpus() instead of watching num_online_cpus().

The cpumask cannot plug all holes either, but it's better than a raw
counter and allows restricting the NMI fallback IPI to be sent only to the
CPUs which have not reported within the timeout window.

Fixes: 08f253e ("x86/cpu: Clear SME feature flag when not in use")
Reported-by: Tony Battersby <tonyb@cybernetics.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Ashok Raj <ashok.raj@intel.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/3817d810-e0f1-8ef8-0bbd-663b919ca49b@cybernetics.com
Link: https://lore.kernel.org/r/87h6r770bv.ffs@tglx
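
A minimal user-space sketch of the handshake this patch introduces, in C11
with POSIX threads (illustrative only, not kernel code: fake_wbinvd(),
NCPUS and the thread plumbing are stand-ins). Each "CPU" clears its bit in
the pending mask only after its expensive flush has completed, so the
initiator cannot proceed mid-flush the way it can with a bare online count:

/* cc -O2 -pthread handshake.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NCPUS	4

/* One bit per "CPU" which still has to report; mirrors cpus_stop_mask. */
static atomic_ulong stop_mask;

/* Stand-in for the potentially slow WBINVD cache flush. */
static void fake_wbinvd(void)
{
	usleep(100 * 1000);
}

static void *cpu_stop_thread(void *arg)
{
	unsigned long bit = 1UL << (long)arg;

	/*
	 * Flush FIRST, then report. Clearing the bit before the flush
	 * would recreate the original race: the initiator could proceed
	 * while this "CPU" is still busy in fake_wbinvd().
	 */
	fake_wbinvd();
	atomic_fetch_and(&stop_mask, ~bit);
	return NULL;	/* a real CPU would sit in a HLT loop here */
}

int main(void)
{
	pthread_t tid[NCPUS];

	/* Initiator: publish the full pending mask before "sending IPIs". */
	atomic_store(&stop_mask, (1UL << NCPUS) - 1);

	for (long i = 0; i < NCPUS; i++)
		pthread_create(&tid[i], NULL, cpu_stop_thread, (void *)i);

	/* Wait for the mask to drain instead of counting online CPUs. */
	while (atomic_load(&stop_mask))
		usleep(1000);

	puts("all CPUs reported after their flush completed");

	for (int i = 0; i < NCPUS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

If a worker cleared its bit before fake_wbinvd(), main() could "proceed"
mid-flush, which is exactly the reported hang.
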
1 parent 7877cb9 commit 1f5e7eb

3 files changed: +64 -23 lines changed

arch/x86/include/asm/cpu.h

Lines changed: 2 additions & 0 deletions

@@ -98,4 +98,6 @@ extern u64 x86_read_arch_cap_msr(void);
 int intel_find_matching_signature(void *mc, unsigned int csig, int cpf);
 int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type);
 
+extern struct cpumask cpus_stop_mask;
+
 #endif /* _ASM_X86_CPU_H */

arch/x86/kernel/process.c

Lines changed: 21 additions & 2 deletions

@@ -759,13 +759,23 @@ bool xen_set_default_idle(void)
 }
 #endif
 
+struct cpumask cpus_stop_mask;
+
 void __noreturn stop_this_cpu(void *dummy)
 {
+	unsigned int cpu = smp_processor_id();
+
 	local_irq_disable();
+
 	/*
-	 * Remove this CPU:
+	 * Remove this CPU from the online mask and disable it
+	 * unconditionally. This might be redundant in case that the reboot
+	 * vector was handled late and stop_other_cpus() sent an NMI.
+	 *
+	 * According to SDM and APM NMIs can be accepted even after soft
+	 * disabling the local APIC.
 	 */
-	set_cpu_online(smp_processor_id(), false);
+	set_cpu_online(cpu, false);
 	disable_local_APIC();
 	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 
@@ -783,6 +793,15 @@ void __noreturn stop_this_cpu(void *dummy)
 	 */
 	if (cpuid_eax(0x8000001f) & BIT(0))
 		native_wbinvd();
+
+	/*
+	 * This brings a cache line back and dirties it, but
+	 * native_stop_other_cpus() will overwrite cpus_stop_mask after it
+	 * observed that all CPUs reported stop. This write will invalidate
+	 * the related cache line on this CPU.
+	 */
+	cpumask_clear_cpu(cpu, &cpus_stop_mask);
+
 	for (;;) {
 		/*
 		 * Use native_halt() so that memory contents don't change

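The changelog's point about the unchecked readout is easy to demonstrate:
CPUID leaf 0x8000001f must not be consulted before verifying that the CPU
implements it, because an out-of-range leaf returns the data of the highest
supported leaf instead. A hedged user-space sketch of such a
leaf-availability check, using GCC's and Clang's <cpuid.h> rather than the
kernel's cpuid helpers:

/* cc -O2 sme_check.c */
#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

static bool sme_bit_valid(void)
{
	unsigned int eax, ebx, ecx, edx;

	/*
	 * Leaf 0x80000000 reports the highest supported extended leaf;
	 * __get_cpuid_max() returns 0 if CPUID is not available at all.
	 */
	if (__get_cpuid_max(0x80000000, NULL) < 0x8000001f)
		return false;

	__cpuid(0x8000001f, eax, ebx, ecx, edx);
	return eax & 1;	/* EAX bit 0: Secure Memory Encryption */
}

int main(void)
{
	printf("SME CPUID bit reliably readable and set: %d\n",
	       sme_bit_valid());
	return 0;
}
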
arch/x86/kernel/smp.c

Lines changed: 41 additions & 21 deletions

@@ -27,6 +27,7 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
+#include <asm/cpu.h>
 #include <asm/idtentry.h>
 #include <asm/nmi.h>
 #include <asm/mce.h>
@@ -146,31 +147,43 @@ static int register_stop_handler(void)
 
 static void native_stop_other_cpus(int wait)
 {
-	unsigned long flags;
-	unsigned long timeout;
+	unsigned int cpu = smp_processor_id();
+	unsigned long flags, timeout;
 
 	if (reboot_force)
 		return;
 
-	/*
-	 * Use an own vector here because smp_call_function
-	 * does lots of things not suitable in a panic situation.
-	 */
+	/* Only proceed if this is the first CPU to reach this code */
+	if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
+		return;
 
 	/*
-	 * We start by using the REBOOT_VECTOR irq.
-	 * The irq is treated as a sync point to allow critical
-	 * regions of code on other cpus to release their spin locks
-	 * and re-enable irqs. Jumping straight to an NMI might
-	 * accidentally cause deadlocks with further shutdown/panic
-	 * code. By syncing, we give the cpus up to one second to
-	 * finish their work before we force them off with the NMI.
+	 * 1) Send an IPI on the reboot vector to all other CPUs.
+	 *
+	 *    The other CPUs should react on it after leaving critical
+	 *    sections and re-enabling interrupts. They might still hold
+	 *    locks, but there is nothing which can be done about that.
+	 *
+	 * 2) Wait for all other CPUs to report that they reached the
+	 *    HLT loop in stop_this_cpu()
+	 *
+	 * 3) If #2 timed out send an NMI to the CPUs which did not
+	 *    yet report
+	 *
+	 * 4) Wait for all other CPUs to report that they reached the
+	 *    HLT loop in stop_this_cpu()
+	 *
+	 * #3 can obviously race against a CPU reaching the HLT loop late.
+	 * That CPU will have reported already and the "have all CPUs
+	 * reached HLT" condition will be true despite the fact that the
+	 * other CPU is still handling the NMI. Again, there is no
+	 * protection against that as "disabled" APICs still respond to
+	 * NMIs.
 	 */
-	if (num_online_cpus() > 1) {
-		/* did someone beat us here? */
-		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
-			return;
+	cpumask_copy(&cpus_stop_mask, cpu_online_mask);
+	cpumask_clear_cpu(cpu, &cpus_stop_mask);
 
+	if (!cpumask_empty(&cpus_stop_mask)) {
 		/* sync above data before sending IRQ */
 		wmb();
 
@@ -183,12 +196,12 @@ static void native_stop_other_cpus(int wait)
 		 * CPUs reach shutdown state.
 		 */
 		timeout = USEC_PER_SEC;
-		while (num_online_cpus() > 1 && timeout--)
+		while (!cpumask_empty(&cpus_stop_mask) && timeout--)
 			udelay(1);
 	}
 
 	/* if the REBOOT_VECTOR didn't work, try with the NMI */
-	if (num_online_cpus() > 1) {
+	if (!cpumask_empty(&cpus_stop_mask)) {
 		/*
 		 * If NMI IPI is enabled, try to register the stop handler
 		 * and send the IPI. In any case try to wait for the other
@@ -200,22 +213,29 @@ static void native_stop_other_cpus(int wait)
 
 			pr_emerg("Shutting down cpus with NMI\n");
 
-			apic_send_IPI_allbutself(NMI_VECTOR);
+			for_each_cpu(cpu, &cpus_stop_mask)
+				apic->send_IPI(cpu, NMI_VECTOR);
 		}
 		/*
 		 * Don't wait longer than 10 ms if the caller didn't
 		 * request it. If wait is true, the machine hangs here if
 		 * one or more CPUs do not reach shutdown state.
 		 */
		timeout = USEC_PER_MSEC * 10;
-		while (num_online_cpus() > 1 && (wait || timeout--))
+		while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--))
 			udelay(1);
 	}
 
 	local_irq_save(flags);
 	disable_local_APIC();
 	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 	local_irq_restore(flags);
+
+	/*
+	 * Ensure that the cpus_stop_mask cache lines are invalidated on
+	 * the other CPUs. See comment vs. SME in stop_this_cpu().
+	 */
+	cpumask_clear(&cpus_stop_mask);
 }
 
 /*

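The for_each_cpu() fallback above kicks only the CPUs whose bit is still
set, instead of broadcasting to everyone. A standalone sketch of that
escalation step (nmi_poke() is a hypothetical stand-in for
apic->send_IPI(cpu, NMI_VECTOR)):

/* cc -O2 stragglers.c */
#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical targeted kick; the kernel sends a real NMI IPI here. */
static void nmi_poke(int cpu)
{
	printf("sending fallback NMI to CPU %d\n", cpu);
}

/* After the first wait timed out, poke only the unreported CPUs. */
static void poke_stragglers(atomic_ulong *mask, int ncpus)
{
	unsigned long pending = atomic_load(mask);

	for (int cpu = 0; cpu < ncpus; cpu++) {
		if (pending & (1UL << cpu))
			nmi_poke(cpu);
	}
}

int main(void)
{
	atomic_ulong mask = 0xA;	/* CPUs 1 and 3 have not reported */

	poke_stragglers(&mask, 4);
	return 0;
}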