Skip to content
This repository was archived by the owner on Nov 8, 2023. It is now read-only.

Commit 3186b61

Browse files
paulmckrcu authored and Ingo Molnar committed
x86/nmi: Upgrade NMI backtrace stall checks & messages
The commit to improve NMI stall debuggability:

  344da54 ("x86/nmi: Print reasons why backtrace NMIs are ignored")

... has shown value, but widespread use has also identified a few opportunities for improvement. The systems have (as usual) shown far more creativity than that commit's author, demonstrating yet again that failing CPUs can do whatever they want. In addition, the current message format is less friendly than one might like to those attempting to use these messages to identify failing CPUs.

Therefore, separately flag CPUs that, during the full time that the stack-backtrace request was waiting, were always in an NMI handler, were never in an NMI handler, or exited one NMI handler. Also, split the message identifying the CPU and the time since that CPU's last NMI-related activity so that a single line identifies the CPU without any other variable information, greatly reducing the processing overhead required to identify repeat-offender CPUs.

Co-developed-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/ab4d70c8-c874-42dc-b206-643018922393@paulmck-laptop
1 parent b6540de commit 3186b61

File tree

1 file changed

+14
-10
lines changed

1 file changed

+14
-10
lines changed

arch/x86/kernel/nmi.c

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -580,7 +580,7 @@ EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
580580

581581
static char *nmi_check_stall_msg[] = {
582582
/* */
583-
/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */
583+
/* +--------- nmi_seq & 0x1: CPU is currently in NMI handler. */
584584
/* | +------ cpu_is_offline(cpu) */
585585
/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */
586586
/* | | | NMI handler has been invoked. */
@@ -628,22 +628,26 @@ void nmi_backtrace_stall_check(const struct cpumask *btp)
628628
nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
629629
if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
630630
msgp = "CPU entered NMI handler function, but has not exited";
631-
} else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) {
632-
msgp = "CPU is handling NMIs";
633-
} else {
634-
idx = ((nsp->idt_seq_snap & 0x1) << 2) |
631+
} else if (nsp->idt_nmi_seq_snap == nmi_seq ||
632+
nsp->idt_nmi_seq_snap + 1 == nmi_seq) {
633+
idx = ((nmi_seq & 0x1) << 2) |
635634
(cpu_is_offline(cpu) << 1) |
636635
(nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
637636
msgp = nmi_check_stall_msg[idx];
638637
if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
639638
modp = ", but OK because ignore_nmis was set";
640-
if (nmi_seq & 0x1)
641-
msghp = " (CPU currently in NMI handler function)";
642-
else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
639+
if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
643640
msghp = " (CPU exited one NMI handler function)";
641+
else if (nmi_seq & 0x1)
642+
msghp = " (CPU currently in NMI handler function)";
643+
else
644+
msghp = " (CPU was never in an NMI handler function)";
645+
} else {
646+
msgp = "CPU is handling NMIs";
644647
}
645-
pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n",
646-
__func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies));
648+
pr_alert("%s: CPU %d: %s%s%s\n", __func__, cpu, msgp, modp, msghp);
649+
pr_alert("%s: last activity: %lu jiffies ago.\n",
650+
__func__, j - READ_ONCE(nsp->recv_jiffies));
647651
}
648652
}
649653

0 commit comments

Comments
 (0)