Skip to content

Commit c3629dd

Browse files
committed
x86/mce: Prevent duplicate error records
A legitimate use case of the MCA infrastructure is to have the firmware log all uncorrectable errors and also have the OS see all correctable errors. The uncorrectable, UCNA errors are usually configured to be reported through an SMI. CMCI, which is the correctable error reporting interrupt, uses SMI too, and having both enabled leads to unnecessary overhead. So what ends up happening is that people disable CMCI in the wild and leave only the UCNA SMI on. When CMCI is disabled, the MCA infrastructure resorts to polling the MCA banks. If an MCA MSR is shared between the logical threads, one error ends up getting logged multiple times as the polling runs on every logical thread. Therefore, introduce locking on the Intel side of the polling routine to prevent such duplicate error records from appearing. Based on a patch by Aristeu Rozanski <aris@ruivo.org>. Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Tested-by: Tony Luck <tony.luck@intel.com> Acked-by: Aristeu Rozanski <aris@ruivo.org> Link: https://lore.kernel.org/r/20230515143225.GC4090740@cathedrallabs.org
1 parent fdf0eaf commit c3629dd

File tree

3 files changed

+27
-2
lines changed

3 files changed

+27
-2
lines changed

arch/x86/kernel/cpu/mce/core.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1608,6 +1608,13 @@ static void __start_timer(struct timer_list *t, unsigned long interval)
16081608
local_irq_restore(flags);
16091609
}
16101610

1611+
static void mc_poll_banks_default(void)
1612+
{
1613+
machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1614+
}
1615+
1616+
void (*mc_poll_banks)(void) = mc_poll_banks_default;
1617+
16111618
static void mce_timer_fn(struct timer_list *t)
16121619
{
16131620
struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
@@ -1618,7 +1625,7 @@ static void mce_timer_fn(struct timer_list *t)
16181625
iv = __this_cpu_read(mce_next_interval);
16191626

16201627
if (mce_available(this_cpu_ptr(&cpu_info))) {
1621-
machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1628+
mc_poll_banks();
16221629

16231630
if (mce_intel_cmci_poll()) {
16241631
iv = mce_adjust_timer(iv);

arch/x86/kernel/cpu/mce/intel.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,13 @@ static DEFINE_PER_CPU(int, cmci_backoff_cnt);
5656
*/
5757
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
5858

59+
/*
60+
* On systems that do support CMCI but it's disabled, polling for MCEs can
61+
* cause the same event to be reported multiple times because IA32_MCi_STATUS
62+
* is shared by the same package.
63+
*/
64+
static DEFINE_SPINLOCK(cmci_poll_lock);
65+
5966
#define CMCI_THRESHOLD 1
6067
#define CMCI_POLL_INTERVAL (30 * HZ)
6168
#define CMCI_STORM_INTERVAL (HZ)
@@ -426,12 +433,22 @@ void cmci_disable_bank(int bank)
426433
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
427434
}
428435

436+
/* Bank polling function when CMCI is disabled. */
437+
static void cmci_mc_poll_banks(void)
438+
{
439+
spin_lock(&cmci_poll_lock);
440+
machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
441+
spin_unlock(&cmci_poll_lock);
442+
}
443+
429444
void intel_init_cmci(void)
430445
{
431446
int banks;
432447

433-
if (!cmci_supported(&banks))
448+
if (!cmci_supported(&banks)) {
449+
mc_poll_banks = cmci_mc_poll_banks;
434450
return;
451+
}
435452

436453
mce_threshold_vector = intel_threshold_interrupt;
437454
cmci_discover(banks);

arch/x86/kernel/cpu/mce/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,4 +274,5 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
274274
return 0;
275275
}
276276

277+
extern void (*mc_poll_banks)(void);
277278
#endif /* __X86_MCE_INTERNAL_H__ */

0 commit comments

Comments
 (0)