Skip to content

Commit 28c59d9

Browse files
committed
Merge tag 'ras_core_for_v6.6_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Borislav Petkov:

 - Add a quirk for AMD Zen machines where Instruction Fetch unit poison
   consumption MCEs are not delivered synchronously but still within the
   same context, which can lead to erroneously increased error severity
   and unneeded kernel panics

 - Do not log errors caught by polling shared MCA banks as they
   materialize as duplicated error records otherwise

* tag 'ras_core_for_v6.6_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/MCE: Always save CS register on AMD Zen IF Poison errors
  x86/mce: Prevent duplicate error records
2 parents 7e5e832 + 4240e2e commit 28c59d9

File tree

3 files changed

+57
-3
lines changed

3 files changed

+57
-3
lines changed

arch/x86/kernel/cpu/mce/core.c

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -842,6 +842,26 @@ static noinstr bool quirk_skylake_repmov(void)
842842
return false;
843843
}
844844

845+
/*
846+
* Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
847+
* errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
848+
*
849+
* However, the context is still valid, so save the "cs" register for later use.
850+
*
851+
* The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
852+
*
853+
* The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
854+
*/
855+
static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
856+
{
857+
if (bank != 1)
858+
return;
859+
if (!(m->status & MCI_STATUS_POISON))
860+
return;
861+
862+
m->cs = regs->cs;
863+
}
864+
845865
/*
846866
* Do a quick check if any of the events requires a panic.
847867
* This decides if we keep the events around or clear them.
@@ -861,6 +881,9 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo
861881
if (mce_flags.snb_ifu_quirk)
862882
quirk_sandybridge_ifu(i, m, regs);
863883

884+
if (mce_flags.zen_ifu_quirk)
885+
quirk_zen_ifu(i, m, regs);
886+
864887
m->bank = i;
865888
if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
866889
mce_read_aux(m, i);
@@ -1608,6 +1631,13 @@ static void __start_timer(struct timer_list *t, unsigned long interval)
16081631
local_irq_restore(flags);
16091632
}
16101633

1634+
static void mc_poll_banks_default(void)
1635+
{
1636+
machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1637+
}
1638+
1639+
void (*mc_poll_banks)(void) = mc_poll_banks_default;
1640+
16111641
static void mce_timer_fn(struct timer_list *t)
16121642
{
16131643
struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
@@ -1618,7 +1648,7 @@ static void mce_timer_fn(struct timer_list *t)
16181648
iv = __this_cpu_read(mce_next_interval);
16191649

16201650
if (mce_available(this_cpu_ptr(&cpu_info))) {
1621-
machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1651+
mc_poll_banks();
16221652

16231653
if (mce_intel_cmci_poll()) {
16241654
iv = mce_adjust_timer(iv);
@@ -1842,6 +1872,9 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
18421872
if (c->x86 == 0x15 && c->x86_model <= 0xf)
18431873
mce_flags.overflow_recov = 1;
18441874

1875+
if (c->x86 >= 0x17 && c->x86 <= 0x1A)
1876+
mce_flags.zen_ifu_quirk = 1;
1877+
18451878
}
18461879

18471880
if (c->x86_vendor == X86_VENDOR_INTEL) {

arch/x86/kernel/cpu/mce/intel.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,13 @@ static DEFINE_PER_CPU(int, cmci_backoff_cnt);
5656
*/
5757
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
5858

59+
/*
60+
* On systems that do support CMCI but it's disabled, polling for MCEs can
61+
* cause the same event to be reported multiple times because IA32_MCi_STATUS
62+
* is shared by the same package.
63+
*/
64+
static DEFINE_SPINLOCK(cmci_poll_lock);
65+
5966
#define CMCI_THRESHOLD 1
6067
#define CMCI_POLL_INTERVAL (30 * HZ)
6168
#define CMCI_STORM_INTERVAL (HZ)
@@ -426,12 +433,22 @@ void cmci_disable_bank(int bank)
426433
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
427434
}
428435

436+
/* Bank polling function when CMCI is disabled. */
437+
static void cmci_mc_poll_banks(void)
438+
{
439+
spin_lock(&cmci_poll_lock);
440+
machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
441+
spin_unlock(&cmci_poll_lock);
442+
}
443+
429444
void intel_init_cmci(void)
430445
{
431446
int banks;
432447

433-
if (!cmci_supported(&banks))
448+
if (!cmci_supported(&banks)) {
449+
mc_poll_banks = cmci_mc_poll_banks;
434450
return;
451+
}
435452

436453
mce_threshold_vector = intel_threshold_interrupt;
437454
cmci_discover(banks);

arch/x86/kernel/cpu/mce/internal.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,9 @@ struct mce_vendor_flags {
157157
*/
158158
smca : 1,
159159

160+
/* Zen IFU quirk */
161+
zen_ifu_quirk : 1,
162+
160163
/* AMD-style error thresholding banks present. */
161164
amd_threshold : 1,
162165

@@ -172,7 +175,7 @@ struct mce_vendor_flags {
172175
/* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
173176
skx_repmov_quirk : 1,
174177

175-
__reserved_0 : 56;
178+
__reserved_0 : 55;
176179
};
177180

178181
extern struct mce_vendor_flags mce_flags;
@@ -274,4 +277,5 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
274277
return 0;
275278
}
276279

280+
extern void (*mc_poll_banks)(void);
277281
#endif /* __X86_MCE_INTERNAL_H__ */

0 commit comments

Comments
 (0)