Skip to content

Commit a5db1b2

Browse files
qzhuo2aegl
authored and committed
EDAC/ie31200: Switch Raptor Lake-S to interrupt mode
Raptor Lake-S SoCs notify correctable memory errors via CMCI (Corrected Machine Check Interrupt). Switch Raptor Lake-S EDAC support from polling to interrupt mode by registering the callback to the MCE decode notifier chain. Note that as Raptor Lake-S SoCs may not recover from uncorrectable memory errors, the system will hang as soon as this type of error occurs, and the registered callback on the MCE decode chain will not be executed. This is the expected behavior. Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> Tested-by: Gary Wang <gary.c.wang@intel.com> Link: https://lore.kernel.org/r/20250310011411.31685-12-qiuxu.zhuo@intel.com
1 parent d074228 commit a5db1b2

File tree

2 files changed

+78
-7
lines changed

2 files changed

+78
-7
lines changed

drivers/edac/Kconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ config EDAC_I3200
168168

169169
config EDAC_IE31200
170170
tristate "Intel e312xx"
171-
depends on PCI && X86
171+
depends on PCI && X86 && X86_MCE_INTEL
172172
help
173173
Support for error detection and correction on the Intel
174174
E3-1200 based DRAM controllers.

drivers/edac/ie31200_edac.c

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
#include <linux/edac.h>
5252

5353
#include <linux/io-64-nonatomic-lo-hi.h>
54+
#include <asm/mce.h>
5455
#include "edac_module.h"
5556

5657
#define EDAC_MOD_STR "ie31200_edac"
@@ -123,6 +124,7 @@ static int ie31200_registered = 1;
123124

124125
struct res_config {
125126
enum mem_type mtype;
127+
bool cmci;
126128
int imc_num;
127129
/* Host MMIO configuration register */
128130
u64 reg_mchbar_mask;
@@ -172,6 +174,7 @@ struct ie31200_error_info {
172174
u16 errsts;
173175
u16 errsts2;
174176
u64 eccerrlog[IE31200_CHANNELS];
177+
u64 erraddr;
175178
};
176179

177180
static const struct ie31200_dev_info ie31200_devs[] = {
@@ -327,13 +330,13 @@ static void ie31200_process_error_info(struct mem_ctl_info *mci,
327330
log = info->eccerrlog[channel];
328331
if (log & cfg->reg_eccerrlog_ue_mask) {
329332
edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
330-
0, 0, 0,
333+
info->erraddr >> PAGE_SHIFT, 0, 0,
331334
field_get(cfg->reg_eccerrlog_rank_mask, log),
332335
channel, -1,
333336
"ie31200 UE", "");
334337
} else if (log & cfg->reg_eccerrlog_ce_mask) {
335338
edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
336-
0, 0,
339+
info->erraddr >> PAGE_SHIFT, 0,
337340
field_get(cfg->reg_eccerrlog_syndrome_mask, log),
338341
field_get(cfg->reg_eccerrlog_rank_mask, log),
339342
channel, -1,
@@ -342,14 +345,20 @@ static void ie31200_process_error_info(struct mem_ctl_info *mci,
342345
}
343346
}
344347

345-
static void ie31200_check(struct mem_ctl_info *mci)
348+
/*
 * Common error-check path for one memory controller.
 *
 * @mci: memory controller to check.
 * @mce: machine-check record, or NULL. When non-NULL, mce->addr is stored
 *       as the error address; otherwise the error address is set to 0.
 *
 * Fills a local ie31200_error_info via ie31200_get_and_clear_error_info()
 * and hands it to ie31200_process_error_info().
 */
static void __ie31200_check(struct mem_ctl_info *mci, struct mce *mce)
346349
{
347350
struct ie31200_error_info info;
348351

352+
info.erraddr = mce ? mce->addr : 0;
349353
ie31200_get_and_clear_error_info(mci, &info);
350354
ie31200_process_error_info(mci, &info);
351355
}
352356

357+
/*
 * Check entry point that has no machine-check record available: forwards
 * to __ie31200_check() with a NULL mce, so the error address is 0.
 */
static void ie31200_check(struct mem_ctl_info *mci)
358+
{
359+
__ie31200_check(mci, NULL);
360+
}
361+
353362
static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev, struct res_config *cfg, int mc)
354363
{
355364
union {
@@ -459,7 +468,7 @@ static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, in
459468
mci->mod_name = EDAC_MOD_STR;
460469
mci->ctl_name = ie31200_devs[mc].ctl_name;
461470
mci->dev_name = pci_name(pdev);
462-
mci->edac_check = ie31200_check;
471+
mci->edac_check = cfg->cmci ? NULL : ie31200_check;
463472
mci->ctl_page_to_phys = NULL;
464473
priv = mci->pvt_info;
465474
priv->window = window;
@@ -499,6 +508,58 @@ static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, in
499508
return ret;
500509
}
501510

511+
/*
 * Run __ie31200_check() with @mce on every populated slot of
 * ie31200_pvt.priv[] (up to IE31200_IMC_NUM); empty (NULL) slots are
 * skipped.
 */
static void mce_check(struct mce *mce)
512+
{
513+
struct ie31200_priv *priv;
514+
int i;
515+
516+
for (i = 0; i < IE31200_IMC_NUM; i++) {
517+
priv = ie31200_pvt.priv[i];
518+
if (!priv)
519+
continue;
520+
521+
__ie31200_check(priv->mci, mce);
522+
}
523+
}
524+
525+
/*
 * Notifier callback for machine-check records.
 *
 * @nb:   notifier block (unused here).
 * @val:  notifier event value (unused here).
 * @data: pointer to the struct mce being decoded.
 *
 * Records already flagged MCE_HANDLED_CEC, and records that fail the
 * memory-error status test below, are ignored. Otherwise the record is
 * dumped via edac_dbg(), passed to mce_check(), and flagged
 * MCE_HANDLED_EDAC. Always returns NOTIFY_DONE.
 *
 * NOTE(review): the (status & 0xefff) >> 7 == 1 test presumably matches the
 * Intel memory-controller MCA error-code pattern while ignoring bit 12;
 * confirm against the Intel SDM machine-check chapter.
 */
static int mce_handler(struct notifier_block *nb, unsigned long val, void *data)
526+
{
527+
struct mce *mce = (struct mce *)data;
528+
char *type;
529+
530+
if (mce->kflags & MCE_HANDLED_CEC)
531+
return NOTIFY_DONE;
532+
533+
/*
534+
* Ignore unless this is a memory related error.
535+
* Don't check MCI_STATUS_ADDRV since it's not set on some CPUs.
536+
*/
537+
if ((mce->status & 0xefff) >> 7 != 1)
538+
return NOTIFY_DONE;
539+
540+
type = mce->mcgstatus & MCG_STATUS_MCIP ? "Exception" : "Event";
541+
542+
edac_dbg(0, "CPU %d: Machine Check %s: 0x%llx Bank %d: 0x%llx\n",
543+
mce->extcpu, type, mce->mcgstatus,
544+
mce->bank, mce->status);
545+
edac_dbg(0, "TSC 0x%llx\n", mce->tsc);
546+
edac_dbg(0, "ADDR 0x%llx\n", mce->addr);
547+
edac_dbg(0, "MISC 0x%llx\n", mce->misc);
548+
edac_dbg(0, "PROCESSOR %u:0x%x TIME %llu SOCKET %u APIC 0x%x\n",
549+
mce->cpuvendor, mce->cpuid, mce->time,
550+
mce->socketid, mce->apicid);
551+
552+
mce_check(mce);
553+
mce->kflags |= MCE_HANDLED_EDAC;
554+
555+
return NOTIFY_DONE;
556+
}
557+
558+
/*
 * Notifier block that routes machine-check records to mce_handler() at
 * MCE_PRIO_EDAC priority. It is passed to mce_register_decode_chain() /
 * mce_unregister_decode_chain() when the configuration has cmci set.
 */
static struct notifier_block ie31200_mce_dec = {
559+
.notifier_call = mce_handler,
560+
.priority = MCE_PRIO_EDAC,
561+
};
562+
502563
static void ie31200_unregister_mcis(void)
503564
{
504565
struct ie31200_priv *priv;
@@ -534,6 +595,13 @@ static int ie31200_probe1(struct pci_dev *pdev, struct res_config *cfg)
534595
goto fail_register;
535596
}
536597

598+
if (cfg->cmci) {
599+
mce_register_decode_chain(&ie31200_mce_dec);
600+
edac_op_state = EDAC_OPSTATE_INT;
601+
} else {
602+
edac_op_state = EDAC_OPSTATE_POLL;
603+
}
604+
537605
/* get this far and it's successful. */
538606
edac_dbg(3, "MC: success\n");
539607
return 0;
@@ -560,9 +628,13 @@ static int ie31200_init_one(struct pci_dev *pdev,
560628

561629
/*
 * PCI remove callback: drops the reference on mci_pdev, unregisters the
 * MCE decode-chain notifier when the configuration uses CMCI, and then
 * unregisters the memory controllers.
 *
 * NOTE(review): priv is taken from ie31200_pvt.priv[0] and dereferenced
 * without a NULL check, whereas mce_check() treats slots of the same array
 * as possibly NULL. Confirm that slot 0 is always populated whenever this
 * callback can run.
 *
 * NOTE(review): the notifier is unregistered before
 * ie31200_unregister_mcis(), presumably so mce_handler() cannot run against
 * controllers that are being torn down - confirm.
 */
static void ie31200_remove_one(struct pci_dev *pdev)
562630
{
631+
struct ie31200_priv *priv = ie31200_pvt.priv[0];
632+
563633
edac_dbg(0, "\n");
564634
pci_dev_put(mci_pdev);
565635
mci_pdev = NULL;
636+
if (priv->cfg->cmci)
637+
mce_unregister_decode_chain(&ie31200_mce_dec);
566638
ie31200_unregister_mcis();
567639
}
568640

@@ -612,6 +684,7 @@ static struct res_config skl_cfg = {
612684

613685
struct res_config rpl_s_cfg = {
614686
.mtype = MEM_DDR5,
687+
.cmci = true,
615688
.imc_num = 2,
616689
.reg_mchbar_mask = GENMASK_ULL(41, 17),
617690
.reg_mchbar_window_size = BIT_ULL(16),
@@ -677,8 +750,6 @@ static int __init ie31200_init(void)
677750
int pci_rc, i;
678751

679752
edac_dbg(3, "MC:\n");
680-
/* Ensure that the OPSTATE is set correctly for POLL or NMI */
681-
opstate_init();
682753

683754
pci_rc = pci_register_driver(&ie31200_driver);
684755
if (pci_rc < 0)

0 commit comments

Comments
 (0)