Skip to content

Commit bb511d4

Browse files
committed
Merge tag 'edac_updates_for_v6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras
Pull Intel EDAC fixes from Tony Luck:

- The old igen6 driver could lose pending events during initialization.
- Sapphire Rapids workstations have fewer memory controllers than their bigger siblings, which confused the driver.

* tag 'edac_updates_for_v6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
  EDAC/igen6: Fix the issue of no error events
  EDAC/i10nm: Skip the absent memory controllers
2 parents a55b0a0 + ce53ad8 commit bb511d4

File tree

2 files changed

+53
-9
lines changed

2 files changed

+53
-9
lines changed

drivers/edac/i10nm_base.c

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -658,13 +658,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi
658658
return mdev;
659659
}
660660

661+
/**
662+
* i10nm_imc_absent() - Check whether the memory controller @imc is absent
663+
*
664+
* @imc: Pointer to the memory controller EDAC device structure to probe.
665+
*
666+
* Return: true if res_cfg->type is SPR and the "mcmtr" register reads ~0 on every channel of @imc, false otherwise.
667+
*/
668+
static bool i10nm_imc_absent(struct skx_imc *imc)
669+
{
670+
u32 mcmtr;
671+
int i;
672+
673+
switch (res_cfg->type) {
674+
case SPR:
675+
for (i = 0; i < res_cfg->ddr_chan_num; i++) {
676+
mcmtr = I10NM_GET_MCMTR(imc, i);
677+
edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr);
678+
if (mcmtr != ~0)
679+
return false; /* one channel whose mcmtr is not ~0 is enough: controller is present */
680+
}
681+
682+
/*
683+
* Some workstations' absent memory controllers still
684+
* appear as PCIe devices, misleading the EDAC driver.
685+
* It has been observed that the MMIO registers of these
686+
* absent memory controllers consistently hold the value ~0.
687+
*
688+
* We therefore identify a memory controller as absent when
689+
* its MMIO register "mcmtr" == ~0 in all of its channels.
690+
*/
691+
return true;
692+
default: /* no absence check for other res_cfg types: never reported absent */
693+
return false;
694+
}
695+
}
696+
661697
static int i10nm_get_ddr_munits(void)
662698
{
663699
struct pci_dev *mdev;
664700
void __iomem *mbase;
665701
unsigned long size;
666702
struct skx_dev *d;
667-
int i, j = 0;
703+
int i, lmc, j = 0;
668704
u32 reg, off;
669705
u64 base;
670706

@@ -690,7 +726,7 @@ static int i10nm_get_ddr_munits(void)
690726
edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
691727
j++, base, reg);
692728

693-
for (i = 0; i < res_cfg->ddr_imc_num; i++) {
729+
for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) {
694730
mdev = get_ddr_munit(d, i, &off, &size);
695731

696732
if (i == 0 && !mdev) {
@@ -700,8 +736,6 @@ static int i10nm_get_ddr_munits(void)
700736
if (!mdev)
701737
continue;
702738

703-
d->imc[i].mdev = mdev;
704-
705739
edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n",
706740
i, base + off, size, reg);
707741

@@ -712,7 +746,17 @@ static int i10nm_get_ddr_munits(void)
712746
return -ENODEV;
713747
}
714748

715-
d->imc[i].mbase = mbase;
749+
d->imc[lmc].mbase = mbase;
750+
if (i10nm_imc_absent(&d->imc[lmc])) {
751+
pci_dev_put(mdev);
752+
iounmap(mbase);
753+
d->imc[lmc].mbase = NULL;
754+
edac_dbg(2, "Skip absent mc%d\n", i);
755+
continue;
756+
} else {
757+
d->imc[lmc].mdev = mdev;
758+
lmc++;
759+
}
716760
}
717761
}
718762

drivers/edac/igen6_edac.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
#include "edac_mc.h"
2828
#include "edac_module.h"
2929

30-
#define IGEN6_REVISION "v2.5"
30+
#define IGEN6_REVISION "v2.5.1"
3131

3232
#define EDAC_MOD_STR "igen6_edac"
3333
#define IGEN6_NMI_NAME "igen6_ibecc"
@@ -1216,9 +1216,6 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
12161216
INIT_WORK(&ecclog_work, ecclog_work_cb);
12171217
init_irq_work(&ecclog_irq_work, ecclog_irq_work_cb);
12181218

1219-
/* Check if any pending errors before registering the NMI handler */
1220-
ecclog_handler();
1221-
12221219
rc = register_err_handler();
12231220
if (rc)
12241221
goto fail3;
@@ -1230,6 +1227,9 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
12301227
goto fail4;
12311228
}
12321229

1230+
/* Check for any errors that became pending before/during the registration of the error handler */
1231+
ecclog_handler();
1232+
12331233
igen6_debug_setup();
12341234
return 0;
12351235
fail4:

0 commit comments

Comments
 (0)