Skip to content

Commit c545f5e

Browse files
qzhuo2 authored and aegl committed
EDAC/i10nm: Skip the absent memory controllers
Some Sapphire Rapids workstations' absent memory controllers still appear as PCIe devices that fool the i10nm_edac driver and result in "shift exponent -66 is negative" call traces from skx_get_dimm_info().

Skip the absent memory controllers to avoid the call traces.

Reported-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Closes: https://lore.kernel.org/linux-edac/CAAd53p41Ku1m1rapeqb1xtD+kKuk+BaUW=dumuoF0ZO3GhFjFA@mail.gmail.com/T/#m5de16dce60a8c836ec235868c7c16e3fefad0cc2
Tested-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reported-by: Koba Ko <koba.ko@canonical.com>
Closes: https://lore.kernel.org/linux-edac/SA1PR11MB71305B71CCCC3D9305835202892AA@SA1PR11MB7130.namprd11.prod.outlook.com/T/#t
Tested-by: Koba Ko <koba.ko@canonical.com>
Fixes: d4dc89d ("EDAC, i10nm: Add a driver for Intel 10nm server processors")
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20230710013232.59712-1-qiuxu.zhuo@intel.com
1 parent 6eaae19 commit c545f5e

File tree

1 file changed

+49
-5
lines changed

1 file changed

+49
-5
lines changed

drivers/edac/i10nm_base.c

Lines changed: 49 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -658,13 +658,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi
658658
return mdev;
659659
}
660660

661+
/**
662+
* i10nm_imc_absent() - Check whether the memory controller @imc is absent
663+
*
664+
* @imc: Pointer to the memory controller EDAC device structure to check.
665+
*
666+
* Return: true if the memory controller EDAC device is absent, false otherwise. The check is only performed when res_cfg->type is SPR; every other type returns false.
667+
*/
668+
static bool i10nm_imc_absent(struct skx_imc *imc)
669+
{
670+
u32 mcmtr;
671+
int i;
672+
673+
switch (res_cfg->type) {
674+
case SPR:
675+
for (i = 0; i < res_cfg->ddr_chan_num; i++) {
676+
mcmtr = I10NM_GET_MCMTR(imc, i);
677+
edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr);
678+
if (mcmtr != ~0)
679+
return false;
680+
}
681+
682+
/*
683+
* Some workstations' absent memory controllers still
684+
* appear as PCIe devices, misleading the EDAC driver.
685+
* It has been observed that the MMIO registers of these
686+
* absent memory controllers consistently hold the value ~0.
687+
*
688+
* We therefore identify a memory controller as absent when
689+
* its MMIO register "mcmtr" reads ~0 in all of its channels.
690+
*/
691+
return true;
692+
default:
693+
return false;
694+
}
696+
661697
static int i10nm_get_ddr_munits(void)
662698
{
663699
struct pci_dev *mdev;
664700
void __iomem *mbase;
665701
unsigned long size;
666702
struct skx_dev *d;
667-
int i, j = 0;
703+
int i, lmc, j = 0;
668704
u32 reg, off;
669705
u64 base;
670706

@@ -690,7 +726,7 @@ static int i10nm_get_ddr_munits(void)
690726
edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
691727
j++, base, reg);
692728

693-
for (i = 0; i < res_cfg->ddr_imc_num; i++) {
729+
for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) {
694730
mdev = get_ddr_munit(d, i, &off, &size);
695731

696732
if (i == 0 && !mdev) {
@@ -700,8 +736,6 @@ static int i10nm_get_ddr_munits(void)
700736
if (!mdev)
701737
continue;
702738

703-
d->imc[i].mdev = mdev;
704-
705739
edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n",
706740
i, base + off, size, reg);
707741

@@ -712,7 +746,17 @@ static int i10nm_get_ddr_munits(void)
712746
return -ENODEV;
713747
}
714748

715-
d->imc[i].mbase = mbase;
749+
d->imc[lmc].mbase = mbase;
750+
if (i10nm_imc_absent(&d->imc[lmc])) {
751+
pci_dev_put(mdev);
752+
iounmap(mbase);
753+
d->imc[lmc].mbase = NULL;
754+
edac_dbg(2, "Skip absent mc%d\n", i);
755+
continue;
756+
} else {
757+
d->imc[lmc].mdev = mdev;
758+
lmc++;
759+
}
716760
}
717761
}
718762

0 commit comments

Comments
 (0)