Skip to content

Commit 6ac0788

Browse files
ktbowman-amd authored and djbw committed
cxl/pci: Add RCH downstream port error logging
RCH downstream port error logging is missing in the current CXL driver. The missing AER and RAS error logging is needed for communicating driver error details to userspace. Update the driver to include PCIe AER and CXL RAS error logging.

Add RCH downstream port error handling into the existing RCiEP handler. The downstream port error handler is added to the RCiEP error handler because the downstream port is implemented in a RCRB, is not PCI enumerable, and as a result is not directly accessible to the PCI AER root port driver. The AER root port driver calls the RCiEP handler for handling RCD errors and RCH downstream port protocol errors.

Update existing RCiEP correctable and uncorrectable handlers to also call the RCH handler. The RCH handler will read the RCH AER registers, check for error severity, and if an error exists will log using an existing kernel AER trace routine. The RCH handler will also log downstream port RAS errors if they exist.

Co-developed-by: Robert Richter <rrichter@amd.com>
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Robert Richter <rrichter@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20231018171713.1883517-16-rrichter@amd.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
1 parent 6c5f3aa commit 6ac0788

File tree

1 file changed

+96
-0
lines changed

1 file changed

+96
-0
lines changed

drivers/cxl/core/pci.c

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -777,12 +777,105 @@ void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport)
777777
}
778778
EXPORT_SYMBOL_NS_GPL(cxl_setup_parent_dport, CXL);
779779

780+
static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
781+
struct cxl_dport *dport)
782+
{
783+
return __cxl_handle_cor_ras(cxlds, dport->regs.ras);
784+
}
785+
786+
static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds,
787+
struct cxl_dport *dport)
788+
{
789+
return __cxl_handle_ras(cxlds, dport->regs.ras);
790+
}
791+
792+
/*
793+
* Copy the AER capability registers using 32 bit read accesses.
794+
* This is necessary because RCRB AER capability is MMIO mapped. Clear the
795+
* status after copying.
796+
*
797+
* @aer_base: base address of AER capability block in RCRB
798+
* @aer_regs: destination for copying AER capability
799+
*/
800+
static bool cxl_rch_get_aer_info(void __iomem *aer_base,
801+
struct aer_capability_regs *aer_regs)
802+
{
803+
int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32);
804+
u32 *aer_regs_buf = (u32 *)aer_regs;
805+
int n;
806+
807+
if (!aer_base)
808+
return false;
809+
810+
/* Use readl() to guarantee 32-bit accesses */
811+
for (n = 0; n < read_cnt; n++)
812+
aer_regs_buf[n] = readl(aer_base + n * sizeof(u32));
813+
814+
writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS);
815+
writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS);
816+
817+
return true;
818+
}
819+
820+
/* Get AER severity. Return false if there is no error. */
821+
static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
822+
int *severity)
823+
{
824+
if (aer_regs->uncor_status & ~aer_regs->uncor_mask) {
825+
if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV)
826+
*severity = AER_FATAL;
827+
else
828+
*severity = AER_NONFATAL;
829+
return true;
830+
}
831+
832+
if (aer_regs->cor_status & ~aer_regs->cor_mask) {
833+
*severity = AER_CORRECTABLE;
834+
return true;
835+
}
836+
837+
return false;
838+
}
839+
840+
static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
841+
{
842+
struct pci_dev *pdev = to_pci_dev(cxlds->dev);
843+
struct aer_capability_regs aer_regs;
844+
struct cxl_dport *dport;
845+
struct cxl_port *port;
846+
int severity;
847+
848+
port = cxl_pci_find_port(pdev, &dport);
849+
if (!port)
850+
return;
851+
852+
put_device(&port->dev);
853+
854+
if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs))
855+
return;
856+
857+
if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
858+
return;
859+
860+
pci_print_aer(pdev, severity, &aer_regs);
861+
862+
if (severity == AER_CORRECTABLE)
863+
cxl_handle_rdport_cor_ras(cxlds, dport);
864+
else
865+
cxl_handle_rdport_ras(cxlds, dport);
866+
}
867+
868+
#else
869+
/* No-op stub used when the preceding conditional block is compiled out. */
static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
780870
#endif
781871

782872
void cxl_cor_error_detected(struct pci_dev *pdev)
783873
{
784874
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
785875

876+
if (cxlds->rcd)
877+
cxl_handle_rdport_errors(cxlds);
878+
786879
cxl_handle_endpoint_cor_ras(cxlds);
787880
}
788881
EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
@@ -795,6 +888,9 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
795888
struct device *dev = &cxlmd->dev;
796889
bool ue;
797890

891+
if (cxlds->rcd)
892+
cxl_handle_rdport_errors(cxlds);
893+
798894
/*
799895
* A frozen channel indicates an impending reset which is fatal to
800896
* CXL.mem operation, and will likely crash the system. On the off

0 commit comments

Comments
 (0)