Skip to content

Commit 36f257e

Browse files
skoralah authored and davejiang committed
acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors
When PCIe AER is in FW-First, OS should process CXL Protocol errors from CPER records. Introduce support for handling and logging CXL Protocol errors. The defined trace events cxl_aer_uncorrectable_error and cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them to trace FW-First Protocol errors. Since the CXL code is required to be called from process context and GHES is in interrupt context, use workqueues for processing. Similar to CXL CPER event handling, use kfifo to handle errors as it simplifies queue processing by providing lock free fifo operations. Add the ability for the CXL sub-system to register a workqueue to process CXL CPER protocol errors. [DJ: return cxl_cper_register_prot_err_work() directly in cxl_ras_init()] Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> Reviewed-by: Li Ming <ming.li@zohomail.com> Reviewed-by: Alison Schofield <alison.schofield@intel.com> Reviewed-by: Ira Weiny <ira.weiny@intel.com> Reviewed-by: Tony Luck <tony.luck@intel.com> Link: https://patch.msgid.link/20250310223839.31342-2-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang <dave.jiang@intel.com>
1 parent 315c2f0 commit 36f257e

File tree

7 files changed

+158
-0
lines changed

7 files changed

+158
-0
lines changed

drivers/acpi/apei/ghes.c

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,15 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
674674
schedule_work(&entry->work);
675675
}
676676

677+
/* Room for 8 entries */
678+
#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8
679+
static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data,
680+
CXL_CPER_PROT_ERR_FIFO_DEPTH);
681+
682+
/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */
683+
static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock);
684+
struct work_struct *cxl_cper_prot_err_work;
685+
677686
static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
678687
int severity)
679688
{
@@ -700,6 +709,11 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
700709
if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
701710
pr_warn(FW_WARN "CXL CPER no device serial number\n");
702711

712+
guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock);
713+
714+
if (!cxl_cper_prot_err_work)
715+
return;
716+
703717
switch (prot_err->agent_type) {
704718
case RCD:
705719
case DEVICE:
@@ -721,9 +735,44 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
721735
prot_err->agent_type);
722736
return;
723737
}
738+
739+
if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) {
740+
pr_err_ratelimited("CXL CPER kfifo overflow\n");
741+
return;
742+
}
743+
744+
schedule_work(cxl_cper_prot_err_work);
724745
#endif
725746
}
726747

748+
int cxl_cper_register_prot_err_work(struct work_struct *work)
749+
{
750+
if (cxl_cper_prot_err_work)
751+
return -EINVAL;
752+
753+
guard(spinlock)(&cxl_cper_prot_err_work_lock);
754+
cxl_cper_prot_err_work = work;
755+
return 0;
756+
}
757+
EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL");
758+
759+
int cxl_cper_unregister_prot_err_work(struct work_struct *work)
760+
{
761+
if (cxl_cper_prot_err_work != work)
762+
return -EINVAL;
763+
764+
guard(spinlock)(&cxl_cper_prot_err_work_lock);
765+
cxl_cper_prot_err_work = NULL;
766+
return 0;
767+
}
768+
EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL");
769+
770+
int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
771+
{
772+
return kfifo_get(&cxl_cper_prot_err_fifo, wd);
773+
}
774+
EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL");
775+
727776
/* Room for 8 entries for each of the 4 event log queues */
728777
#define CXL_CPER_FIFO_DEPTH 32
729778
DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH);

drivers/cxl/core/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,6 @@ cxl_core-y += pci.o
1414
cxl_core-y += hdm.o
1515
cxl_core-y += pmu.o
1616
cxl_core-y += cdat.o
17+
cxl_core-y += ras.o
1718
cxl_core-$(CONFIG_TRACING) += trace.o
1819
cxl_core-$(CONFIG_CXL_REGION) += region.o

drivers/cxl/core/core.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,4 +115,7 @@ bool cxl_need_node_perf_attrs_update(int nid);
115115
int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
116116
struct access_coordinate *c);
117117

118+
int cxl_ras_init(void);
119+
void cxl_ras_exit(void);
120+
118121
#endif /* __CXL_CORE_H__ */

drivers/cxl/core/port.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2339,8 +2339,14 @@ static __init int cxl_core_init(void)
23392339
if (rc)
23402340
goto err_region;
23412341

2342+
rc = cxl_ras_init();
2343+
if (rc)
2344+
goto err_ras;
2345+
23422346
return 0;
23432347

2348+
err_ras:
2349+
cxl_region_exit();
23442350
err_region:
23452351
bus_unregister(&cxl_bus_type);
23462352
err_bus:
@@ -2352,6 +2358,7 @@ static __init int cxl_core_init(void)
23522358

23532359
static void cxl_core_exit(void)
23542360
{
2361+
cxl_ras_exit();
23552362
cxl_region_exit();
23562363
bus_unregister(&cxl_bus_type);
23572364
destroy_workqueue(cxl_bus_wq);

drivers/cxl/core/ras.c

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
3+
4+
#include <linux/pci.h>
5+
#include <linux/aer.h>
6+
#include <cxl/event.h>
7+
#include <cxlmem.h>
8+
#include "trace.h"
9+
10+
static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
11+
struct cxl_ras_capability_regs ras_cap)
12+
{
13+
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
14+
struct cxl_dev_state *cxlds;
15+
16+
cxlds = pci_get_drvdata(pdev);
17+
if (!cxlds)
18+
return;
19+
20+
trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
21+
}
22+
23+
static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
24+
struct cxl_ras_capability_regs ras_cap)
25+
{
26+
u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
27+
struct cxl_dev_state *cxlds;
28+
u32 fe;
29+
30+
cxlds = pci_get_drvdata(pdev);
31+
if (!cxlds)
32+
return;
33+
34+
if (hweight32(status) > 1)
35+
fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
36+
ras_cap.cap_control));
37+
else
38+
fe = status;
39+
40+
trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
41+
ras_cap.header_log);
42+
}
43+
44+
static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
45+
{
46+
unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
47+
data->prot_err.agent_addr.function);
48+
struct pci_dev *pdev __free(pci_dev_put) =
49+
pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
50+
data->prot_err.agent_addr.bus,
51+
devfn);
52+
53+
if (!pdev)
54+
return;
55+
56+
guard(device)(&pdev->dev);
57+
58+
if (data->severity == AER_CORRECTABLE)
59+
cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
60+
else
61+
cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
62+
}
63+
64+
static void cxl_cper_prot_err_work_fn(struct work_struct *work)
65+
{
66+
struct cxl_cper_prot_err_work_data wd;
67+
68+
while (cxl_cper_prot_err_kfifo_get(&wd))
69+
cxl_cper_handle_prot_err(&wd);
70+
}
71+
static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
72+
73+
int cxl_ras_init(void)
74+
{
75+
return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
76+
}
77+
78+
void cxl_ras_exit(void)
79+
{
80+
cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
81+
cancel_work_sync(&cxl_cper_prot_err_work);
82+
}

include/cxl/event.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,9 @@ struct cxl_cper_prot_err_work_data {
254254
int cxl_cper_register_work(struct work_struct *work);
255255
int cxl_cper_unregister_work(struct work_struct *work);
256256
int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
257+
int cxl_cper_register_prot_err_work(struct work_struct *work);
258+
int cxl_cper_unregister_prot_err_work(struct work_struct *work);
259+
int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd);
257260
#else
258261
static inline int cxl_cper_register_work(struct work_struct *work)
259262
{
@@ -268,6 +271,18 @@ static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
268271
{
269272
return 0;
270273
}
274+
/* Stub for the #else branch: nothing is registered, success is reported. */
static inline int cxl_cper_register_prot_err_work(struct work_struct *work)
{
	return 0;
}
278+
/* Stub for the #else branch: nothing to unregister, success is reported. */
static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work)
{
	return 0;
}
282+
/* Stub for the #else branch: always returns 0 and leaves @wd untouched. */
static inline int
cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
{
	return 0;
}
271286
#endif
272287

273288
#endif /* _LINUX_CXL_EVENT_H */

tools/testing/cxl/Kbuild

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
6161
cxl_core-y += $(CXL_CORE_SRC)/hdm.o
6262
cxl_core-y += $(CXL_CORE_SRC)/pmu.o
6363
cxl_core-y += $(CXL_CORE_SRC)/cdat.o
64+
cxl_core-y += $(CXL_CORE_SRC)/ras.o
6465
cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
6566
cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
6667
cxl_core-y += config_check.o

0 commit comments

Comments
 (0)