Skip to content

Commit 9387c6a

Browse files
committed
Merge branch 'for-6.15/fw-first-error-logging' into cxl-for-next2
Add logging support for CXL CPER endpoint and port protocol errors, including the two patches that were completed later. Link: https://lore.kernel.org/linux-cxl/20250123084421.127697-1-Smita.KoralahalliChannabasappa@amd.com/ Link: https://lore.kernel.org/linux-cxl/20250310223839.31342-1-Smita.KoralahalliChannabasappa@amd.com/
2 parents 58d60bb + 02f4f01 commit 9387c6a

File tree

12 files changed

+396
-105
lines changed

12 files changed

+396
-105
lines changed

drivers/acpi/apei/ghes.c

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,105 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
674674
schedule_work(&entry->work);
675675
}
676676

677+
/*
 * Room for 8 entries. Each entry is a struct cxl_cper_prot_err_work_data
 * that cxl_cper_post_prot_err() queues and cxl_cper_prot_err_kfifo_get()
 * removes.
 */
678+
#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8
679+
static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data,
680+
CXL_CPER_PROT_ERR_FIFO_DEPTH);
681+
682+
/*
 * Synchronize schedule_work() with cxl_cper_prot_err_work changes.
 * cxl_cper_post_prot_err() holds this lock while it queues a record and
 * schedules the work item.
 */
683+
static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock);
684+
/*
 * Work item scheduled after a record is queued; NULL while nothing is
 * registered, in which case incoming records are dropped.
 * NOTE(review): not declared static - confirm whether another translation
 * unit references this symbol.
 */
struct work_struct *cxl_cper_prot_err_work;
685+
686+
/**
 * cxl_cper_post_prot_err - queue a CXL protocol error section for later handling
 * @prot_err: CXL protocol error section header; the DVSEC data and the RAS
 *            capability registers are read from the bytes that follow it
 * @severity: CPER severity, converted with cper_severity_to_aer()
 *
 * Validates the section, copies the header and the RAS capability registers
 * into the kfifo and schedules the registered work item. The record is
 * dropped when the agent address or error log is not marked valid, the error
 * log length does not match struct cxl_ras_capability_regs, the agent type is
 * unknown, no work item is registered, or the kfifo is full. A missing serial
 * number only produces a warning.
 *
 * The body is compiled out without CONFIG_ACPI_APEI_PCIEAER.
 *
 * NOTE(review): the section length is not passed in, so dvsec_len cannot be
 * bounds-checked against the payload here - confirm the caller guarantees
 * the section is large enough.
 */
static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
				   int severity)
{
#ifdef CONFIG_ACPI_APEI_PCIEAER
	struct cxl_cper_prot_err_work_data wd;
	u8 *dvsec_start, *cap_start;

	/* This tests the agent *address* valid bit, so say so in the log. */
	if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) {
		pr_err_ratelimited("CXL CPER invalid agent address\n");
		return;
	}

	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
		pr_err_ratelimited("CXL CPER invalid protocol error log\n");
		return;
	}

	if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) {
		pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n",
				   prot_err->err_len);
		return;
	}

	/* Not fatal, but rate limit it like every other message here. */
	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
		pr_warn_ratelimited(FW_WARN "CXL CPER no device serial number\n");

	/* Held until return so the work pointer cannot change under us. */
	guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock);

	if (!cxl_cper_prot_err_work)
		return;

	switch (prot_err->agent_type) {
	case RCD:
	case DEVICE:
	case LD:
	case FMLD:
	case RP:
	case DSP:
	case USP:
		memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err));

		/* Layout: header, then dvsec_len bytes of DVSEC, then RAS cap. */
		dvsec_start = (u8 *)(prot_err + 1);
		cap_start = dvsec_start + prot_err->dvsec_len;

		memcpy(&wd.ras_cap, cap_start, sizeof(wd.ras_cap));
		wd.severity = cper_severity_to_aer(severity);
		break;
	default:
		pr_err_ratelimited("CXL CPER invalid agent type: %d\n",
				   prot_err->agent_type);
		return;
	}

	if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) {
		pr_err_ratelimited("CXL CPER kfifo overflow\n");
		return;
	}

	schedule_work(cxl_cper_prot_err_work);
#endif
}
747+
748+
int cxl_cper_register_prot_err_work(struct work_struct *work)
749+
{
750+
if (cxl_cper_prot_err_work)
751+
return -EINVAL;
752+
753+
guard(spinlock)(&cxl_cper_prot_err_work_lock);
754+
cxl_cper_prot_err_work = work;
755+
return 0;
756+
}
757+
EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL");
758+
759+
int cxl_cper_unregister_prot_err_work(struct work_struct *work)
760+
{
761+
if (cxl_cper_prot_err_work != work)
762+
return -EINVAL;
763+
764+
guard(spinlock)(&cxl_cper_prot_err_work_lock);
765+
cxl_cper_prot_err_work = NULL;
766+
return 0;
767+
}
768+
EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL");
769+
770+
int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
771+
{
772+
return kfifo_get(&cxl_cper_prot_err_fifo, wd);
773+
}
774+
EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL");
775+
677776
/* Room for 8 entries for each of the 4 event log queues */
678777
#define CXL_CPER_FIFO_DEPTH 32
679778
DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH);
@@ -777,6 +876,10 @@ static bool ghes_do_proc(struct ghes *ghes,
777876
}
778877
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
779878
queued = ghes_handle_arm_hw_error(gdata, sev, sync);
879+
} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
880+
struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
881+
882+
cxl_cper_post_prot_err(prot_err, gdata->error_severity);
780883
} else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
781884
struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
782885

drivers/cxl/core/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,6 @@ cxl_core-y += pci.o
1414
cxl_core-y += hdm.o
1515
cxl_core-y += pmu.o
1616
cxl_core-y += cdat.o
17+
cxl_core-y += ras.o
1718
cxl_core-$(CONFIG_TRACING) += trace.o
1819
cxl_core-$(CONFIG_CXL_REGION) += region.o

drivers/cxl/core/core.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,4 +115,7 @@ bool cxl_need_node_perf_attrs_update(int nid);
115115
int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
116116
struct access_coordinate *c);
117117

118+
int cxl_ras_init(void);
119+
void cxl_ras_exit(void);
120+
118121
#endif /* __CXL_CORE_H__ */

drivers/cxl/core/port.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2350,8 +2350,14 @@ static __init int cxl_core_init(void)
23502350
if (rc)
23512351
goto err_region;
23522352

2353+
rc = cxl_ras_init();
2354+
if (rc)
2355+
goto err_ras;
2356+
23532357
return 0;
23542358

2359+
err_ras:
2360+
cxl_region_exit();
23552361
err_region:
23562362
bus_unregister(&cxl_bus_type);
23572363
err_bus:
@@ -2363,6 +2369,7 @@ static __init int cxl_core_init(void)
23632369

23642370
static void cxl_core_exit(void)
23652371
{
2372+
cxl_ras_exit();
23662373
cxl_region_exit();
23672374
bus_unregister(&cxl_bus_type);
23682375
destroy_workqueue(cxl_bus_wq);

drivers/cxl/core/ras.c

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
3+
4+
#include <linux/pci.h>
5+
#include <linux/aer.h>
6+
#include <cxl/event.h>
7+
#include <cxlmem.h>
8+
#include "trace.h"
9+
10+
static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
11+
struct cxl_ras_capability_regs ras_cap)
12+
{
13+
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
14+
15+
trace_cxl_port_aer_correctable_error(&pdev->dev, status);
16+
}
17+
18+
static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
19+
struct cxl_ras_capability_regs ras_cap)
20+
{
21+
u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
22+
u32 fe;
23+
24+
if (hweight32(status) > 1)
25+
fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
26+
ras_cap.cap_control));
27+
else
28+
fe = status;
29+
30+
trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
31+
ras_cap.header_log);
32+
}
33+
34+
static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
35+
struct cxl_ras_capability_regs ras_cap)
36+
{
37+
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
38+
struct cxl_dev_state *cxlds;
39+
40+
cxlds = pci_get_drvdata(pdev);
41+
if (!cxlds)
42+
return;
43+
44+
trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
45+
}
46+
47+
static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
48+
struct cxl_ras_capability_regs ras_cap)
49+
{
50+
u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
51+
struct cxl_dev_state *cxlds;
52+
u32 fe;
53+
54+
cxlds = pci_get_drvdata(pdev);
55+
if (!cxlds)
56+
return;
57+
58+
if (hweight32(status) > 1)
59+
fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
60+
ras_cap.cap_control));
61+
else
62+
fe = status;
63+
64+
trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
65+
ras_cap.header_log);
66+
}
67+
68+
static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
69+
{
70+
unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
71+
data->prot_err.agent_addr.function);
72+
struct pci_dev *pdev __free(pci_dev_put) =
73+
pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
74+
data->prot_err.agent_addr.bus,
75+
devfn);
76+
int port_type;
77+
78+
if (!pdev)
79+
return;
80+
81+
guard(device)(&pdev->dev);
82+
83+
port_type = pci_pcie_type(pdev);
84+
if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
85+
port_type == PCI_EXP_TYPE_DOWNSTREAM ||
86+
port_type == PCI_EXP_TYPE_UPSTREAM) {
87+
if (data->severity == AER_CORRECTABLE)
88+
cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap);
89+
else
90+
cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap);
91+
92+
return;
93+
}
94+
95+
if (data->severity == AER_CORRECTABLE)
96+
cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
97+
else
98+
cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
99+
}
100+
101+
static void cxl_cper_prot_err_work_fn(struct work_struct *work)
102+
{
103+
struct cxl_cper_prot_err_work_data wd;
104+
105+
while (cxl_cper_prot_err_kfifo_get(&wd))
106+
cxl_cper_handle_prot_err(&wd);
107+
}
108+
static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
109+
110+
int cxl_ras_init(void)
111+
{
112+
return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
113+
}
114+
115+
/*
 * Tear down protocol error handling: unregister the work item first, then
 * wait for or cancel any instance that was already scheduled.
 */
void cxl_ras_exit(void)
116+
{
117+
/* The return value is ignored; cancellation below runs regardless. */
cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
118+
cancel_work_sync(&cxl_cper_prot_err_work);
119+
}

drivers/cxl/core/trace.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,34 @@
4848
{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
4949
)
5050

51+
/*
 * Trace event for an uncorrectable CXL protocol error reported against a
 * port device. Records the name of @dev, the name of its parent, the status
 * bits, the first error and the header log; only the names and the decoded
 * status/first_error strings are printed.
 * NOTE(review): dev_name(dev->parent) is evaluated unconditionally - this
 * assumes @dev always has a parent; confirm for all callers.
 */
TRACE_EVENT(cxl_port_aer_uncorrectable_error,
52+
TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
53+
TP_ARGS(dev, status, fe, hl),
54+
TP_STRUCT__entry(
55+
__string(device, dev_name(dev))
56+
__string(host, dev_name(dev->parent))
57+
__field(u32, status)
58+
__field(u32, first_error)
59+
__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
60+
),
61+
TP_fast_assign(
62+
__assign_str(device);
63+
__assign_str(host);
64+
__entry->status = status;
65+
__entry->first_error = fe;
66+
/*
67+
* Embed the 512B headerlog data for user app retrieval and
68+
* parsing, but no need to print this in the trace buffer.
69+
*/
70+
memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
71+
),
72+
TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
73+
__get_str(device), __get_str(host),
74+
show_uc_errs(__entry->status),
75+
show_uc_errs(__entry->first_error)
76+
)
77+
);
78+
5179
TRACE_EVENT(cxl_aer_uncorrectable_error,
5280
TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
5381
TP_ARGS(cxlmd, status, fe, hl),
@@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
96124
{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \
97125
)
98126

127+
/*
 * Trace event for a correctable CXL protocol error reported against a port
 * device. Records the name of @dev, the name of its parent and the status
 * bits, and prints the status decoded with show_ce_errs().
 * NOTE(review): dev_name(dev->parent) is evaluated unconditionally - this
 * assumes @dev always has a parent; confirm for all callers.
 */
TRACE_EVENT(cxl_port_aer_correctable_error,
128+
TP_PROTO(struct device *dev, u32 status),
129+
TP_ARGS(dev, status),
130+
TP_STRUCT__entry(
131+
__string(device, dev_name(dev))
132+
__string(host, dev_name(dev->parent))
133+
__field(u32, status)
134+
),
135+
TP_fast_assign(
136+
__assign_str(device);
137+
__assign_str(host);
138+
__entry->status = status;
139+
),
140+
TP_printk("device=%s host=%s status='%s'",
141+
__get_str(device), __get_str(host),
142+
show_ce_errs(__entry->status)
143+
)
144+
);
145+
99146
TRACE_EVENT(cxl_aer_correctable_error,
100147
TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
101148
TP_ARGS(cxlmd, status),

drivers/firmware/efi/cper.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#include <linux/bcd.h>
2525
#include <acpi/ghes.h>
2626
#include <ras/ras_event.h>
27-
#include "cper_cxl.h"
27+
#include <cxl/event.h>
2828

2929
/*
3030
* CPER record ID need to be unique even after reboot, because record
@@ -624,11 +624,11 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
624624
else
625625
goto err_section_too_small;
626626
} else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
627-
struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
627+
struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
628628

629629
printk("%ssection_type: CXL Protocol Error\n", newpfx);
630630
if (gdata->error_data_length >= sizeof(*prot_err))
631-
cper_print_prot_err(newpfx, prot_err);
631+
cxl_cper_print_prot_err(newpfx, prot_err);
632632
else
633633
goto err_section_too_small;
634634
} else {

0 commit comments

Comments
 (0)