Skip to content

Commit a52b6a2

Browse files
Davidlohr Buesodavejiang
authored andcommitted
cxl/pci: Support Global Persistent Flush (GPF)
Add support for GPF flows. It is found that the CXL specification around this to be a bit too involved from the driver side. And while this should really all handled by the hardware, this patch takes things with a grain of salt. Upon respective port enumeration, both phase timeouts are set to a max of 20 seconds, which is the NMI watchdog default for lockup detection. The premise is that the kernel does not have enough information to set anything better than a max across the board and hope devices finish their GPF flows within the platform energy budget. Timeout detection is based on dirty Shutdown semantics. The driver will mark it as dirty, expecting that the device clear it upon a successful GPF event. The admin may consult the device Health and check the dirty shutdown counter to see if there was a problem with data integrity. [ davej: Explicitly set return to 0 in update_gpf_port_dvsec() ] [ davej: Add spec reference for 'struct cxl_mbox_set_shutdown_state_in ] [ davej: Fix 0-day reported issue ] Signed-off-by: Davidlohr Bueso <dave@stgolabs.net> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Reviewed-by: Dan Williams <dan.j.williams@intel.com> Link: https://patch.msgid.link/20250124233533.910535-1-dave@stgolabs.net Signed-off-by: Dave Jiang <dave.jiang@intel.com>
1 parent 2014c95 commit a52b6a2

File tree

9 files changed

+132
-1
lines changed

9 files changed

+132
-1
lines changed

Documentation/driver-api/cxl/maturity-map.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ Mailbox commands
130130
* [0] Switch CCI
131131
* [3] Timestamp
132132
* [1] PMEM labels
133-
* [0] PMEM GPF / Dirty Shutdown
133+
* [1] PMEM GPF / Dirty Shutdown
134134
* [0] Scan Media
135135

136136
PMU

drivers/cxl/core/core.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,4 +115,6 @@ bool cxl_need_node_perf_attrs_update(int nid);
115115
int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
116116
struct access_coordinate *c);
117117

118+
int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port);
119+
118120
#endif /* __CXL_CORE_H__ */

drivers/cxl/core/mbox.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,6 +1308,24 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
13081308
}
13091309
EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, "CXL");
13101310

1311+
int cxl_dirty_shutdown_state(struct cxl_memdev_state *mds)
1312+
{
1313+
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
1314+
struct cxl_mbox_cmd mbox_cmd;
1315+
struct cxl_mbox_set_shutdown_state_in in = {
1316+
.state = 1
1317+
};
1318+
1319+
mbox_cmd = (struct cxl_mbox_cmd) {
1320+
.opcode = CXL_MBOX_OP_SET_SHUTDOWN_STATE,
1321+
.size_in = sizeof(in),
1322+
.payload_in = &in,
1323+
};
1324+
1325+
return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
1326+
}
1327+
EXPORT_SYMBOL_NS_GPL(cxl_dirty_shutdown_state, "CXL");
1328+
13111329
int cxl_set_timestamp(struct cxl_memdev_state *mds)
13121330
{
13131331
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;

drivers/cxl/core/pci.c

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,3 +1054,90 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c)
10541054

10551055
return 0;
10561056
}
1057+
1058+
/*
1059+
* Set max timeout such that platforms will optimize GPF flow to avoid
1060+
* the implied worst-case scenario delays. On a sane platform, all
1061+
* devices should always complete GPF within the energy budget of
1062+
* the GPF flow. The kernel does not have enough information to pick
1063+
* anything better than "maximize timeouts and hope it works".
1064+
*
1065+
* A misbehaving device could block forward progress of GPF for all
1066+
* the other devices, exhausting the energy budget of the platform.
1067+
* However, the spec seems to assume that moving on from slow to respond
1068+
* devices is a virtue. It is not possible to know that, in actuality,
1069+
* the slow to respond device is *the* most critical device in the
1070+
* system to wait.
1071+
*/
1072+
#define GPF_TIMEOUT_BASE_MAX 2
1073+
#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */
1074+
1075+
static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase)
1076+
{
1077+
u64 base, scale;
1078+
int rc, offset;
1079+
u16 ctrl;
1080+
1081+
switch (phase) {
1082+
case 1:
1083+
offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET;
1084+
base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK;
1085+
scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK;
1086+
break;
1087+
case 2:
1088+
offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET;
1089+
base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK;
1090+
scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK;
1091+
break;
1092+
default:
1093+
return -EINVAL;
1094+
}
1095+
1096+
rc = pci_read_config_word(pdev, dvsec + offset, &ctrl);
1097+
if (rc)
1098+
return rc;
1099+
1100+
if (FIELD_GET(base, ctrl) == GPF_TIMEOUT_BASE_MAX &&
1101+
FIELD_GET(scale, ctrl) == GPF_TIMEOUT_SCALE_MAX)
1102+
return 0;
1103+
1104+
ctrl = FIELD_PREP(base, GPF_TIMEOUT_BASE_MAX);
1105+
ctrl |= FIELD_PREP(scale, GPF_TIMEOUT_SCALE_MAX);
1106+
1107+
rc = pci_write_config_word(pdev, dvsec + offset, ctrl);
1108+
if (!rc)
1109+
pci_dbg(pdev, "Port GPF phase %d timeout: %d0 secs\n",
1110+
phase, GPF_TIMEOUT_BASE_MAX);
1111+
1112+
return rc;
1113+
}
1114+
1115+
int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port)
1116+
{
1117+
struct pci_dev *pdev;
1118+
1119+
if (!dev_is_pci(dport_dev))
1120+
return 0;
1121+
1122+
pdev = to_pci_dev(dport_dev);
1123+
if (!pdev || !port)
1124+
return -EINVAL;
1125+
1126+
if (!port->gpf_dvsec) {
1127+
int dvsec;
1128+
1129+
dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
1130+
CXL_DVSEC_PORT_GPF);
1131+
if (!dvsec) {
1132+
pci_warn(pdev, "Port GPF DVSEC not present\n");
1133+
return -EINVAL;
1134+
}
1135+
1136+
port->gpf_dvsec = dvsec;
1137+
}
1138+
1139+
update_gpf_port_dvsec(pdev, port->gpf_dvsec, 1);
1140+
update_gpf_port_dvsec(pdev, port->gpf_dvsec, 2);
1141+
1142+
return 0;
1143+
}

drivers/cxl/core/port.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1672,6 +1672,8 @@ int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd)
16721672
if (rc && rc != -EBUSY)
16731673
return rc;
16741674

1675+
cxl_gpf_port_setup(dport_dev, port);
1676+
16751677
/* Any more ports to add between this one and the root? */
16761678
if (!dev_is_cxl_root_child(&port->dev))
16771679
continue;

drivers/cxl/cxl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -610,6 +610,7 @@ struct cxl_dax_region {
610610
* @cdat: Cached CDAT data
611611
* @cdat_available: Should a CDAT attribute be available in sysfs
612612
* @pci_latency: Upstream latency in picoseconds
613+
* @gpf_dvsec: Cached GPF port DVSEC
613614
*/
614615
struct cxl_port {
615616
struct device dev;
@@ -633,6 +634,7 @@ struct cxl_port {
633634
} cdat;
634635
bool cdat_available;
635636
long pci_latency;
637+
int gpf_dvsec;
636638
};
637639

638640
/**

drivers/cxl/cxlmem.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,11 @@ struct cxl_mbox_set_partition_info {
693693

694694
#define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0)
695695

696+
/* Set Shutdown State Input Payload CXL 3.2 Spec 8.2.10.9.3.5 Table 8-152 */
697+
struct cxl_mbox_set_shutdown_state_in {
698+
u8 state;
699+
} __packed;
700+
696701
/* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
697702
struct cxl_mbox_set_timestamp_in {
698703
__le64 timestamp;
@@ -829,6 +834,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
829834
enum cxl_event_log_type type,
830835
enum cxl_event_type event_type,
831836
const uuid_t *uuid, union cxl_event *evt);
837+
int cxl_dirty_shutdown_state(struct cxl_memdev_state *mds);
832838
int cxl_set_timestamp(struct cxl_memdev_state *mds);
833839
int cxl_poison_state_init(struct cxl_memdev_state *mds);
834840
int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,

drivers/cxl/cxlpci.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@
4040

4141
/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */
#define CXL_DVSEC_PORT_GPF				4
#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET	0x0C
#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK	GENMASK(3, 0)
#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK	GENMASK(11, 8)
#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET	0x0E
#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK	GENMASK(3, 0)
#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK	GENMASK(11, 8)
4349

4450
/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */
4551
#define CXL_DVSEC_DEVICE_GPF 5

drivers/cxl/pmem.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,14 @@ static int cxl_nvdimm_probe(struct device *dev)
8585
if (!nvdimm)
8686
return -ENOMEM;
8787

88+
/*
89+
* Set dirty shutdown now, with the expectation that the device
90+
* clear it upon a successful GPF flow. The exception to this
91+
* is upon Viral detection, per CXL 3.2 section 12.4.2.
92+
*/
93+
if (cxl_dirty_shutdown_state(mds))
94+
dev_warn(dev, "GPF: could not dirty shutdown state\n");
95+
8896
dev_set_drvdata(dev, nvdimm);
8997
return devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm);
9098
}

0 commit comments

Comments
 (0)