Skip to content

Commit 81e42fc

Browse files
shijujose4bp3tk0v
authored and committed
EDAC: Update memory repair control interface for memory sparing feature
Update memory repair control interface for memory sparing feature. CXL memory devices can support soft and hard memory sparing at cacheline, row, bank and rank granularities. Memory sparing is defined as a repair function that replaces a portion of memory with a portion of functional memory at that same granularity. When a CXL device detects an error in memory, it will report to the host that there is a need for a repair maintenance operation by using an event record where the "maintenance needed" flag is set. The event records contain the device physical address (DPA) and other attributes of the memory to repair such as bank group, bank, rank, row, column, channel, etc. The kernel will report the corresponding CXL general media or DRAM trace event to userspace, and userspace tools (e.g. rasdaemon) will initiate a repair operation in response to the device request via the sysfs repair control. [ bp: Massage. ] Signed-off-by: Shiju Jose <shiju.jose@huawei.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Link: https://lore.kernel.org/r/20250212143654.1893-15-shiju.jose@huawei.com
1 parent 699ea52 commit 81e42fc

File tree

3 files changed

+169
-0
lines changed

3 files changed

+169
-0
lines changed

Documentation/ABI/testing/sysfs-edac-memory-repair

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@ Description:
4242

4343
- ppr - Post package repair.
4444

45+
- cacheline-sparing
46+
47+
- row-sparing
48+
49+
- bank-sparing
50+
51+
- rank-sparing
52+
4553
- All other values are reserved.
4654

4755
What: /sys/bus/edac/devices/<dev-name>/mem_repairX/persist_mode
@@ -134,6 +142,55 @@ Description:
134142
related error records and trace events, e.g. CXL DRAM
135143
and CXL general media error records in CXL memory devices.
136144

145+
What: /sys/bus/edac/devices/<dev-name>/mem_repairX/bank_group
146+
What: /sys/bus/edac/devices/<dev-name>/mem_repairX/bank
147+
What: /sys/bus/edac/devices/<dev-name>/mem_repairX/rank
148+
What: /sys/bus/edac/devices/<dev-name>/mem_repairX/row
149+
What: /sys/bus/edac/devices/<dev-name>/mem_repairX/column
150+
What: /sys/bus/edac/devices/<dev-name>/mem_repairX/channel
151+
What: /sys/bus/edac/devices/<dev-name>/mem_repairX/sub_channel
152+
Date: March 2025
153+
KernelVersion: 6.15
154+
Contact: linux-edac@vger.kernel.org
155+
Description:
156+
(RW) The control attributes for the memory to be repaired.
157+
The specific value of attributes to use depends on the
158+
portion of memory to repair and will be reported to the host
159+
in related error records and be available to userspace
160+
in trace events, such as CXL DRAM and CXL general media
161+
error records of CXL memory devices.
162+
163+
When reading back these attributes, it returns the current
164+
value of memory requested to be repaired.
165+
166+
bank_group - The bank group of the memory to repair.
167+
168+
bank - The bank number of the memory to repair.
169+
170+
rank - The rank of the memory to repair. Rank is defined as a
171+
set of memory devices on a channel that together execute a
172+
transaction.
173+
174+
row - The row number of the memory to repair.
175+
176+
column - The column number of the memory to repair.
177+
178+
channel - The channel of the memory to repair. Channel is
179+
defined as an interface that can be independently accessed
180+
for a transaction.
181+
182+
sub_channel - The subchannel of the memory to repair.
183+
184+
The requirement to set these attributes varies based on the
185+
repair function. The attributes in sysfs are not present
186+
unless required for a repair function.
187+
188+
For example, CXL spec ver 3.1, Section 8.2.9.7.1.2 Table 8-103
189+
soft PPR and Section 8.2.9.7.1.3 Table 8-104 hard PPR operations,
190+
these attributes are not required to be set. CXL spec ver 3.1,
191+
Section 8.2.9.7.1.4 Table 8-105 memory sparing, these attributes
192+
are required to be set based on memory sparing granularity.
193+
137194
What: /sys/bus/edac/devices/<dev-name>/mem_repairX/repair
138195
Date: March 2025
139196
KernelVersion: 6.15

drivers/edac/mem_repair.c

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ enum edac_mem_repair_attributes {
2222
MR_MIN_DPA,
2323
MR_MAX_DPA,
2424
MR_NIBBLE_MASK,
25+
MR_BANK_GROUP,
26+
MR_BANK,
27+
MR_RANK,
28+
MR_ROW,
29+
MR_COLUMN,
30+
MR_CHANNEL,
31+
MR_SUB_CHANNEL,
2532
MEM_DO_REPAIR,
2633
MR_MAX_ATTRS
2734
};
@@ -70,6 +77,13 @@ MR_ATTR_SHOW(dpa, get_dpa, u64, "0x%llx\n")
7077
MR_ATTR_SHOW(min_dpa, get_min_dpa, u64, "0x%llx\n")
7178
MR_ATTR_SHOW(max_dpa, get_max_dpa, u64, "0x%llx\n")
7279
MR_ATTR_SHOW(nibble_mask, get_nibble_mask, u32, "0x%x\n")
80+
MR_ATTR_SHOW(bank_group, get_bank_group, u32, "%u\n")
81+
MR_ATTR_SHOW(bank, get_bank, u32, "%u\n")
82+
MR_ATTR_SHOW(rank, get_rank, u32, "%u\n")
83+
MR_ATTR_SHOW(row, get_row, u32, "0x%x\n")
84+
MR_ATTR_SHOW(column, get_column, u32, "%u\n")
85+
MR_ATTR_SHOW(channel, get_channel, u32, "%u\n")
86+
MR_ATTR_SHOW(sub_channel, get_sub_channel, u32, "%u\n")
7387

7488
#define MR_ATTR_STORE(attrib, cb, type, conv_func) \
7589
static ssize_t attrib##_store(struct device *ras_feat_dev, \
@@ -99,6 +113,13 @@ MR_ATTR_STORE(persist_mode, set_persist_mode, unsigned long, kstrtoul)
99113
MR_ATTR_STORE(hpa, set_hpa, u64, kstrtou64)
100114
MR_ATTR_STORE(dpa, set_dpa, u64, kstrtou64)
101115
MR_ATTR_STORE(nibble_mask, set_nibble_mask, unsigned long, kstrtoul)
116+
MR_ATTR_STORE(bank_group, set_bank_group, unsigned long, kstrtoul)
117+
MR_ATTR_STORE(bank, set_bank, unsigned long, kstrtoul)
118+
MR_ATTR_STORE(rank, set_rank, unsigned long, kstrtoul)
119+
MR_ATTR_STORE(row, set_row, unsigned long, kstrtoul)
120+
MR_ATTR_STORE(column, set_column, unsigned long, kstrtoul)
121+
MR_ATTR_STORE(channel, set_channel, unsigned long, kstrtoul)
122+
MR_ATTR_STORE(sub_channel, set_sub_channel, unsigned long, kstrtoul)
102123

103124
#define MR_DO_OP(attrib, cb) \
104125
static ssize_t attrib##_store(struct device *ras_feat_dev, \
@@ -189,6 +210,62 @@ static umode_t mem_repair_attr_visible(struct kobject *kobj, struct attribute *a
189210
return 0444;
190211
}
191212
break;
213+
case MR_BANK_GROUP:
214+
if (ops->get_bank_group) {
215+
if (ops->set_bank_group)
216+
return a->mode;
217+
else
218+
return 0444;
219+
}
220+
break;
221+
case MR_BANK:
222+
if (ops->get_bank) {
223+
if (ops->set_bank)
224+
return a->mode;
225+
else
226+
return 0444;
227+
}
228+
break;
229+
case MR_RANK:
230+
if (ops->get_rank) {
231+
if (ops->set_rank)
232+
return a->mode;
233+
else
234+
return 0444;
235+
}
236+
break;
237+
case MR_ROW:
238+
if (ops->get_row) {
239+
if (ops->set_row)
240+
return a->mode;
241+
else
242+
return 0444;
243+
}
244+
break;
245+
case MR_COLUMN:
246+
if (ops->get_column) {
247+
if (ops->set_column)
248+
return a->mode;
249+
else
250+
return 0444;
251+
}
252+
break;
253+
case MR_CHANNEL:
254+
if (ops->get_channel) {
255+
if (ops->set_channel)
256+
return a->mode;
257+
else
258+
return 0444;
259+
}
260+
break;
261+
case MR_SUB_CHANNEL:
262+
if (ops->get_sub_channel) {
263+
if (ops->set_sub_channel)
264+
return a->mode;
265+
else
266+
return 0444;
267+
}
268+
break;
192269
case MEM_DO_REPAIR:
193270
if (ops->do_repair)
194271
return a->mode;
@@ -230,6 +307,13 @@ static int mem_repair_create_desc(struct device *dev,
230307
[MR_MIN_DPA] = MR_ATTR_RO(min_dpa, instance),
231308
[MR_MAX_DPA] = MR_ATTR_RO(max_dpa, instance),
232309
[MR_NIBBLE_MASK] = MR_ATTR_RW(nibble_mask, instance),
310+
[MR_BANK_GROUP] = MR_ATTR_RW(bank_group, instance),
311+
[MR_BANK] = MR_ATTR_RW(bank, instance),
312+
[MR_RANK] = MR_ATTR_RW(rank, instance),
313+
[MR_ROW] = MR_ATTR_RW(row, instance),
314+
[MR_COLUMN] = MR_ATTR_RW(column, instance),
315+
[MR_CHANNEL] = MR_ATTR_RW(channel, instance),
316+
[MR_SUB_CHANNEL] = MR_ATTR_RW(sub_channel, instance),
233317
[MEM_DO_REPAIR] = MR_ATTR_WO(repair, instance)
234318
};
235319

include/linux/edac.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -780,6 +780,20 @@ enum edac_mem_repair_cmd {
780780
* @get_max_dpa: get the maximum supported device physical address (DPA).
781781
* @get_nibble_mask: get current nibble mask of memory to repair.
782782
* @set_nibble_mask: set nibble mask of memory to repair.
783+
* @get_bank_group: get current bank group of memory to repair.
784+
* @set_bank_group: set bank group of memory to repair.
785+
* @get_bank: get current bank of memory to repair.
786+
* @set_bank: set bank of memory to repair.
787+
* @get_rank: get current rank of memory to repair.
788+
* @set_rank: set rank of memory to repair.
789+
* @get_row: get current row of memory to repair.
790+
* @set_row: set row of memory to repair.
791+
* @get_column: get current column of memory to repair.
792+
* @set_column: set column of memory to repair.
793+
* @get_channel: get current channel of memory to repair.
794+
* @set_channel: set channel of memory to repair.
795+
* @get_sub_channel: get current subchannel of memory to repair.
796+
* @set_sub_channel: set subchannel of memory to repair.
783797
* @do_repair: Issue memory repair operation for the HPA/DPA and
784798
* other control attributes set for the memory to repair.
785799
*
@@ -800,6 +814,20 @@ struct edac_mem_repair_ops {
800814
int (*get_max_dpa)(struct device *dev, void *drv_data, u64 *dpa);
801815
int (*get_nibble_mask)(struct device *dev, void *drv_data, u32 *val);
802816
int (*set_nibble_mask)(struct device *dev, void *drv_data, u32 val);
817+
int (*get_bank_group)(struct device *dev, void *drv_data, u32 *val);
818+
int (*set_bank_group)(struct device *dev, void *drv_data, u32 val);
819+
int (*get_bank)(struct device *dev, void *drv_data, u32 *val);
820+
int (*set_bank)(struct device *dev, void *drv_data, u32 val);
821+
int (*get_rank)(struct device *dev, void *drv_data, u32 *val);
822+
int (*set_rank)(struct device *dev, void *drv_data, u32 val);
823+
int (*get_row)(struct device *dev, void *drv_data, u32 *val);
824+
int (*set_row)(struct device *dev, void *drv_data, u32 val);
825+
int (*get_column)(struct device *dev, void *drv_data, u32 *val);
826+
int (*set_column)(struct device *dev, void *drv_data, u32 val);
827+
int (*get_channel)(struct device *dev, void *drv_data, u32 *val);
828+
int (*set_channel)(struct device *dev, void *drv_data, u32 val);
829+
int (*get_sub_channel)(struct device *dev, void *drv_data, u32 *val);
830+
int (*set_sub_channel)(struct device *dev, void *drv_data, u32 val);
803831
int (*do_repair)(struct device *dev, void *drv_data, u32 val);
804832
};
805833

0 commit comments

Comments
 (0)