Skip to content

Commit 838850c

Browse files
yghannam authored and bp3tk0v committed
RAS/AMD/FMPM: Save SPA values
The system physical address (SPA) of an error is not a stable value. It will change depending on the location of the memory: parts can be swapped. And it will change depending on memory topology: NUMA nodes and/or interleaving can be adjusted.

Therefore, the SPA value is not part of the "FRU Memory Poison" record format. And it will not be saved to persistent storage.

However, the SPA values can be helpful during debug and for system admins during run time.

Save the SPA values in a separate structure. This is updated when records are restored and when new errors are saved.

[ bp: Make error messages more user friendly and add and correct comments. ]

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20240301143748.854090-3-yazen.ghannam@amd.com
1 parent 9d2b6fa commit 838850c

File tree

1 file changed

+71
-1
lines changed

1 file changed

+71
-1
lines changed

drivers/ras/amd/fmpm.c

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ struct fru_rec {
111111
*/
112112
static struct fru_rec **fru_records;
113113

114+
/* system physical addresses array */
115+
static u64 *spa_entries;
116+
117+
/* All-ones sentinel; parenthesized so it expands safely in any expression. */
#define INVALID_SPA (~0ULL)
118+
114119
#define CPER_CREATOR_FMP \
115120
GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3, \
116121
0xa0, 0x33, 0x08, 0x75)
@@ -120,7 +125,7 @@ static struct fru_rec **fru_records;
120125
0x12, 0x0a, 0x44, 0x58)
121126

122127
/**
123-
* DOC: fru_poison_entries (byte)
128+
* DOC: max_nr_entries (byte)
124129
* Maximum number of descriptor entries possible for each FRU.
125130
*
126131
* Values between '1' and '255' are valid.
@@ -140,6 +145,9 @@ static unsigned int max_nr_fru;
140145
/* Total length of record including headers and list of descriptor entries. */
141146
static size_t max_rec_len;
142147

148+
/* Total number of SPA entries across all FRUs. */
149+
static unsigned int spa_nr_entries;
150+
143151
/*
144152
* Protect the local records cache in fru_records and prevent concurrent
145153
* writes to storage. This is only needed after init once notifier block
@@ -269,6 +277,54 @@ static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd)
269277
return false;
270278
}
271279

280+
/*
 * Translate one poison descriptor's address to a system physical address
 * (SPA) and cache the result in spa_entries[].
 *
 * @rec:   FRU record that owns the descriptor; used to locate the FRU's
 *         range of slots within spa_entries[].
 * @entry: Index of the descriptor within @rec.
 * @addr:  Address placed in the translation request.
 * @id:    Value placed in the translation request's ipid field.
 * @cpu:   CPU placed in the translation request's cpu field.
 *
 * Each FRU owns max_nr_entries consecutive slots in spa_entries[], in the
 * same order as fru_records[]. If @rec is not found, an index is out of
 * range, or the translation fails, spa_entries[] is left untouched.
 */
static void save_spa(struct fru_rec *rec, unsigned int entry,
		     u64 addr, u64 id, unsigned int cpu)
{
	unsigned int i, fru_idx, spa_entry;
	struct atl_err a_err;
	unsigned long spa;

	if (entry >= max_nr_entries) {
		pr_warn_once("FRU descriptor entry %u out-of-bounds (max: %d)\n",
			     entry, max_nr_entries);
		return;
	}

	/* spa_nr_entries is always multiple of max_nr_entries */
	for (i = 0; i < spa_nr_entries; i += max_nr_entries) {
		if (fru_records[i / max_nr_entries] == rec)
			break;
	}

	/*
	 * The loop ran off the end: @rec is not in fru_records[]. The loop
	 * counter is not a meaningful record number here, so don't print it.
	 */
	if (i >= spa_nr_entries) {
		pr_warn_once("FRU record not found\n");
		return;
	}

	/* 'i' is the first slot of the matching FRU. */
	fru_idx = i / max_nr_entries;

	spa_entry = i + entry;
	if (spa_entry >= spa_nr_entries) {
		pr_warn_once("spa_entries[] index out-of-bounds\n");
		return;
	}

	memset(&a_err, 0, sizeof(a_err));

	a_err.addr = addr;
	a_err.ipid = id;
	a_err.cpu  = cpu;

	spa = amd_convert_umc_mca_addr_to_sys_addr(&a_err);
	if (IS_ERR_VALUE(spa)) {
		pr_debug("Failed to get system address\n");
		return;
	}

	spa_entries[spa_entry] = spa;
	pr_debug("fru_idx: %u, entry: %u, spa_entry: %u, spa: 0x%016llx\n",
		 fru_idx, entry, spa_entry, spa_entries[spa_entry]);
}
327+
272328
static void update_fru_record(struct fru_rec *rec, struct mce *m)
273329
{
274330
struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
@@ -301,6 +357,7 @@ static void update_fru_record(struct fru_rec *rec, struct mce *m)
301357
entry = fmp->nr_entries;
302358

303359
save_fpd:
360+
save_spa(rec, entry, m->addr, m->ipid, m->extcpu);
304361
fpd_dest = &rec->entries[entry];
305362
memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc));
306363

@@ -385,6 +442,7 @@ static void retire_mem_fmp(struct fru_rec *rec)
385442
continue;
386443

387444
retire_dram_row(fpd->addr, fpd->hw_id, err_cpu);
445+
save_spa(rec, i, fpd->addr, fpd->hw_id, err_cpu);
388446
}
389447
}
390448

@@ -696,6 +754,8 @@ static int get_system_info(void)
696754
if (!max_nr_entries)
697755
max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES;
698756

757+
spa_nr_entries = max_nr_fru * max_nr_entries;
758+
699759
max_rec_len = sizeof(struct fru_rec);
700760
max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries;
701761

@@ -714,6 +774,7 @@ static void free_records(void)
714774
kfree(rec);
715775

716776
kfree(fru_records);
777+
kfree(spa_entries);
717778
}
718779

719780
static int allocate_records(void)
@@ -734,6 +795,15 @@ static int allocate_records(void)
734795
}
735796
}
736797

798+
spa_entries = kcalloc(spa_nr_entries, sizeof(u64), GFP_KERNEL);
799+
if (!spa_entries) {
800+
ret = -ENOMEM;
801+
goto out_free;
802+
}
803+
804+
for (i = 0; i < spa_nr_entries; i++)
805+
spa_entries[i] = INVALID_SPA;
806+
737807
return ret;
738808

739809
out_free:

0 commit comments

Comments
 (0)