Skip to content

Commit 0ecf4aa

Browse files
committed
Merge tag 'amd-drm-next-6.7-2023-10-20' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.7-2023-10-20: amdgpu: - SMU 13 updates - UMSCH updates - DC MPO fixes - RAS updates - MES 11 fixes - Fix possible memory leaks in error paths - GC 11.5 fixes - Kernel doc updates - PSP updates - APU IMU fixes - Misc code cleanups - SMU 11 fixes - OD fix - Frame size warning fixes - SR-IOV fixes - NBIO 7.11 updates - NBIO 7.7 updates - XGMI fixes - devcoredump updates amdkfd: - Misc code cleanups - SVM fixes Signed-off-by: Dave Airlie <airlied@redhat.com> From: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20231020195043.4937-1-alexander.deucher@amd.com
2 parents 11ae5eb + 5b2c54e commit 0ecf4aa

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+2607
-2619
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,17 @@ struct amdgpu_mqd {
773773
struct amdgpu_reset_domain;
774774
struct amdgpu_fru_info;
775775

776+
struct amdgpu_reset_info {
777+
/* reset dump register */
778+
u32 *reset_dump_reg_list;
779+
u32 *reset_dump_reg_value;
780+
int num_regs;
781+
782+
#ifdef CONFIG_DEV_COREDUMP
783+
struct amdgpu_coredump_info *coredump_info;
784+
#endif
785+
};
786+
776787
/*
777788
* Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
778789
*/
@@ -1081,10 +1092,7 @@ struct amdgpu_device {
10811092

10821093
struct mutex benchmark_mutex;
10831094

1084-
/* reset dump register */
1085-
uint32_t *reset_dump_reg_list;
1086-
uint32_t *reset_dump_reg_value;
1087-
int num_regs;
1095+
struct amdgpu_reset_info reset_info;
10881096

10891097
bool scpm_enabled;
10901098
uint32_t scpm_status;
@@ -1111,15 +1119,6 @@ static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
11111119
return adev->ip_versions[ip][inst] & ~0xFFU;
11121120
}
11131121

1114-
#ifdef CONFIG_DEV_COREDUMP
1115-
struct amdgpu_coredump_info {
1116-
struct amdgpu_device *adev;
1117-
struct amdgpu_task_info reset_task_info;
1118-
struct timespec64 reset_time;
1119-
bool reset_vram_lost;
1120-
};
1121-
#endif
1122-
11231122
static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
11241123
{
11251124
return container_of(ddev, struct amdgpu_device, ddev);

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2016,8 +2016,8 @@ static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
20162016
if (ret)
20172017
return ret;
20182018

2019-
for (i = 0; i < adev->num_regs; i++) {
2020-
sprintf(reg_offset, "0x%x\n", adev->reset_dump_reg_list[i]);
2019+
for (i = 0; i < adev->reset_info.num_regs; i++) {
2020+
sprintf(reg_offset, "0x%x\n", adev->reset_info.reset_dump_reg_list[i]);
20212021
up_read(&adev->reset_domain->sem);
20222022
if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
20232023
return -EFAULT;
@@ -2074,9 +2074,9 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
20742074
if (ret)
20752075
goto error_free;
20762076

2077-
swap(adev->reset_dump_reg_list, tmp);
2078-
swap(adev->reset_dump_reg_value, new);
2079-
adev->num_regs = i;
2077+
swap(adev->reset_info.reset_dump_reg_list, tmp);
2078+
swap(adev->reset_info.reset_dump_reg_value, new);
2079+
adev->reset_info.num_regs = i;
20802080
up_write(&adev->reset_domain->sem);
20812081
ret = size;
20822082

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 8 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@
3232
#include <linux/slab.h>
3333
#include <linux/iommu.h>
3434
#include <linux/pci.h>
35-
#include <linux/devcoredump.h>
36-
#include <generated/utsrelease.h>
3735
#include <linux/pci-p2pdma.h>
3836
#include <linux/apple-gmux.h>
3937

@@ -3578,9 +3576,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
35783576
if (adev->asic_reset_res)
35793577
goto fail;
35803578

3581-
if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3582-
adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3583-
adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3579+
amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
35843580
} else {
35853581

35863582
task_barrier_full(&hive->tb);
@@ -5050,90 +5046,16 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
50505046

50515047
lockdep_assert_held(&adev->reset_domain->sem);
50525048

5053-
for (i = 0; i < adev->num_regs; i++) {
5054-
adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
5055-
trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
5056-
adev->reset_dump_reg_value[i]);
5057-
}
5058-
5059-
return 0;
5060-
}
5061-
5062-
#ifndef CONFIG_DEV_COREDUMP
5063-
static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
5064-
struct amdgpu_reset_context *reset_context)
5065-
{
5066-
}
5067-
#else
5068-
static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
5069-
size_t count, void *data, size_t datalen)
5070-
{
5071-
struct drm_printer p;
5072-
struct amdgpu_coredump_info *coredump = data;
5073-
struct drm_print_iterator iter;
5074-
int i;
5075-
5076-
iter.data = buffer;
5077-
iter.offset = 0;
5078-
iter.start = offset;
5079-
iter.remain = count;
5080-
5081-
p = drm_coredump_printer(&iter);
5082-
5083-
drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
5084-
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
5085-
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
5086-
drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
5087-
if (coredump->reset_task_info.pid)
5088-
drm_printf(&p, "process_name: %s PID: %d\n",
5089-
coredump->reset_task_info.process_name,
5090-
coredump->reset_task_info.pid);
5091-
5092-
if (coredump->reset_vram_lost)
5093-
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
5094-
if (coredump->adev->num_regs) {
5095-
drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
5096-
5097-
for (i = 0; i < coredump->adev->num_regs; i++)
5098-
drm_printf(&p, "0x%08x: 0x%08x\n",
5099-
coredump->adev->reset_dump_reg_list[i],
5100-
coredump->adev->reset_dump_reg_value[i]);
5101-
}
5049+
for (i = 0; i < adev->reset_info.num_regs; i++) {
5050+
adev->reset_info.reset_dump_reg_value[i] =
5051+
RREG32(adev->reset_info.reset_dump_reg_list[i]);
51025052

5103-
return count - iter.remain;
5104-
}
5105-
5106-
static void amdgpu_devcoredump_free(void *data)
5107-
{
5108-
kfree(data);
5109-
}
5110-
5111-
static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
5112-
struct amdgpu_reset_context *reset_context)
5113-
{
5114-
struct amdgpu_coredump_info *coredump;
5115-
struct drm_device *dev = adev_to_drm(adev);
5116-
5117-
coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
5118-
5119-
if (!coredump) {
5120-
DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
5121-
return;
5053+
trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
5054+
adev->reset_info.reset_dump_reg_value[i]);
51225055
}
51235056

5124-
coredump->reset_vram_lost = vram_lost;
5125-
5126-
if (reset_context->job && reset_context->job->vm)
5127-
coredump->reset_task_info = reset_context->job->vm->task_info;
5128-
5129-
coredump->adev = adev;
5130-
5131-
ktime_get_ts64(&coredump->reset_time);
5132-
5133-
dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
5134-
amdgpu_devcoredump_read, amdgpu_devcoredump_free);
5057+
return 0;
51355058
}
5136-
#endif
51375059

51385060
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
51395061
struct amdgpu_reset_context *reset_context)
@@ -5201,9 +5123,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
52015123

52025124
if (!r && amdgpu_ras_intr_triggered()) {
52035125
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5204-
if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
5205-
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
5206-
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
5126+
amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
52075127
}
52085128

52095129
amdgpu_ras_intr_cleared();

drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "amdgpu_rlc.h"
3030
#include "amdgpu_ras.h"
3131
#include "amdgpu_xcp.h"
32+
#include "amdgpu_xgmi.h"
3233

3334
/* delay 0.1 second to enable gfx off feature */
3435
#define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
@@ -501,6 +502,9 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
501502
{
502503
struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id];
503504
struct amdgpu_ring *kiq_ring = &kiq->ring;
505+
struct amdgpu_hive_info *hive;
506+
struct amdgpu_ras *ras;
507+
int hive_ras_recovery = 0;
504508
int i, r = 0;
505509
int j;
506510

@@ -521,6 +525,23 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
521525
RESET_QUEUES, 0, 0);
522526
}
523527

528+
/**
529+
* This is workaround: only skip kiq_ring test
530+
* during ras recovery in suspend stage for gfx9.4.3
531+
*/
532+
hive = amdgpu_get_xgmi_hive(adev);
533+
if (hive) {
534+
hive_ras_recovery = atomic_read(&hive->ras_recovery);
535+
amdgpu_put_xgmi_hive(hive);
536+
}
537+
538+
ras = amdgpu_ras_get_context(adev);
539+
if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
540+
ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) {
541+
spin_unlock(&kiq->ring_lock);
542+
return 0;
543+
}
544+
524545
if (kiq_ring->sched.ready && !adev->job_hang)
525546
r = amdgpu_ring_test_helper(kiq_ring);
526547
spin_unlock(&kiq->ring_lock);

drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1267,6 +1267,8 @@ int psp_xgmi_initialize(struct psp_context *psp, bool set_extended_data, bool lo
12671267
xgmi_cmd->cmd_id = TA_COMMAND_XGMI__INITIALIZE;
12681268

12691269
ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id);
1270+
/* note down the capability flag for XGMI TA */
1271+
psp->xgmi_context.xgmi_ta_caps = xgmi_cmd->caps_flag;
12701272

12711273
return ret;
12721274
}
@@ -1388,7 +1390,7 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
13881390

13891391
/* Fill in the shared memory with topology information as input */
13901392
topology_info_input = &xgmi_cmd->xgmi_in_message.get_topology_info;
1391-
xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO;
1393+
xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_TOPOLOGY_INFO;
13921394
topology_info_input->num_nodes = number_devices;
13931395

13941396
for (i = 0; i < topology_info_input->num_nodes; i++) {
@@ -1399,7 +1401,7 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
13991401
}
14001402

14011403
/* Invoke xgmi ta to get the topology information */
1402-
ret = psp_xgmi_invoke(psp, TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO);
1404+
ret = psp_xgmi_invoke(psp, TA_COMMAND_XGMI__GET_TOPOLOGY_INFO);
14031405
if (ret)
14041406
return ret;
14051407

@@ -1424,28 +1426,53 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
14241426

14251427
/* Invoke xgmi ta again to get the link information */
14261428
if (psp_xgmi_peer_link_info_supported(psp)) {
1427-
struct ta_xgmi_cmd_get_peer_link_info_output *link_info_output;
1429+
struct ta_xgmi_cmd_get_peer_link_info *link_info_output;
1430+
struct ta_xgmi_cmd_get_extend_peer_link_info *link_extend_info_output;
14281431
bool requires_reflection =
14291432
(psp->xgmi_context.supports_extended_data &&
14301433
get_extended_data) ||
14311434
amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
14321435
IP_VERSION(13, 0, 6);
1436+
bool ta_port_num_support = psp->xgmi_context.xgmi_ta_caps &
1437+
EXTEND_PEER_LINK_INFO_CMD_FLAG;
14331438

1434-
xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_PEER_LINKS;
1439+
/* populate the shared output buffer rather than the cmd input buffer
1440+
* with node_ids as the input for GET_PEER_LINKS command execution.
1441+
* This is required for GET_PEER_LINKS per xgmi ta implementation.
1442+
* The same requirement for GET_EXTEND_PEER_LINKS command.
1443+
*/
1444+
if (ta_port_num_support) {
1445+
link_extend_info_output = &xgmi_cmd->xgmi_out_message.get_extend_link_info;
1446+
1447+
for (i = 0; i < topology->num_nodes; i++)
1448+
link_extend_info_output->nodes[i].node_id = topology->nodes[i].node_id;
1449+
1450+
link_extend_info_output->num_nodes = topology->num_nodes;
1451+
xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_EXTEND_PEER_LINKS;
1452+
} else {
1453+
link_info_output = &xgmi_cmd->xgmi_out_message.get_link_info;
14351454

1436-
ret = psp_xgmi_invoke(psp, TA_COMMAND_XGMI__GET_PEER_LINKS);
1455+
for (i = 0; i < topology->num_nodes; i++)
1456+
link_info_output->nodes[i].node_id = topology->nodes[i].node_id;
14371457

1458+
link_info_output->num_nodes = topology->num_nodes;
1459+
xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_PEER_LINKS;
1460+
}
1461+
1462+
ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id);
14381463
if (ret)
14391464
return ret;
14401465

1441-
link_info_output = &xgmi_cmd->xgmi_out_message.get_link_info;
14421466
for (i = 0; i < topology->num_nodes; i++) {
1467+
uint8_t node_num_links = ta_port_num_support ?
1468+
link_extend_info_output->nodes[i].num_links : link_info_output->nodes[i].num_links;
14431469
/* accumulate num_links on extended data */
1444-
topology->nodes[i].num_links = get_extended_data ?
1445-
topology->nodes[i].num_links +
1446-
link_info_output->nodes[i].num_links :
1447-
((requires_reflection && topology->nodes[i].num_links) ? topology->nodes[i].num_links :
1448-
link_info_output->nodes[i].num_links);
1470+
if (get_extended_data) {
1471+
topology->nodes[i].num_links = topology->nodes[i].num_links + node_num_links;
1472+
} else {
1473+
topology->nodes[i].num_links = (requires_reflection && topology->nodes[i].num_links) ?
1474+
topology->nodes[i].num_links : node_num_links;
1475+
}
14491476

14501477
/* reflect the topology information for bi-directionality */
14511478
if (requires_reflection && topology->nodes[i].num_hops)

drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ struct psp_xgmi_context {
189189
struct ta_context context;
190190
struct psp_xgmi_topology_info top_info;
191191
bool supports_extended_data;
192+
uint8_t xgmi_ta_caps;
192193
};
193194

194195
struct psp_ras_context {

0 commit comments

Comments
 (0)