Skip to content

Commit 2ba9f67

Browse files
committed
Merge tag 'drm-next-2024-11-29' of https://gitlab.freedesktop.org/drm/kernel
Pull drm fixes from Dave Airlie:
"Merge window fixes, mostly amdgpu and xe, with a few other minor ones, all looks fairly normal,

i915:
- hdcp: Fix when the first read and write are retried

xe:
- Wake up waiters after wait condition set to true
- Mark the preempt fence workqueue as reclaim
- Update xe2 graphics name string
- Fix a couple of guc submit races
- Fix pat index usage in migrate
- Ensure non-cached migrate pagetable bo mappings
- Take a PM ref in the delayed snapshot capture worker

amdgpu:
- SMU 13.0.6 fixes
- XGMI fixes
- SMU 13.0.7 fixes
- Misc code cleanups
- Plane refcount fixes
- DCN 4.0.1 fixes
- DC power fixes
- DTO fixes
- NBIO 7.11 fixes
- SMU 14.0.x fixes
- Reset fixes
- Enable DC on LoongArch
- Sysfs hotplug warning fix
- Misc small fixes
- VCN 4.0.3 fix
- Slab usage fix
- Jpeg delayed work fix

amdkfd:
- wptr handling fixes

radeon:
- Use ttm_bo_move_null()
- Constify struct pci_device_id
- Fix spurious hotplug
- HPD fix

rockchip
- fix 32-bit build"

* tag 'drm-next-2024-11-29' of https://gitlab.freedesktop.org/drm/kernel: (48 commits)
  drm/xe: Take PM ref in delayed snapshot capture worker
  drm/xe/migrate: use XE_BO_FLAG_PAGETABLE
  drm/xe/migrate: fix pat index usage
  drm/xe/guc_submit: fix race around suspend_pending
  drm/xe/guc_submit: fix race around pending_disable
  drm/xe: Update xe2_graphics name string
  drm/rockchip: avoid 64-bit division
  Revert "drm/radeon: Delay Connector detecting when HPD singals is unstable"
  drm/amdgpu/jpeg: cancel the jpeg worker
  drm/amdgpu: fix usage slab after free
  drm/amdgpu/vcn: reset fw_shared when VCPU buffers corrupted on vcn v4.0.3
  drm/amdgpu: Fix sysfs warning when hotplugging
  drm/amdgpu: Add sysfs interface for vcn reset mask
  drm/amdgpu/gmc7: fix wait_for_idle callers
  drm/amd/pm: Remove arcturus min power limit
  drm/amd/pm: skip setting the power source on smu v14.0.2/3
  drm/amd/pm: disable pcie speed switching on Intel platform for smu v14.0.2/3
  drm/amdkfd: Use the correct wptr size
  drm/xe: Mark preempt fence workqueue as reclaim
  drm/xe/ufence: Wake up waiters after setting ufence->signalled
  ...
2 parents 517363b + 9794b89 commit 2ba9f67

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+690
-193
lines changed

drivers/gpu/drm/amd/amdgpu/aldebaran.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
330330
}
331331

332332
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
333+
amdgpu_set_init_level(tmp_adev,
334+
AMDGPU_INIT_LEVEL_RESET_RECOVERY);
333335
dev_info(tmp_adev->dev,
334336
"GPU reset succeeded, trying to resume\n");
335337
r = aldebaran_mode2_restore_ip(tmp_adev);
@@ -375,6 +377,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
375377
tmp_adev);
376378

377379
if (!r) {
380+
amdgpu_set_init_level(tmp_adev,
381+
AMDGPU_INIT_LEVEL_DEFAULT);
378382
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
379383

380384
r = amdgpu_ib_ring_tests(tmp_adev);

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,7 @@ struct amdgpu_mqd {
839839
enum amdgpu_init_lvl_id {
840840
AMDGPU_INIT_LEVEL_DEFAULT,
841841
AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
842+
AMDGPU_INIT_LEVEL_RESET_RECOVERY,
842843
};
843844

844845
struct amdgpu_init_level {

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,11 @@ struct amdgpu_init_level amdgpu_init_default = {
156156
.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
157157
};
158158

159+
struct amdgpu_init_level amdgpu_init_recovery = {
160+
.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
161+
.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
162+
};
163+
159164
/*
160165
* Minimal blocks needed to be initialized before a XGMI hive can be reset. This
161166
* is used for cases like reset on initialization where the entire hive needs to
@@ -182,6 +187,9 @@ void amdgpu_set_init_level(struct amdgpu_device *adev,
182187
case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
183188
adev->init_lvl = &amdgpu_init_minimal_xgmi;
184189
break;
190+
case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
191+
adev->init_lvl = &amdgpu_init_recovery;
192+
break;
185193
case AMDGPU_INIT_LEVEL_DEFAULT:
186194
fallthrough;
187195
default:
@@ -3250,7 +3258,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
32503258
return r;
32513259
}
32523260

3253-
if (!amdgpu_in_reset(adev))
3261+
if (!amdgpu_reset_in_recovery(adev))
32543262
amdgpu_ras_set_error_query_ready(adev, true);
32553263

32563264
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
@@ -4669,8 +4677,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
46694677
int idx;
46704678
bool px;
46714679

4672-
amdgpu_fence_driver_sw_fini(adev);
46734680
amdgpu_device_ip_fini(adev);
4681+
amdgpu_fence_driver_sw_fini(adev);
46744682
amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
46754683
adev->accel_working = false;
46764684
dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
@@ -5419,7 +5427,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
54195427
struct list_head *device_list_handle;
54205428
bool full_reset, vram_lost = false;
54215429
struct amdgpu_device *tmp_adev;
5422-
int r;
5430+
int r, init_level;
54235431

54245432
device_list_handle = reset_context->reset_device_list;
54255433

@@ -5428,10 +5436,18 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
54285436

54295437
full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
54305438

5439+
/**
5440+
* If it's reset on init, it's default init level, otherwise keep level
5441+
* as recovery level.
5442+
*/
5443+
if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
5444+
init_level = AMDGPU_INIT_LEVEL_DEFAULT;
5445+
else
5446+
init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
5447+
54315448
r = 0;
54325449
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5433-
/* After reset, it's default init level */
5434-
amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
5450+
amdgpu_set_init_level(tmp_adev, init_level);
54355451
if (full_reset) {
54365452
/* post card */
54375453
amdgpu_ras_set_fed(tmp_adev, false);
@@ -5518,6 +5534,9 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
55185534

55195535
out:
55205536
if (!r) {
5537+
/* IP init is complete now, set level as default */
5538+
amdgpu_set_init_level(tmp_adev,
5539+
AMDGPU_INIT_LEVEL_DEFAULT);
55215540
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
55225541
r = amdgpu_ib_ring_tests(tmp_adev);
55235542
if (r) {

drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1778,9 +1778,11 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
17781778

17791779
void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev)
17801780
{
1781-
amdgpu_gfx_sysfs_xcp_fini(adev);
1782-
amdgpu_gfx_sysfs_isolation_shader_fini(adev);
1783-
amdgpu_gfx_sysfs_reset_mask_fini(adev);
1781+
if (adev->dev->kobj.sd) {
1782+
amdgpu_gfx_sysfs_xcp_fini(adev);
1783+
amdgpu_gfx_sysfs_isolation_shader_fini(adev);
1784+
amdgpu_gfx_sysfs_reset_mask_fini(adev);
1785+
}
17841786
}
17851787

17861788
int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev,

drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,8 @@ int amdgpu_jpeg_sysfs_reset_mask_init(struct amdgpu_device *adev)
447447

448448
void amdgpu_jpeg_sysfs_reset_mask_fini(struct amdgpu_device *adev)
449449
{
450-
if (adev->jpeg.num_jpeg_inst)
451-
device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
450+
if (adev->dev->kobj.sd) {
451+
if (adev->jpeg.num_jpeg_inst)
452+
device_remove_file(adev->dev, &dev_attr_jpeg_reset_mask);
453+
}
452454
}

drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,8 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
137137
if (ret)
138138
return;
139139

140-
device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
140+
if (adev->dev->kobj.sd)
141+
device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
141142

142143
ttm_resource_manager_cleanup(man);
143144
ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, NULL);

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1298,7 +1298,7 @@ int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
12981298
struct ras_manager *obj;
12991299

13001300
/* in resume phase, no need to create aca fs node */
1301-
if (adev->in_suspend || amdgpu_in_reset(adev))
1301+
if (adev->in_suspend || amdgpu_reset_in_recovery(adev))
13021302
return 0;
13031303

13041304
obj = get_ras_manager(adev, blk);
@@ -3610,7 +3610,7 @@ static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
36103610
ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;
36113611

36123612
/* init event manager with node 0 on xgmi system */
3613-
if (!amdgpu_in_reset(adev)) {
3613+
if (!amdgpu_reset_in_recovery(adev)) {
36143614
if (!hive || adev->gmc.xgmi.node_id == 0)
36153615
ras_event_mgr_init(ras->event_mgr);
36163616
}
@@ -3825,7 +3825,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
38253825

38263826
r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
38273827
if (r) {
3828-
if (adev->in_suspend || amdgpu_in_reset(adev)) {
3828+
if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) {
38293829
/* in resume phase, if fail to enable ras,
38303830
* clean up all ras fs nodes, and disable ras */
38313831
goto cleanup;
@@ -3837,7 +3837,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
38373837
amdgpu_persistent_edc_harvesting(adev, ras_block);
38383838

38393839
/* in resume phase, no need to create ras fs node */
3840-
if (adev->in_suspend || amdgpu_in_reset(adev))
3840+
if (adev->in_suspend || amdgpu_reset_in_recovery(adev))
38413841
return 0;
38423842

38433843
ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
@@ -3967,7 +3967,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
39673967
amdgpu_ras_event_mgr_init(adev);
39683968

39693969
if (amdgpu_ras_aca_is_supported(adev)) {
3970-
if (amdgpu_in_reset(adev)) {
3970+
if (amdgpu_reset_in_recovery(adev)) {
39713971
if (amdgpu_aca_is_enabled(adev))
39723972
r = amdgpu_aca_reset(adev);
39733973
else

drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,3 +342,8 @@ void amdgpu_reset_get_desc(struct amdgpu_reset_context *rst_ctxt, char *buf,
342342
strscpy(buf, "unknown", len);
343343
}
344344
}
345+
346+
bool amdgpu_reset_in_recovery(struct amdgpu_device *adev)
347+
{
348+
return (adev->init_lvl->level == AMDGPU_INIT_LEVEL_RESET_RECOVERY);
349+
}

drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,4 +158,6 @@ extern struct amdgpu_reset_handler xgmi_reset_on_init_handler;
158158
int amdgpu_reset_do_xgmi_reset_on_init(
159159
struct amdgpu_reset_context *reset_context);
160160

161+
bool amdgpu_reset_in_recovery(struct amdgpu_device *adev);
162+
161163
#endif

drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,8 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
451451
if (!amdgpu_gpu_recovery)
452452
return;
453453

454-
if (adev->sdma.num_instances)
455-
device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
454+
if (adev->dev->kobj.sd) {
455+
if (adev->sdma.num_instances)
456+
device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
457+
}
456458
}

0 commit comments

Comments
 (0)