Skip to content

Commit e08b575

Browse files
committed
Merge tag 'drm-next-2024-01-19' of git://anongit.freedesktop.org/drm/drm
Pull more drm fixes from Dave Airlie: "This is mostly amdgpu and xe fixes, with an amdkfd and nouveau fix thrown in. The amdgpu ones are just the usual couple of weeks of fixes. The xe ones are a bunch of cleanups for the new xe driver, the fix you put in on the merge commit and the kconfig fix that was hiding the problem from me. amdgpu: - DSC fixes - DC resource pool fixes - OTG fix - DML2 fixes - Aux fix - GFX10 RLC firmware handling fix - Revert a broken workaround for SMU 13.0.2 - DC writeback fix - Enable gfxoff when ROCm apps are active on gfx11 with the proper FW version amdkfd: - Fix dma-buf exports using GEM handles nouveau: - fix an unneeded WARN_ON triggering xe: - Fix for definition of wakeref_t - Fix for an error code aliasing - Fix for VM_UNBIND_ALL in the case there are no bound VMAs - Fixes for a number of __iomem address space mismatches reported by sparse - Fixes for the assignment of exec_queue priority - A fix for skip_guc_pc not taking effect - Workaround for a build problem on GCC 11 - A couple of fixes for error paths - Fix a Flat CCS compression metadata copy issue - Fix a misplaced array bounds check - Don't have display support depend on EXPERT (as discussed on IRC)" * tag 'drm-next-2024-01-19' of git://anongit.freedesktop.org/drm/drm: (71 commits) nouveau/vmm: don't set addr on the fail path to avoid warning drm/amdgpu: Enable GFXOFF for Compute on GFX11 drm/amd/display: Drop 'acrtc' and add 'new_crtc_state' NULL check for writeback requests. 
drm/amdgpu: revert "Adjust removal control flow for smu v13_0_2" drm/amdkfd: init drm_client with funcs hook drm/amd/display: Fix a switch statement in populate_dml_output_cfg_from_stream_state() drm/amdgpu: Fix the null pointer when load rlc firmware drm/amd/display: Align the returned error code with legacy DP drm/amd/display: Fix DML2 watermark calculation drm/amd/display: Clear OPTC mem select on disable drm/amd/display: Port DENTIST hang and TDR fixes to OTG disable W/A drm/amd/display: Add logging resource checks drm/amd/display: Init link enc resources in dc_state only if res_pool presents drm/amd/display: Fix late derefrence 'dsc' check in 'link_set_dsc_pps_packet()' drm/amd/display: Avoid enum conversion warning drm/amd/pm: Fix smuv13.0.6 current clock reporting drm/amd/pm: Add error log for smu v13.0.6 reset drm/amdkfd: Fix 'node' NULL check in 'svm_range_get_range_boundaries()' drm/amdgpu: drop exp hw support check for GC 9.4.3 drm/amdgpu: move debug options init prior to amdgpu device init ...
2 parents ab1e2d0 + 009f0a6 commit e08b575

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

85 files changed

+692
-463
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,8 +254,6 @@ extern int amdgpu_agp;
254254

255255
extern int amdgpu_wbrf;
256256

257-
extern int fw_bo_location;
258-
259257
#define AMDGPU_VM_MAX_NUM_CTX 4096
260258
#define AMDGPU_SG_THRESHOLD (256*1024*1024)
261259
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS 3000
@@ -1146,6 +1144,7 @@ struct amdgpu_device {
11461144
bool debug_vm;
11471145
bool debug_largebar;
11481146
bool debug_disable_soft_recovery;
1147+
bool debug_use_vram_fw_buf;
11491148
};
11501149

11511150
static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
138138
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
139139
}
140140

141+
static const struct drm_client_funcs kfd_client_funcs = {
142+
.unregister = drm_client_release,
143+
};
141144
void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
142145
{
143146
int i;
@@ -161,7 +164,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
161164
.enable_mes = adev->enable_mes,
162165
};
163166

164-
ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", NULL);
167+
ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", &kfd_client_funcs);
165168
if (ret) {
166169
dev_err(adev->dev, "Failed to init DRM client: %d\n", ret);
167170
return;
@@ -695,10 +698,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
695698
void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle)
696699
{
697700
enum amd_powergating_state state = idle ? AMD_PG_STATE_GATE : AMD_PG_STATE_UNGATE;
698-
/* Temporary workaround to fix issues observed in some
699-
* compute applications when GFXOFF is enabled on GFX11.
700-
*/
701-
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11) {
701+
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11 &&
702+
((adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK) <= 64)) {
702703
pr_debug("GFXOFF is %s\n", idle ? "enabled" : "disabled");
703704
amdgpu_gfx_off_ctrl(adev, idle);
704705
} else if ((IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 9) &&

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
311311
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo);
312312

313313
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
314-
struct dma_fence **ef);
314+
struct dma_fence __rcu **ef);
315315
int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
316316
struct kfd_vm_fault_info *info);
317317
int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd,

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2802,7 +2802,7 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
28022802
put_task_struct(usertask);
28032803
}
28042804

2805-
static void replace_eviction_fence(struct dma_fence **ef,
2805+
static void replace_eviction_fence(struct dma_fence __rcu **ef,
28062806
struct dma_fence *new_ef)
28072807
{
28082808
struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
@@ -2837,7 +2837,7 @@ static void replace_eviction_fence(struct dma_fence **ef,
28372837
* 7. Add fence to all PD and PT BOs.
28382838
* 8. Unreserve all BOs
28392839
*/
2840-
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
2840+
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef)
28412841
{
28422842
struct amdkfd_process_info *process_info = info;
28432843
struct amdgpu_vm *peer_vm;

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 2 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1544,6 +1544,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
15441544
return true;
15451545

15461546
fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1547+
release_firmware(adev->pm.fw);
15471548
if (fw_ver < 0x00160e00)
15481549
return true;
15491550
}
@@ -5245,7 +5246,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
52455246
struct amdgpu_device *tmp_adev = NULL;
52465247
bool need_full_reset, skip_hw_reset, vram_lost = false;
52475248
int r = 0;
5248-
bool gpu_reset_for_dev_remove = 0;
52495249

52505250
/* Try reset handler method first */
52515251
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -5265,10 +5265,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
52655265
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
52665266
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
52675267

5268-
gpu_reset_for_dev_remove =
5269-
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5270-
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5271-
52725268
/*
52735269
* ASIC reset has to be done on all XGMI hive nodes ASAP
52745270
* to allow proper links negotiation in FW (within 1 sec)
@@ -5311,18 +5307,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
53115307
amdgpu_ras_intr_cleared();
53125308
}
53135309

5314-
/* Since the mode1 reset affects base ip blocks, the
5315-
* phase1 ip blocks need to be resumed. Otherwise there
5316-
* will be a BIOS signature error and the psp bootloader
5317-
* can't load kdb on the next amdgpu install.
5318-
*/
5319-
if (gpu_reset_for_dev_remove) {
5320-
list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5321-
amdgpu_device_ip_resume_phase1(tmp_adev);
5322-
5323-
goto end;
5324-
}
5325-
53265310
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
53275311
if (need_full_reset) {
53285312
/* post card */
@@ -5559,11 +5543,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
55595543
int i, r = 0;
55605544
bool need_emergency_restart = false;
55615545
bool audio_suspended = false;
5562-
bool gpu_reset_for_dev_remove = false;
5563-
5564-
gpu_reset_for_dev_remove =
5565-
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5566-
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
55675546

55685547
/*
55695548
* Special case: RAS triggered and full reset isn't supported
@@ -5601,7 +5580,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
56015580
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
56025581
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
56035582
list_add_tail(&tmp_adev->reset_list, &device_list);
5604-
if (gpu_reset_for_dev_remove && adev->shutdown)
5583+
if (adev->shutdown)
56055584
tmp_adev->shutdown = true;
56065585
}
56075586
if (!list_is_first(&adev->reset_list, &device_list))
@@ -5686,10 +5665,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
56865665

56875666
retry: /* Rest of adevs pre asic reset from XGMI hive. */
56885667
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5689-
if (gpu_reset_for_dev_remove) {
5690-
/* Workaroud for ASICs need to disable SMC first */
5691-
amdgpu_device_smu_fini_early(tmp_adev);
5692-
}
56935668
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
56945669
/*TODO Should we stop ?*/
56955670
if (r) {
@@ -5721,9 +5696,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
57215696
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
57225697
if (r && r == -EAGAIN)
57235698
goto retry;
5724-
5725-
if (!r && gpu_reset_for_dev_remove)
5726-
goto recover_end;
57275699
}
57285700

57295701
skip_hw_reset:
@@ -5779,7 +5751,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
57795751
amdgpu_ras_set_error_query_ready(tmp_adev, true);
57805752
}
57815753

5782-
recover_end:
57835754
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
57845755
reset_list);
57855756
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1963,8 +1963,6 @@ static int amdgpu_discovery_set_gc_ip_blocks(struct amdgpu_device *adev)
19631963
amdgpu_device_ip_block_add(adev, &gfx_v9_0_ip_block);
19641964
break;
19651965
case IP_VERSION(9, 4, 3):
1966-
if (!amdgpu_exp_hw_support)
1967-
return -EINVAL;
19681966
amdgpu_device_ip_block_add(adev, &gfx_v9_4_3_ip_block);
19691967
break;
19701968
case IP_VERSION(10, 1, 10):

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 8 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ enum AMDGPU_DEBUG_MASK {
128128
AMDGPU_DEBUG_VM = BIT(0),
129129
AMDGPU_DEBUG_LARGEBAR = BIT(1),
130130
AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
131+
AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
131132
};
132133

133134
unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -210,7 +211,6 @@ int amdgpu_seamless = -1; /* auto */
210211
uint amdgpu_debug_mask;
211212
int amdgpu_agp = -1; /* auto */
212213
int amdgpu_wbrf = -1;
213-
int fw_bo_location = -1;
214214

215215
static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
216216

@@ -990,10 +990,6 @@ MODULE_PARM_DESC(wbrf,
990990
"Enable Wifi RFI interference mitigation (0 = disabled, 1 = enabled, -1 = auto(default)");
991991
module_param_named(wbrf, amdgpu_wbrf, int, 0444);
992992

993-
MODULE_PARM_DESC(fw_bo_location,
994-
"location to put firmware bo for frontdoor loading (-1 = auto (default), 0 = on ram, 1 = on vram");
995-
module_param(fw_bo_location, int, 0644);
996-
997993
/* These devices are not supported by amdgpu.
998994
* They are supported by the mach64, r128, radeon drivers
999995
*/
@@ -2122,6 +2118,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
21222118
pr_info("debug: soft reset for GPU recovery disabled\n");
21232119
adev->debug_disable_soft_recovery = true;
21242120
}
2121+
2122+
if (amdgpu_debug_mask & AMDGPU_DEBUG_USE_VRAM_FW_BUF) {
2123+
pr_info("debug: place fw in vram for frontdoor loading\n");
2124+
adev->debug_use_vram_fw_buf = true;
2125+
}
21252126
}
21262127

21272128
static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
@@ -2233,6 +2234,8 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
22332234

22342235
pci_set_drvdata(pdev, ddev);
22352236

2237+
amdgpu_init_debug_options(adev);
2238+
22362239
ret = amdgpu_driver_load_kms(adev, flags);
22372240
if (ret)
22382241
goto err_pci;
@@ -2313,8 +2316,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
23132316
amdgpu_get_secondary_funcs(adev);
23142317
}
23152318

2316-
amdgpu_init_debug_options(adev);
2317-
23182319
return 0;
23192320

23202321
err_pci:
@@ -2336,38 +2337,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
23362337
pm_runtime_forbid(dev->dev);
23372338
}
23382339

2339-
if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&
2340-
!amdgpu_sriov_vf(adev)) {
2341-
bool need_to_reset_gpu = false;
2342-
2343-
if (adev->gmc.xgmi.num_physical_nodes > 1) {
2344-
struct amdgpu_hive_info *hive;
2345-
2346-
hive = amdgpu_get_xgmi_hive(adev);
2347-
if (hive->device_remove_count == 0)
2348-
need_to_reset_gpu = true;
2349-
hive->device_remove_count++;
2350-
amdgpu_put_xgmi_hive(hive);
2351-
} else {
2352-
need_to_reset_gpu = true;
2353-
}
2354-
2355-
/* Workaround for ASICs need to reset SMU.
2356-
* Called only when the first device is removed.
2357-
*/
2358-
if (need_to_reset_gpu) {
2359-
struct amdgpu_reset_context reset_context;
2360-
2361-
adev->shutdown = true;
2362-
memset(&reset_context, 0, sizeof(reset_context));
2363-
reset_context.method = AMD_RESET_METHOD_NONE;
2364-
reset_context.reset_req_dev = adev;
2365-
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2366-
set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
2367-
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
2368-
}
2369-
}
2370-
23712340
amdgpu_driver_unload_kms(dev);
23722341

23732342
/*

drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,21 +1045,28 @@ int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
10451045
* seconds, so here, we just pick up three parts for emulation.
10461046
*/
10471047
ret = memcmp(vram_ptr, cptr, 10);
1048-
if (ret)
1049-
return ret;
1048+
if (ret) {
1049+
ret = -EIO;
1050+
goto release_buffer;
1051+
}
10501052

10511053
ret = memcmp(vram_ptr + (size / 2), cptr, 10);
1052-
if (ret)
1053-
return ret;
1054+
if (ret) {
1055+
ret = -EIO;
1056+
goto release_buffer;
1057+
}
10541058

10551059
ret = memcmp(vram_ptr + size - 10, cptr, 10);
1056-
if (ret)
1057-
return ret;
1060+
if (ret) {
1061+
ret = -EIO;
1062+
goto release_buffer;
1063+
}
10581064

1065+
release_buffer:
10591066
amdgpu_bo_free_kernel(&vram_bo, &vram_gpu,
10601067
&vram_ptr);
10611068

1062-
return 0;
1069+
return ret;
10631070
}
10641071

10651072
static ssize_t current_memory_partition_show(

drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1105,7 +1105,12 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
11051105
if (amdgpu_dpm_read_sensor(adev,
11061106
AMDGPU_PP_SENSOR_GPU_AVG_POWER,
11071107
(void *)&ui32, &ui32_size)) {
1108-
return -EINVAL;
1108+
/* fall back to input power for backwards compat */
1109+
if (amdgpu_dpm_read_sensor(adev,
1110+
AMDGPU_PP_SENSOR_GPU_INPUT_POWER,
1111+
(void *)&ui32, &ui32_size)) {
1112+
return -EINVAL;
1113+
}
11091114
}
11101115
ui32 >>= 8;
11111116
break;

drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ static int psp_sw_init(void *handle)
466466
}
467467

468468
ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
469-
(amdgpu_sriov_vf(adev) || fw_bo_location == 1) ?
469+
(amdgpu_sriov_vf(adev) || adev->debug_use_vram_fw_buf) ?
470470
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
471471
&psp->fw_pri_bo,
472472
&psp->fw_pri_mc_addr,

0 commit comments

Comments
 (0)