Skip to content

Commit a82866f

Browse files
committed
Merge tag 'amd-drm-next-6.15-2025-03-21' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.15-2025-03-21:

amdgpu:
- Refine nomodeset handling
- RAS fixes
- DCN 3.x fixes
- DMUB fixes
- eDP fixes
- SMU 14.0.2 fixes
- SMU 13.0.6 fixes
- SMU 13.0.12 fixes
- SDMA engine reset fixes
- Enforce Isolation fixes
- Runtime workload profile ref count fixes
- Documentation fixes
- SR-IOV fixes
- MES fixes
- GC 11.5 cleaner shader support
- SDMA VM invalidation fixes
- IP discovery improvements for GC based chips

amdkfd:
- Dequeue wait count fixes
- Precise memops fixes

radeon:
- Code cleanup

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250321210909.2809595-1-alexander.deucher@amd.com
2 parents: f72e21e + 7547510 — commit a82866f

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

73 files changed

+1248
-1052
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1194,9 +1194,15 @@ struct amdgpu_device {
11941194
bool debug_exp_resets;
11951195
bool debug_disable_gpu_ring_reset;
11961196

1197-
bool enforce_isolation[MAX_XCP];
1198-
/* Added this mutex for cleaner shader isolation between GFX and compute processes */
1197+
/* Protection for the following isolation structure */
11991198
struct mutex enforce_isolation_mutex;
1199+
bool enforce_isolation[MAX_XCP];
1200+
struct amdgpu_isolation {
1201+
void *owner;
1202+
struct dma_fence *spearhead;
1203+
struct amdgpu_sync active;
1204+
struct amdgpu_sync prev;
1205+
} isolation[MAX_XCP];
12001206

12011207
struct amdgpu_init_level *init_lvl;
12021208

@@ -1482,6 +1488,9 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
14821488
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev);
14831489
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
14841490
struct dma_fence *gang);
1491+
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
1492+
struct amdgpu_ring *ring,
1493+
struct amdgpu_job *job);
14851494
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
14861495
ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
14871496
ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);

drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c

Lines changed: 23 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -391,6 +391,7 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
391391
{
392392
struct aca_bank_node *node;
393393
struct aca_bank *bank;
394+
int r;
394395

395396
if (!adev->cper.enabled)
396397
return;
@@ -402,11 +403,27 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
402403

403404
/* UEs must be encoded into separate CPER entries */
404405
if (type == ACA_SMU_TYPE_UE) {
406+
struct aca_banks de_banks;
407+
408+
aca_banks_init(&de_banks);
405409
list_for_each_entry(node, &banks->list, node) {
406410
bank = &node->bank;
407-
if (amdgpu_cper_generate_ue_record(adev, bank))
408-
dev_warn(adev->dev, "fail to generate ue cper records\n");
411+
if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
412+
r = aca_banks_add_bank(&de_banks, bank);
413+
if (r)
414+
dev_warn(adev->dev, "fail to add de banks, ret = %d\n", r);
415+
} else {
416+
if (amdgpu_cper_generate_ue_record(adev, bank))
417+
dev_warn(adev->dev, "fail to generate ue cper records\n");
418+
}
419+
}
420+
421+
if (!list_empty(&de_banks.list)) {
422+
if (amdgpu_cper_generate_ce_records(adev, &de_banks, de_banks.nr_banks))
423+
dev_warn(adev->dev, "fail to generate de cper records\n");
409424
}
425+
426+
aca_banks_release(&de_banks);
410427
} else {
411428
/*
412429
* SMU_TYPE_CE banks are combined into 1 CPER entries,
@@ -541,6 +558,10 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h
541558
if (ret)
542559
return ret;
543560

561+
/* DEs may contain in CEs or UEs */
562+
if (type != ACA_ERROR_TYPE_DEFERRED)
563+
aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data);
564+
544565
return aca_log_aca_error(handle, type, err_data);
545566
}
546567

drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -76,11 +76,17 @@ struct ras_query_context;
7676
#define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */
7777
#define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */
7878

79-
#define ACA_BANK_ERR_CE_DE_DECODE(bank) \
80-
((ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
81-
ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS])) ? \
82-
ACA_ERROR_TYPE_DEFERRED : \
83-
ACA_ERROR_TYPE_CE)
79+
#define ACA_BANK_ERR_IS_DEFFERED(bank) \
80+
(ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
81+
ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS]))
82+
83+
#define ACA_BANK_ERR_CE_DE_DECODE(bank) \
84+
(ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
85+
ACA_ERROR_TYPE_CE)
86+
87+
#define ACA_BANK_ERR_UE_DE_DECODE(bank) \
88+
(ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
89+
ACA_ERROR_TYPE_UE)
8490

8591
enum aca_reg_idx {
8692
ACA_REG_IDX_CTL = 0,

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -491,7 +491,7 @@ static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
491491
if (ret)
492492
return ret;
493493

494-
return amdgpu_sync_fence(sync, vm->last_update);
494+
return amdgpu_sync_fence(sync, vm->last_update, GFP_KERNEL);
495495
}
496496

497497
static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
@@ -1249,7 +1249,7 @@ static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
12491249

12501250
(void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
12511251

1252-
(void)amdgpu_sync_fence(sync, bo_va->last_pt_update);
1252+
(void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);
12531253

12541254
return 0;
12551255
}
@@ -1273,7 +1273,7 @@ static int update_gpuvm_pte(struct kgd_mem *mem,
12731273
return ret;
12741274
}
12751275

1276-
return amdgpu_sync_fence(sync, bo_va->last_pt_update);
1276+
return amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);
12771277
}
12781278

12791279
static int map_bo_to_gpuvm(struct kgd_mem *mem,
@@ -2913,7 +2913,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
29132913
}
29142914
dma_resv_for_each_fence(&cursor, bo->tbo.base.resv,
29152915
DMA_RESV_USAGE_KERNEL, fence) {
2916-
ret = amdgpu_sync_fence(&sync_obj, fence);
2916+
ret = amdgpu_sync_fence(&sync_obj, fence, GFP_KERNEL);
29172917
if (ret) {
29182918
pr_debug("Memory eviction: Sync BO fence failed. Try again\n");
29192919
goto validate_map_fail;

drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -455,10 +455,10 @@ static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
455455
return umin(rec_len, chunk);
456456
}
457457

458-
void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
459-
void *src, int count)
458+
void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
460459
{
461460
u64 pos, wptr_old, rptr = *ring->rptr_cpu_addr & ring->ptr_mask;
461+
int rec_cnt_dw = count >> 2;
462462
u32 chunk, ent_sz;
463463
u8 *s = (u8 *)src;
464464

@@ -485,6 +485,9 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
485485
s += chunk;
486486
}
487487

488+
if (ring->count_dw < rec_cnt_dw)
489+
ring->count_dw = 0;
490+
488491
/* the buffer is overflow, adjust rptr */
489492
if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
490493
((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
@@ -501,12 +504,10 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
501504
pos = rptr;
502505
} while (!amdgpu_cper_is_hdr(ring, rptr));
503506
}
504-
mutex_unlock(&ring->adev->cper.ring_lock);
505507

506-
if (ring->count_dw >= (count >> 2))
507-
ring->count_dw -= (count >> 2);
508-
else
509-
ring->count_dw = 0;
508+
if (ring->count_dw >= rec_cnt_dw)
509+
ring->count_dw -= rec_cnt_dw;
510+
mutex_unlock(&ring->adev->cper.ring_lock);
510511
}
511512

512513
static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)

drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -428,7 +428,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
428428
dma_fence_put(old);
429429
}
430430

431-
r = amdgpu_sync_fence(&p->sync, fence);
431+
r = amdgpu_sync_fence(&p->sync, fence, GFP_KERNEL);
432432
dma_fence_put(fence);
433433
if (r)
434434
return r;
@@ -450,7 +450,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
450450
return r;
451451
}
452452

453-
r = amdgpu_sync_fence(&p->sync, fence);
453+
r = amdgpu_sync_fence(&p->sync, fence, GFP_KERNEL);
454454
dma_fence_put(fence);
455455
return r;
456456
}
@@ -1111,7 +1111,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
11111111
struct drm_gpu_scheduler *sched = entity->rq->sched;
11121112
struct amdgpu_ring *ring = to_amdgpu_ring(sched);
11131113

1114-
if (amdgpu_vmid_uses_reserved(adev, vm, ring->vm_hub))
1114+
if (amdgpu_vmid_uses_reserved(vm, ring->vm_hub))
11151115
return -EINVAL;
11161116
}
11171117
}
@@ -1124,7 +1124,8 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
11241124
if (r)
11251125
return r;
11261126

1127-
r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update);
1127+
r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update,
1128+
GFP_KERNEL);
11281129
if (r)
11291130
return r;
11301131

@@ -1135,7 +1136,8 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
11351136
if (r)
11361137
return r;
11371138

1138-
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
1139+
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update,
1140+
GFP_KERNEL);
11391141
if (r)
11401142
return r;
11411143
}
@@ -1154,7 +1156,8 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
11541156
if (r)
11551157
return r;
11561158

1157-
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
1159+
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update,
1160+
GFP_KERNEL);
11581161
if (r)
11591162
return r;
11601163
}
@@ -1167,7 +1170,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
11671170
if (r)
11681171
return r;
11691172

1170-
r = amdgpu_sync_fence(&p->sync, vm->last_update);
1173+
r = amdgpu_sync_fence(&p->sync, vm->last_update, GFP_KERNEL);
11711174
if (r)
11721175
return r;
11731176

@@ -1248,7 +1251,8 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
12481251
continue;
12491252
}
12501253

1251-
r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence);
1254+
r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence,
1255+
GFP_KERNEL);
12521256
dma_fence_put(fence);
12531257
if (r)
12541258
return r;

0 commit comments

Comments
 (0)