
Commit 7425c43

Author: Thomas Hellström (committed)
drm/xe/migrate: Fix CCS copy for small VRAM copy chunks
Since the migrate code is using the identity map for addressing VRAM, copy chunks may become as small as 64K if the VRAM resource is fragmented.

However, a chunk size smaller than 1MiB may lead to the *next* chunk's offset into the CCS metadata backup memory not being page-aligned. The XY_CTRL_SURF_COPY_BLT command can't handle that, and even if it could, the current code doesn't handle the offset calculation correctly.

To fix this, make sure we align the size of VRAM copy chunks to 1MiB. If the remaining data to copy is smaller than that, that's not a problem, so use the remaining size. If the VRAM copy chunk becomes fragmented due to the size alignment restriction, don't use the identity map, but instead emit PTEs into the page-table like we do for system memory.

v2:
- Rebase.
v3:
- Future proof somewhat by taking into account the real data size to flat CCS metadata size ratio. (Matt Roper)
- Invert a couple of if-statements for better readability.
- Fix support for 4K-granularity VRAM sizes. (Tested on DG1.)
v4:
- Fix up code comments.
- Fix debug printout format typo.
v5:
- Add a Fixes: tag.

Cc: Matt Roper <matthew.d.roper@intel.com>
Cc: Matthew Auld <matthew.william.auld@gmail.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Fixes: e89b384 ("drm/xe/migrate: Update emit_pte to cope with a size level than 4k")
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240110163415.524165-1-thomas.hellstrom@linux.intel.com
(cherry picked from commit ef51d75)
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
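To make the alignment issue concrete, here is a minimal userspace sketch of the arithmetic, assuming a 1:256 main-data-to-CCS ratio (the real ratio is queried through xe_device_ccs_bytes() and is hardware-dependent); the ccs_bytes() helper below is purely illustrative and not part of the driver.

/* Illustrative only: why VRAM copy chunks are kept at 1 MiB granularity.
 * Assumes 256 bytes of main data per byte of CCS metadata; the driver
 * queries the real ratio with xe_device_ccs_bytes().
 */
#include <assert.h>
#include <stdio.h>

#define SZ_4K   0x1000ULL
#define SZ_64K  0x10000ULL
#define SZ_1M   0x100000ULL

static unsigned long long ccs_bytes(unsigned long long main_bytes)
{
        return main_bytes / 256;        /* assumed flat-CCS ratio */
}

int main(void)
{
        /* A fragmented 64K chunk advances the CCS backup offset by only
         * 256 bytes, so the *next* chunk starts at a non-page-aligned
         * CCS offset, which XY_CTRL_SURF_COPY_BLT cannot address.
         */
        printf("64K chunk  -> CCS offset advances by %llu bytes\n",
               ccs_bytes(SZ_64K));      /* 256: not 4K-aligned */

        /* A 1 MiB chunk advances the CCS offset by exactly one 4K page. */
        printf("1MiB chunk -> CCS offset advances by %llu bytes\n",
               ccs_bytes(SZ_1M));       /* 4096: page-aligned */

        assert(ccs_bytes(SZ_1M) % SZ_4K == 0);
        return 0;
}

Under that assumption, only chunks that are multiples of 1 MiB advance the CCS backup offset in whole 4K pages, which is what the min_chunk_size alignment introduced in this patch guarantees.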
1 parent ec32f4f commit 7425c43

2 files changed: +80 −50 lines


drivers/gpu/drm/xe/tests/xe_migrate.c

Lines changed: 1 addition & 1 deletion
@@ -331,7 +331,7 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test)
         xe_res_first_sg(xe_bo_sg(pt), 0, pt->size, &src_it);
 
         emit_pte(m, bb, NUM_KERNEL_PDE - 1, xe_bo_is_vram(pt), false,
-                 &src_it, XE_PAGE_SIZE, pt);
+                 &src_it, XE_PAGE_SIZE, pt->ttm.resource);
 
         run_sanity_job(m, xe, bb, bb->len, "Writing PTE for our fake PT", test);
drivers/gpu/drm/xe/xe_migrate.c

Lines changed: 79 additions & 49 deletions
@@ -62,6 +62,8 @@ struct xe_migrate {
          * out of the pt_bo.
          */
         struct drm_suballoc_manager vm_update_sa;
+        /** @min_chunk_size: For dgfx, Minimum chunk size */
+        u64 min_chunk_size;
 };
 
 #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
@@ -363,6 +365,19 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
         if (err)
                 return ERR_PTR(err);
 
+        if (IS_DGFX(xe)) {
+                if (xe_device_has_flat_ccs(xe))
+                        /* min chunk size corresponds to 4K of CCS Metadata */
+                        m->min_chunk_size = SZ_4K * SZ_64K /
+                                xe_device_ccs_bytes(xe, SZ_64K);
+                else
+                        /* Somewhat arbitrary to avoid a huge amount of blits */
+                        m->min_chunk_size = SZ_64K;
+                m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
+                drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
+                        (unsigned long long)m->min_chunk_size);
+        }
+
         return m;
 }
 
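As a sanity check of the formula above, assuming xe_device_ccs_bytes(xe, SZ_64K) returns 256 (the 1:256 ratio assumed earlier; the actual value is platform-dependent):

        min_chunk_size = SZ_4K * SZ_64K / xe_device_ccs_bytes(xe, SZ_64K)
                       = 4096 * 65536 / 256
                       = 1048576 bytes = 1 MiB

so 4K of CCS metadata corresponds to 1 MiB of main data, matching the 1 MiB chunk alignment described in the commit message. roundup_pow_of_two() leaves that value unchanged since it is already a power of two, and the SZ_64K fallback for devices without flat CCS merely bounds the number of blits per pass.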

@@ -374,16 +389,35 @@ static u64 max_mem_transfer_per_pass(struct xe_device *xe)
         return MAX_PREEMPTDISABLE_TRANSFER;
 }
 
-static u64 xe_migrate_res_sizes(struct xe_device *xe, struct xe_res_cursor *cur)
+static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
 {
-        /*
-         * For VRAM we use identity mapped pages so we are limited to current
-         * cursor size. For system we program the pages ourselves so we have no
-         * such limitation.
-         */
-        return min_t(u64, max_mem_transfer_per_pass(xe),
-                     mem_type_is_vram(cur->mem_type) ? cur->size :
-                     cur->remaining);
+        struct xe_device *xe = tile_to_xe(m->tile);
+        u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);
+
+        if (mem_type_is_vram(cur->mem_type)) {
+                /*
+                 * VRAM we want to blit in chunks with sizes aligned to
+                 * min_chunk_size in order for the offset to CCS metadata to be
+                 * page-aligned. If it's the last chunk it may be smaller.
+                 *
+                 * Another constraint is that we need to limit the blit to
+                 * the VRAM block size, unless size is smaller than
+                 * min_chunk_size.
+                 */
+                u64 chunk = max_t(u64, cur->size, m->min_chunk_size);
+
+                size = min_t(u64, size, chunk);
+                if (size > m->min_chunk_size)
+                        size = round_down(size, m->min_chunk_size);
+        }
+
+        return size;
+}
+
+static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
+{
+        /* If the chunk is not fragmented, allow identity map. */
+        return cur->size >= size;
 }
 
 static u32 pte_update_size(struct xe_migrate *m,
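A minimal userspace mock of the two helpers above, showing how a fragmented VRAM block forces the PTE path; the struct and values are hypothetical and only mirror the logic of xe_migrate_res_sizes()/xe_migrate_allow_identity() for illustration:

/* Illustrative mock of the chunking decision; not driver code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_64K          0x10000ULL
#define SZ_1M           0x100000ULL
#define MAX_PASS        0x800000ULL     /* MAX_PREEMPTDISABLE_TRANSFER, 8 MiB */

struct mock_cursor {
        uint64_t size;          /* contiguous bytes in the current VRAM block */
        uint64_t remaining;     /* total bytes left to copy */
};

static uint64_t res_size(const struct mock_cursor *cur, uint64_t min_chunk)
{
        uint64_t size = cur->remaining < MAX_PASS ? cur->remaining : MAX_PASS;
        uint64_t chunk = cur->size > min_chunk ? cur->size : min_chunk;

        if (chunk < size)
                size = chunk;
        if (size > min_chunk)
                size &= ~(min_chunk - 1);       /* round_down; min_chunk is a power of two, as in the driver */
        return size;
}

static bool allow_identity(uint64_t size, const struct mock_cursor *cur)
{
        return cur->size >= size;       /* chunk not fragmented */
}

int main(void)
{
        /* Fragmented VRAM: only 64K contiguous, 2 MiB left to copy. */
        struct mock_cursor cur = { .size = SZ_64K, .remaining = 2 * SZ_1M };
        uint64_t size = res_size(&cur, SZ_1M);

        printf("chunk=%llu identity=%d\n",
               (unsigned long long)size, allow_identity(size, &cur));
        return 0;
}

For a cursor with only 64K contiguous and 2 MiB remaining, the chunk size is clamped to min_chunk_size (1 MiB in this sketch), the 64K block cannot back it contiguously, so allow_identity() returns false and the caller emits PTEs instead of using the identity map.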
@@ -396,7 +430,12 @@ static u32 pte_update_size(struct xe_migrate *m,
         u32 cmds = 0;
 
         *L0_pt = pt_ofs;
-        if (!is_vram) {
+        if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
+                /* Offset into identity map. */
+                *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
+                                              cur->start + vram_region_gpu_offset(res));
+                cmds += cmd_size;
+        } else {
                 /* Clip L0 to available size */
                 u64 size = min(*L0, (u64)avail_pts * SZ_2M);
                 u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
@@ -412,11 +451,6 @@ static u32 pte_update_size(struct xe_migrate *m,
 
                 /* Each chunk has a single blit command */
                 cmds += cmd_size;
-        } else {
-                /* Offset into identity map. */
-                *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
-                                              cur->start + vram_region_gpu_offset(res));
-                cmds += cmd_size;
         }
 
         return cmds;
@@ -426,10 +460,10 @@ static void emit_pte(struct xe_migrate *m,
                      struct xe_bb *bb, u32 at_pt,
                      bool is_vram, bool is_comp_pte,
                      struct xe_res_cursor *cur,
-                     u32 size, struct xe_bo *bo)
+                     u32 size, struct ttm_resource *res)
 {
         struct xe_device *xe = tile_to_xe(m->tile);
-
+        struct xe_vm *vm = m->q->vm;
         u16 pat_index;
         u32 ptes;
         u64 ofs = at_pt * XE_PAGE_SIZE;
@@ -442,13 +476,6 @@ static void emit_pte(struct xe_migrate *m,
         else
                 pat_index = xe->pat.idx[XE_CACHE_WB];
 
-        /*
-         * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
-         * we're only emitting VRAM PTEs during sanity tests, so when
-         * that's moved to a Kunit test, we should condition VRAM PTEs
-         * on running tests.
-         */
-
         ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
 
         while (ptes) {
@@ -468,20 +495,22 @@ static void emit_pte(struct xe_migrate *m,
 
                 addr = xe_res_dma(cur) & PAGE_MASK;
                 if (is_vram) {
-                        /* Is this a 64K PTE entry? */
-                        if ((m->q->vm->flags & XE_VM_FLAG_64K) &&
-                            !(cur_ofs & (16 * 8 - 1))) {
-                                xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K));
+                        if (vm->flags & XE_VM_FLAG_64K) {
+                                u64 va = cur_ofs * XE_PAGE_SIZE / 8;
+
+                                xe_assert(xe, (va & (SZ_64K - 1)) ==
+                                          (addr & (SZ_64K - 1)));
+
                                 flags |= XE_PTE_PS64;
                         }
 
-                        addr += vram_region_gpu_offset(bo->ttm.resource);
+                        addr += vram_region_gpu_offset(res);
                         devmem = true;
                 }
 
-                addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
-                                                         addr, pat_index,
-                                                         0, devmem, flags);
+                addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
+                                                   addr, pat_index,
+                                                   0, devmem, flags);
                 bb->cs[bb->len++] = lower_32_bits(addr);
                 bb->cs[bb->len++] = upper_32_bits(addr);
 
@@ -693,8 +722,8 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
         bool usm = xe->info.has_usm;
         u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
 
-                src_L0 = xe_migrate_res_sizes(xe, &src_it);
-                dst_L0 = xe_migrate_res_sizes(xe, &dst_it);
+                src_L0 = xe_migrate_res_sizes(m, &src_it);
+                dst_L0 = xe_migrate_res_sizes(m, &dst_it);
 
                 drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
                         pass++, src_L0, dst_L0);
@@ -715,6 +744,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
                                                       &ccs_ofs, &ccs_pt, 0,
                                                       2 * avail_pts,
                                                       avail_pts);
+                        xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
                 }
 
                 /* Add copy commands size here */
@@ -727,20 +757,20 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
                         goto err_sync;
                 }
 
-                if (!src_is_vram)
-                        emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
-                                 src_bo);
-                else
+                if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
                         xe_res_next(&src_it, src_L0);
-
-                if (!dst_is_vram)
-                        emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
-                                 dst_bo);
                 else
+                        emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
+                                 src);
+
+                if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
                         xe_res_next(&dst_it, src_L0);
+                else
+                        emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
+                                 dst);
 
                 if (copy_system_ccs)
-                        emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src_bo);
+                        emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
 
                 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
                 update_idx = bb->len;
@@ -949,7 +979,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
         bool usm = xe->info.has_usm;
         u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
 
-                clear_L0 = xe_migrate_res_sizes(xe, &src_it);
+                clear_L0 = xe_migrate_res_sizes(m, &src_it);
 
                 drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);
 
@@ -976,12 +1006,12 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 
                 size -= clear_L0;
                 /* Preemption is enabled again by the ring ops. */
-                if (!clear_vram) {
-                        emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
-                                 bo);
-                } else {
+                if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
                         xe_res_next(&src_it, clear_L0);
-                }
+                else
+                        emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
+                                 dst);
+
                 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
                 update_idx = bb->len;
 