Skip to content

Commit 617d824

Browse files
unerlige authored and lucasdemarchi committed
drm/xe: Add WA BB to capture active context utilization
Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the context, but only gets updated when the context switches out. In order to check how long a context has been active before it switches out, two things are required: (1) Determine if the context is running: To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in the LRC. The value chosen is 1 since 0 is the initial value when the LRC is initialized. During a query, we just check for this value to determine if the context is active. If the context switched out, it would overwrite this location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as the last part of the context restore, so reusing this LRC location will not clobber anything. (2) Calculate the time that the context has been active for: The CTX_TIMESTAMP ticks only when the context is active. If a context is active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization. While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific engine instance. Since we do not know which instance the context is running on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and store it in the PPHWSP. Using the above 2 instructions in a WA BB, capture active context utilization. v2: (Matt Brost) - This breaks TDR, fix it by saving the CTX_TIMESTAMP register "drm/xe: Save CTX_TIMESTAMP mmio value instead of LRC value" - Drop tile from LRC if using gt "drm/xe: Save the gt pointer in LRC and drop the tile" v3: - Remove helpers for bb_per_ctx_ptr (Matt) - Add define for context active value (Matt) - Use 64 bit CTX TIMESTAMP for platforms that support it. For platforms that don't, live with the rare race. 
(Matt, Lucas) - Convert engine id to hwe and get the MMIO value (Lucas) - Correct commit message on when WA BB runs (Lucas) v4: - s/GRAPHICS_VER(...)/xe->info.has_64bit_timestamp/ (Matt) - Drop support for active utilization on a VF (CI failure) - In xe_lrc_init ensure the lrc value is 0 to begin with (CI regression) v5: - Minor checkpatch fix - Squash into previous commit and make TDR use 32-bit time - Update code comment to match commit msg Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4532 Cc: <stable@vger.kernel.org> # v6.13+ Suggested-by: Lucas De Marchi <lucas.demarchi@intel.com> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com> Link: https://lore.kernel.org/r/20250509161159.2173069-8-umesh.nerlige.ramappa@intel.com (cherry picked from commit 82b98ca) Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
1 parent ce15563 commit 617d824

File tree

11 files changed

+208
-19
lines changed

11 files changed

+208
-19
lines changed

drivers/gpu/drm/xe/regs/xe_engine_regs.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@
4343
#define XEHPC_BCS8_RING_BASE 0x3ee000
4444
#define GSCCS_RING_BASE 0x11a000
4545

46+
#define ENGINE_ID(base) XE_REG((base) + 0x8c)
47+
#define ENGINE_INSTANCE_ID REG_GENMASK(9, 4)
48+
#define ENGINE_CLASS_ID REG_GENMASK(2, 0)
49+
4650
#define RING_TAIL(base) XE_REG((base) + 0x30)
4751
#define TAIL_ADDR REG_GENMASK(20, 3)
4852

@@ -154,6 +158,7 @@
154158
#define STOP_RING REG_BIT(8)
155159

156160
#define RING_CTX_TIMESTAMP(base) XE_REG((base) + 0x3a8)
161+
#define RING_CTX_TIMESTAMP_UDW(base) XE_REG((base) + 0x3ac)
157162
#define CSBE_DEBUG_STATUS(base) XE_REG((base) + 0x3fc)
158163

159164
#define RING_FORCE_TO_NONPRIV(base, i) XE_REG(((base) + 0x4d0) + (i) * 4)

drivers/gpu/drm/xe/regs/xe_lrc_layout.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
#define CTX_RING_TAIL (0x06 + 1)
1212
#define CTX_RING_START (0x08 + 1)
1313
#define CTX_RING_CTL (0x0a + 1)
14+
#define CTX_BB_PER_CTX_PTR (0x12 + 1)
1415
#define CTX_TIMESTAMP (0x22 + 1)
16+
#define CTX_TIMESTAMP_UDW (0x24 + 1)
1517
#define CTX_INDIRECT_RING_STATE (0x26 + 1)
1618
#define CTX_PDP0_UDW (0x30 + 1)
1719
#define CTX_PDP0_LDW (0x32 + 1)

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,8 @@ struct xe_device {
330330
u8 has_sriov:1;
331331
/** @info.has_usm: Device has unified shared memory support */
332332
u8 has_usm:1;
333+
/** @info.has_64bit_timestamp: Device supports 64-bit timestamps */
334+
u8 has_64bit_timestamp:1;
333335
/** @info.is_dgfx: is discrete device */
334336
u8 is_dgfx:1;
335337
/**

drivers/gpu/drm/xe/xe_exec_queue.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -830,7 +830,7 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q)
830830
{
831831
struct xe_device *xe = gt_to_xe(q->gt);
832832
struct xe_lrc *lrc;
833-
u32 old_ts, new_ts;
833+
u64 old_ts, new_ts;
834834
int idx;
835835

836836
/*

drivers/gpu/drm/xe/xe_guc_submit.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -941,7 +941,7 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
941941
return xe_sched_invalidate_job(job, 2);
942942
}
943943

944-
ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
944+
ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(q->lrc[0]));
945945
ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
946946

947947
/*

drivers/gpu/drm/xe/xe_lrc.c

Lines changed: 183 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "xe_hw_fence.h"
2525
#include "xe_map.h"
2626
#include "xe_memirq.h"
27+
#include "xe_mmio.h"
2728
#include "xe_sriov.h"
2829
#include "xe_trace_lrc.h"
2930
#include "xe_vm.h"
@@ -650,6 +651,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
650651
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
651652
#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
652653
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
654+
#define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
653655
#define LRC_PPHWSP_SIZE SZ_4K
654656

655657
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
@@ -694,11 +696,21 @@ static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
694696
return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
695697
}
696698

699+
static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
700+
{
701+
return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
702+
}
703+
697704
static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
698705
{
699706
return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
700707
}
701708

709+
static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
710+
{
711+
return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
712+
}
713+
702714
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
703715
{
704716
/* Indirect ring state page is at the very end of LRC */
@@ -726,8 +738,10 @@ DECL_MAP_ADDR_HELPERS(regs)
726738
DECL_MAP_ADDR_HELPERS(start_seqno)
727739
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
728740
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
741+
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
729742
DECL_MAP_ADDR_HELPERS(parallel)
730743
DECL_MAP_ADDR_HELPERS(indirect_ring)
744+
DECL_MAP_ADDR_HELPERS(engine_id)
731745

732746
#undef DECL_MAP_ADDR_HELPERS
733747

@@ -742,19 +756,38 @@ u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
742756
return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
743757
}
744758

759+
/**
760+
* xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
761+
* @lrc: Pointer to the lrc.
762+
*
763+
* Returns: ctx timestamp udw GGTT address
764+
*/
765+
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
766+
{
767+
return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
768+
}
769+
745770
/**
746771
* xe_lrc_ctx_timestamp() - Read ctx timestamp value
747772
* @lrc: Pointer to the lrc.
748773
*
749774
* Returns: ctx timestamp value
750775
*/
751-
u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
776+
u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
752777
{
753778
struct xe_device *xe = lrc_to_xe(lrc);
754779
struct iosys_map map;
780+
u32 ldw, udw = 0;
755781

756782
map = __xe_lrc_ctx_timestamp_map(lrc);
757-
return xe_map_read32(xe, &map);
783+
ldw = xe_map_read32(xe, &map);
784+
785+
if (xe->info.has_64bit_timestamp) {
786+
map = __xe_lrc_ctx_timestamp_udw_map(lrc);
787+
udw = xe_map_read32(xe, &map);
788+
}
789+
790+
return (u64)udw << 32 | ldw;
758791
}
759792

760793
/**
@@ -877,6 +910,65 @@ static void xe_lrc_finish(struct xe_lrc *lrc)
877910
xe_bo_unpin(lrc->bo);
878911
xe_bo_unlock(lrc->bo);
879912
xe_bo_put(lrc->bo);
913+
xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
914+
}
915+
916+
/*
917+
* xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
918+
* context run ticks.
919+
* @lrc: Pointer to the lrc.
920+
*
921+
* Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
922+
* context, but only gets updated when the context switches out. In order to
923+
* check how long a context has been active before it switches out, two things
924+
* are required:
925+
*
926+
* (1) Determine if the context is running:
927+
* To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
928+
* the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
929+
* initialized. During a query, we just check for this value to determine if the
930+
* context is active. If the context switched out, it would overwrite this
931+
* location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
932+
* the last part of context restore, so reusing this LRC location will not
933+
* clobber anything.
934+
*
935+
* (2) Calculate the time that the context has been active for:
936+
* The CTX_TIMESTAMP ticks only when the context is active. If a context is
937+
* active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
938+
* While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
939+
* engine instance. Since we do not know which instance the context is running
940+
* on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
941+
* store it in the PPHWSP.
942+
*/
943+
#define CONTEXT_ACTIVE 1ULL
944+
static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
945+
{
946+
u32 *cmd;
947+
948+
cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
949+
950+
*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
951+
*cmd++ = ENGINE_ID(0).addr;
952+
*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
953+
*cmd++ = 0;
954+
955+
*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
956+
*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
957+
*cmd++ = 0;
958+
*cmd++ = lower_32_bits(CONTEXT_ACTIVE);
959+
960+
if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
961+
*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
962+
*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
963+
*cmd++ = 0;
964+
*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
965+
}
966+
967+
*cmd++ = MI_BATCH_BUFFER_END;
968+
969+
xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
970+
xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
971+
880972
}
881973

882974
#define PVC_CTX_ASID (0x2e + 1)
@@ -893,6 +985,7 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
893985
void *init_data = NULL;
894986
u32 arb_enable;
895987
u32 lrc_size;
988+
u32 bo_flags;
896989
int err;
897990

898991
kref_init(&lrc->refcount);
@@ -902,22 +995,30 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
902995
if (xe_gt_has_indirect_ring_state(gt))
903996
lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
904997

998+
bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
999+
XE_BO_FLAG_GGTT_INVALIDATE;
1000+
9051001
/*
9061002
* FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
9071003
* via VM bind calls.
9081004
*/
9091005
lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size,
9101006
ttm_bo_type_kernel,
911-
XE_BO_FLAG_VRAM_IF_DGFX(tile) |
912-
XE_BO_FLAG_GGTT |
913-
XE_BO_FLAG_GGTT_INVALIDATE);
1007+
bo_flags);
9141008
if (IS_ERR(lrc->bo))
9151009
return PTR_ERR(lrc->bo);
9161010

1011+
lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
1012+
ttm_bo_type_kernel,
1013+
bo_flags);
1014+
if (IS_ERR(lrc->bb_per_ctx_bo)) {
1015+
err = PTR_ERR(lrc->bb_per_ctx_bo);
1016+
goto err_lrc_finish;
1017+
}
1018+
9171019
lrc->size = lrc_size;
9181020
lrc->ring.size = ring_size;
9191021
lrc->ring.tail = 0;
920-
lrc->ctx_timestamp = 0;
9211022

9221023
xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
9231024
hwe->fence_irq, hwe->name);
@@ -990,7 +1091,10 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
9901091
xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
9911092
_MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
9921093

1094+
lrc->ctx_timestamp = 0;
9931095
xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
1096+
if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1097+
xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
9941098

9951099
if (xe->info.has_asid && vm)
9961100
xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
@@ -1019,6 +1123,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
10191123
map = __xe_lrc_start_seqno_map(lrc);
10201124
xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
10211125

1126+
xe_lrc_setup_utilization(lrc);
1127+
10221128
return 0;
10231129

10241130
err_lrc_finish:
@@ -1238,6 +1344,21 @@ struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
12381344
return __xe_lrc_parallel_map(lrc);
12391345
}
12401346

1347+
/**
1348+
* xe_lrc_engine_id() - Read engine id value
1349+
* @lrc: Pointer to the lrc.
1350+
*
1351+
* Returns: context id value
1352+
*/
1353+
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
1354+
{
1355+
struct xe_device *xe = lrc_to_xe(lrc);
1356+
struct iosys_map map;
1357+
1358+
map = __xe_lrc_engine_id_map(lrc);
1359+
return xe_map_read32(xe, &map);
1360+
}
1361+
12411362
static int instr_dw(u32 cmd_header)
12421363
{
12431364
/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
@@ -1684,7 +1805,7 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
16841805
snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
16851806
snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
16861807
snapshot->lrc_snapshot = NULL;
1687-
snapshot->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
1808+
snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
16881809
snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
16891810
return snapshot;
16901811
}
@@ -1784,22 +1905,74 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
17841905
kfree(snapshot);
17851906
}
17861907

1908+
static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
1909+
{
1910+
u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
1911+
u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
1912+
struct xe_hw_engine *hwe;
1913+
u64 val;
1914+
1915+
hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
1916+
if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
1917+
"Unexpected engine class:instance %d:%d for context utilization\n",
1918+
class, instance))
1919+
return -1;
1920+
1921+
if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
1922+
val = xe_mmio_read64_2x32(&hwe->gt->mmio,
1923+
RING_CTX_TIMESTAMP(hwe->mmio_base));
1924+
else
1925+
val = xe_mmio_read32(&hwe->gt->mmio,
1926+
RING_CTX_TIMESTAMP(hwe->mmio_base));
1927+
1928+
*reg_ctx_ts = val;
1929+
1930+
return 0;
1931+
}
1932+
17871933
/**
17881934
* xe_lrc_update_timestamp() - Update ctx timestamp
17891935
* @lrc: Pointer to the lrc.
17901936
* @old_ts: Old timestamp value
17911937
*
17921938
* Populate @old_ts with the currently saved ctx timestamp, read new ctx timestamp and
1793-
* update saved value.
1939+
* update saved value. With support for active contexts, the calculation may be
1940+
* slightly racy, so follow a read-again logic to ensure that the context is
1941+
* still active before returning the right timestamp.
17941942
*
17951943
* Returns: New ctx timestamp value
17961944
*/
1797-
u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts)
1945+
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
17981946
{
1947+
u64 lrc_ts, reg_ts;
1948+
u32 engine_id;
1949+
17991950
*old_ts = lrc->ctx_timestamp;
18001951

1801-
lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
1952+
lrc_ts = xe_lrc_ctx_timestamp(lrc);
1953+
/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
1954+
if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
1955+
lrc->ctx_timestamp = lrc_ts;
1956+
goto done;
1957+
}
1958+
1959+
if (lrc_ts == CONTEXT_ACTIVE) {
1960+
engine_id = xe_lrc_engine_id(lrc);
1961+
if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
1962+
lrc->ctx_timestamp = reg_ts;
1963+
1964+
/* read lrc again to ensure context is still active */
1965+
lrc_ts = xe_lrc_ctx_timestamp(lrc);
1966+
}
1967+
1968+
/*
1969+
* If context switched out, just use the lrc_ts. Note that this needs to
1970+
* be a separate if condition.
1971+
*/
1972+
if (lrc_ts != CONTEXT_ACTIVE)
1973+
lrc->ctx_timestamp = lrc_ts;
18021974

1975+
done:
18031976
trace_xe_lrc_update_timestamp(lrc, *old_ts);
18041977

18051978
return lrc->ctx_timestamp;

0 commit comments

Comments
 (0)