Skip to content

Commit 9b0cdd6

Browse files
committed
[AMDGPU] efficiently wait for direct loads to LDS at all scopes
Currently, the memory legalizer does not generate any wait on vmcnt at workgroup scope. This is incorrect because direct loads to LDS are tracked using vmcnt and they need to be released properly at workgroup scope. The memory legalizer was previously updated to always emit a soft wait instruction even when all counts are trivially ~0. SIInsertWaitcnts now examines pending loads to LDS at each S_WAITCNT_soft instruction. If such instructions exist, the vmcnt (which could be ~0) is upgraded to a value that waits for any such pending loads to LDS. After that, any soft instruction that has only trivial ~0 counts is automatically dropped. Thus, common programs that do not use direct loads to LDS remain unaffected, but programs that do use such loads see a correct and efficient vmcnt even at workgroup scope.
1 parent de111cd commit 9b0cdd6

File tree

2 files changed

+21
-8
lines changed

2 files changed

+21
-8
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1374,6 +1374,19 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
13741374
ScoreBrackets.simplifyWaitcnt(OldWait);
13751375
Wait = Wait.combined(OldWait);
13761376

1377+
if (!WaitcntInstr && II.getOpcode() == AMDGPU::S_WAITCNT_soft) {
1378+
// Each direct load to LDS is also a store to LDS, but we do not have a
1379+
// separate counter for it. Instead these operations increment LOAD_CNT
1380+
// and need to be waited for at a release fence. So we treat a release
1381+
// fence as if it depends on any previous LDS DMA stores.
1382+
//
1383+
// Note that a user-specified S_WAITCNT instruction is not affected; we
1384+
// only check for S_WAITCNT_soft since that represents a fence.
1385+
//
1386+
// FIXME: How does one detect that a soft wait is a release???
1387+
ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
1388+
}
1389+
13771390
// Merge consecutive waitcnt of the same type by erasing multiples.
13781391
if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
13791392
II.eraseFromParent();

llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
4646
; GFX90A-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
4747
; GFX90A-NEXT: v_mov_b32_e32 v0, s13
4848
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
49-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
49+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5050
; GFX90A-NEXT: s_barrier
51-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
5251
; GFX90A-NEXT: ds_read_b32 v0, v0
5352
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
5453
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -92,9 +91,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
9291
; GFX942-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
9392
; GFX942-NEXT: v_mov_b32_e32 v0, s13
9493
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
95-
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
94+
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9695
; GFX942-NEXT: s_barrier
97-
; GFX942-NEXT: s_waitcnt vmcnt(0)
9896
; GFX942-NEXT: ds_read_b32 v0, v0
9997
; GFX942-NEXT: v_mov_b32_e32 v1, 0
10098
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
@@ -166,6 +164,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
166164
; GFX900-NEXT: s_nop 0
167165
; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
168166
; GFX900-NEXT: v_mov_b32_e32 v1, 1
167+
; GFX900-NEXT: s_waitcnt vmcnt(0)
169168
; GFX900-NEXT: global_store_dword v0, v1, s[8:9]
170169
; GFX900-NEXT: global_load_dword v1, v0, s[8:9]
171170
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -190,7 +189,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
190189
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
191190
; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
192191
; GFX90A-NEXT: v_mov_b32_e32 v1, 1
193-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
192+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
194193
; GFX90A-NEXT: global_store_dword v0, v1, s[0:1]
195194
; GFX90A-NEXT: global_load_dword v1, v0, s[0:1]
196195
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -241,7 +240,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
241240
; GFX942-NEXT: v_mov_b32_e32 v0, 0
242241
; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
243242
; GFX942-NEXT: v_mov_b32_e32 v1, 1
244-
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
243+
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
245244
; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0
246245
; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0
247246
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -328,6 +327,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
328327
; GFX900-NEXT: s_nop 0
329328
; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
330329
; GFX900-NEXT: v_mov_b32_e32 v1, 1
330+
; GFX900-NEXT: s_waitcnt vmcnt(0)
331331
; GFX900-NEXT: global_store_dword v0, v1, s[8:9]
332332
; GFX900-NEXT: global_load_dword v1, v0, s[8:9]
333333
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -352,7 +352,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
352352
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
353353
; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
354354
; GFX90A-NEXT: v_mov_b32_e32 v1, 1
355-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
355+
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
356356
; GFX90A-NEXT: global_store_dword v0, v1, s[0:1]
357357
; GFX90A-NEXT: global_load_dword v1, v0, s[0:1]
358358
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -403,7 +403,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
403403
; GFX942-NEXT: v_mov_b32_e32 v0, 0
404404
; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
405405
; GFX942-NEXT: v_mov_b32_e32 v1, 1
406-
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
406+
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
407407
; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0
408408
; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0
409409
; GFX942-NEXT: s_waitcnt vmcnt(0)

0 commit comments

Comments
 (0)