[AMDGPU] efficiently wait for direct loads to LDS at all scopes

ssahasra · ssahasra · commit 9b0cdd6e9140 · 2025-07-07T14:23:21.000+05:30
Currently, the memory legalizer does not generate any wait on vmcnt at workgroup
scope. This is incorrect because direct loads to LDS are tracked using vmcnt and
they need to be released properly at workgroup scope.

The memory legalizer was previously updated to always emit a soft wait
instruction even when all counts are trivially ~0. SIInsertWaitcnts now examines
pending loads to LDS at each S_WAITCNT_soft instruction. If such loads
exist, the vmcnt (which could be ~0) is upgraded to a value that waits for
all of them. After that, any soft instruction that has only
trivial ~0 counts is automatically dropped.

Thus, common programs that do not use direct loads to LDS remain unaffected, but
programs that do use such loads see a correct and efficient vmcnt even at
workgroup scope.
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1374,6 +1374,19 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
         ScoreBrackets.simplifyWaitcnt(OldWait);
       Wait = Wait.combined(OldWait);
 
+      if (!WaitcntInstr && II.getOpcode() == AMDGPU::S_WAITCNT_soft) {
+        // Each direct load to LDS is also a store to LDS, but we do not have a
+        // separate counter for it. Instead these operations increment LOAD_CNT
+        // and need to be waited for at a release fence. So we treat a release
+        // fence as if it depends on any previous LDS DMA stores.
+        //
+        // Note that a user-specified S_WAITCNT instruction is not affected; we
+        // only check for S_WAITCNT_soft since that represents a fence.
+        //
+        // FIXME: How does one detect that a soft wait is a release???
+        ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
+      }
+
       // Merge consecutive waitcnt of the same type by erasing multiples.
       if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
         II.eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -46,9 +46,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
 ; GFX90A-NEXT:    buffer_load_dword v0, s[8:11], 0 offen lds
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x3c
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_barrier
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ds_read_b32 v0, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
@@ -92,9 +91,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
 ; GFX942-NEXT:    buffer_load_dword v0, s[8:11], 0 offen lds
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x3c
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    s_barrier
-; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    ds_read_b32 v0, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
@@ -166,6 +164,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
 ; GFX900-NEXT:    s_nop 0
 ; GFX900-NEXT:    buffer_load_dword v1, s[0:3], 0 offen lds
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX900-NEXT:    global_load_dword v1, v0, s[8:9]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
@@ -190,7 +189,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    buffer_load_dword v1, s[8:11], 0 offen lds
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90A-NEXT:    global_load_dword v1, v0, s[0:1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
@@ -241,7 +240,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    buffer_load_dword v1, s[8:11], 0 offen lds
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1] sc0
 ; GFX942-NEXT:    global_load_dword v1, v0, s[0:1] sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
@@ -328,6 +327,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
 ; GFX900-NEXT:    s_nop 0
 ; GFX900-NEXT:    buffer_load_dword v1, s[0:3], 0 offen lds
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX900-NEXT:    global_load_dword v1, v0, s[8:9]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
@@ -352,7 +352,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    buffer_load_dword v1, s[8:11], 0 offen lds
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90A-NEXT:    global_load_dword v1, v0, s[0:1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
@@ -403,7 +403,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    buffer_load_dword v1, s[8:11], 0 offen lds
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1] sc0
 ; GFX942-NEXT:    global_load_dword v1, v0, s[0:1] sc0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)