Commit 6e30f37

[AMDGPU] always emit a soft wait even if it is trivially ~0
The memory legalizer is currently responsible for emitting wait instructions at ordering operations such as acquire and release. It tries to be efficient by emitting waits only when required; in particular, it does not emit a wait on vmcnt at workgroup scope, since that ordering is already guaranteed by the architecture. This is now incorrect, because direct loads to LDS have an LDS component that needs explicit ordering on vmcnt. However, it would be inefficient to always emit a wait on vmcnt, since the majority of programs do not use direct loads to LDS, and such a wait would affect every workgroup-scope operation.

As a first step towards handling this, the memory legalizer now emits a soft wait instruction even if all counts are trivially ~0. This is a placeholder that the SIInsertWaitcnts pass will either optimize away or strengthen, based on its analysis of whether direct loads to LDS are pending at that point in the program.
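To make the "trivially ~0" placeholder concrete, here is a minimal, self-contained C++ sketch of the new decision logic, condensed from the SIMemoryLegalizer.cpp hunks below. The bit-field widths, the simplified encodeWaitcnt packing, and the emitSoftWait helper are illustrative stand-ins, not the real AMDGPU encoding or API; the point is only that the soft wait is now emitted unconditionally, with an all-ones "no wait" immediate when neither count needs ordering.

// Illustrative only: models the "always emit a soft wait" behavior from the
// SIMemoryLegalizer.cpp hunks below. Bit widths and the packing layout are
// simplified stand-ins, not the real AMDGPU::encodeWaitcnt encoding.
#include <cstdio>

// Hypothetical per-counter bit masks ("no wait" = all ones for that field).
static unsigned vmcntBitMask()   { return 0xF; }
static unsigned expcntBitMask()  { return 0x7; }
static unsigned lgkmcntBitMask() { return 0xF; }

// Simplified packing: [lgkmcnt | expcnt | vmcnt].
static unsigned encodeWaitcnt(unsigned Vm, unsigned Exp, unsigned Lgkm) {
  return (Lgkm << 7) | (Exp << 4) | Vm;
}

// Mirrors the new legalizer behavior: the soft wait is emitted unconditionally;
// a trivially "no wait" immediate is a placeholder for SIInsertWaitcnts to
// either drop or strengthen later.
static void emitSoftWait(bool VMCnt, bool LGKMCnt) {
  unsigned Imm = encodeWaitcnt(VMCnt ? 0 : vmcntBitMask(), expcntBitMask(),
                               LGKMCnt ? 0 : lgkmcntBitMask());
  std::printf("S_WAITCNT_soft 0x%x%s\n", Imm,
              (!VMCnt && !LGKMCnt) ? "  ; trivially no-wait placeholder" : "");
}

int main() {
  emitSoftWait(/*VMCnt=*/false, /*LGKMCnt=*/true);  // e.g. a wait on lgkmcnt only
  emitSoftWait(/*VMCnt=*/false, /*LGKMCnt=*/false); // previously nothing was emitted here
}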
1 parent adbd8e7 commit 6e30f37

41 files changed: +13845 −763 lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 25 additions & 33 deletions
@@ -1074,8 +1074,6 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering, Position Pos,
                                     AtomicOrdering Order) const {
-  bool Changed = false;
-
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -1149,21 +1147,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     }
   }
 
-  if (VMCnt || LGKMCnt) {
-    unsigned WaitCntImmediate =
-        AMDGPU::encodeWaitcnt(IV,
-                              VMCnt ? 0 : getVmcntBitMask(IV),
-                              getExpcntBitMask(IV),
-                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
-    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
-        .addImm(WaitCntImmediate);
-    Changed = true;
-  }
+  // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
+  // will later use this marker to add additional waits such as those required
+  // from direct load to LDS (formerly known as LDS DMA).
+  unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
+      IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV),
+      LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+      .addImm(WaitCntImmediate);
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return Changed;
+  return true;
 }
 
 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1966,8 +1962,6 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos, AtomicOrdering Order) const {
-  bool Changed = false;
-
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -2057,28 +2051,25 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     }
   }
 
-  if (VMCnt || LGKMCnt) {
-    unsigned WaitCntImmediate =
-        AMDGPU::encodeWaitcnt(IV,
-                              VMCnt ? 0 : getVmcntBitMask(IV),
-                              getExpcntBitMask(IV),
-                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
-    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
-        .addImm(WaitCntImmediate);
-    Changed = true;
-  }
+  // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
+  // will later use this marker to add additional waits such as those required
+  // from direct load to LDS (formerly known as LDS DMA).
+  unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
+      IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV),
+      LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+      .addImm(WaitCntImmediate);
 
   if (VSCnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
         .addImm(0);
-    Changed = true;
   }
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return Changed;
+  return true;
 }
 
 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -2287,8 +2278,6 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos, AtomicOrdering Order) const {
-  bool Changed = false;
-
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -2372,23 +2361,26 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
     }
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
-    Changed = true;
+  } else {
+    // Always emit a soft wait count, even if it is trivially ~0.
+    // SIInsertWaitcnts will later use this marker to add additional waits such
+    // as those required from direct load to LDS (formerly known as LDS DMA).
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft))
+        .addImm(getLoadcntBitMask(IV));
   }
 
   if (STORECnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
-    Changed = true;
   }
 
   if (DSCnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
-    Changed = true;
   }
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return Changed;
+  return true;
 }
 
 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 6 additions & 6 deletions
@@ -880,8 +880,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX10-NEXT:    s_add_u32 s0, 0x100, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -921,8 +921,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:384 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX11-NEXT:    s_add_u32 s0, 0x100, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -991,8 +991,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
 ; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, 0x100, s0
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -1032,8 +1032,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; UNALIGNED_GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:384 dlc
-; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX11-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX11-NEXT:    s_add_u32 s0, 0x100, s0
 ; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1520,8 +1520,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -1633,8 +1633,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
 ; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
