llvm · ssahasra · Jul 4, 2025 · Jul 8, 2025
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -338,8 +338,8 @@ class WaitcntBrackets {
                              const MachineOperand &Op) const;
 
   bool counterOutOfOrder(InstCounterType T) const;
-  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
-  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, bool OptNone) const;
+  void simplifyWaitcnt(InstCounterType T, unsigned &Count, bool OptNone) const;
 
   void determineWait(InstCounterType T, RegInterval Interval,
                      AMDGPU::Waitcnt &Wait) const;
@@ -1164,22 +1164,33 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
 
 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
 /// whether a waitcnt instruction is needed at all.
-void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
-  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
-  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
-  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
-  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
-  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
-  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
-  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
-  simplifyWaitcnt(X_CNT, Wait.XCnt);
+void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait,
+                                      bool OptNone) const {
+  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt, OptNone);
+  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt, OptNone);
+  simplifyWaitcnt(DS_CNT, Wait.DsCnt, OptNone);
+  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt, OptNone);
+  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt, OptNone);
+  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt, OptNone);
+  simplifyWaitcnt(KM_CNT, Wait.KmCnt, OptNone);
+  simplifyWaitcnt(X_CNT, Wait.XCnt, OptNone);
 }
 
-void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
-                                      unsigned &Count) const {
+void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, unsigned &Count,
+                                      bool OptNone) const {
   // The number of outstanding events for this type, T, can be calculated
   // as (UB - LB). If the current Count is greater than or equal to the number
   // of outstanding events, then the wait for this counter is redundant.
+  //
+  // For counts that are at max value or above, try this even when optimizations
+  // are disabled. This helps remove max waitcnt's that are inserted by the
+  // memory legalizer by default, but does not optimize actual waitcnt's that
+  // are otherwise inserted by the memory legalizer or a previous pass of the
+  // inserter. The corner case is when a max waitcnt was optimized away although
+  // it was not just a default, but was deliberately chosen. This only
+  // marginally affects the usefulness of OptNone.
+  if (Count < getWaitCountMax(T) && OptNone)
+    return;
   if (Count >= getScoreRange(T))
     Count = ~0u;
 }
@@ -1363,19 +1374,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
     }
 
     unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
-    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
+    bool OpcodeIsSoft = Opcode != II.getOpcode();
 
     // Update required wait count. If this is a soft waitcnt (= it was added
     // by an earlier pass), it may be entirely removed.
     if (Opcode == AMDGPU::S_WAITCNT) {
       unsigned IEnc = II.getOperand(0).getImm();
       AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
-      if (TrySimplify)
-        ScoreBrackets.simplifyWaitcnt(OldWait);
+      if (OpcodeIsSoft)
+        ScoreBrackets.simplifyWaitcnt(OldWait, OptNone);
       Wait = Wait.combined(OldWait);
 
       // Merge consecutive waitcnt of the same type by erasing multiples.
-      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
+      if (WaitcntInstr ||
+          (!Wait.hasWaitExceptStoreCnt() && OpcodeIsSoft && !OptNone)) {
         II.eraseFromParent();
         Modified = true;
       } else
@@ -1386,11 +1398,13 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
 
       unsigned OldVSCnt =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
-      if (TrySimplify)
-        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
+      if (OpcodeIsSoft)
+        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt,
+                                      OptNone);
       Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
 
-      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
+      if (WaitcntVsCntInstr ||
+          (!Wait.hasWaitStoreCnt() && OpcodeIsSoft && !OptNone)) {
         II.eraseFromParent();
         Modified = true;
       } else
@@ -1528,7 +1542,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
     // by an earlier pass), it may be entirely removed.
 
     unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
-    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
+    bool OpcodeIsSoft = Opcode != II.getOpcode();
 
     // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
     // attempt to do more than that either.
@@ -1539,25 +1553,25 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
       unsigned OldEnc =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
       AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
-      if (TrySimplify)
-        ScoreBrackets.simplifyWaitcnt(OldWait);
+      if (OpcodeIsSoft)
+        ScoreBrackets.simplifyWaitcnt(OldWait, OptNone);
       Wait = Wait.combined(OldWait);
       UpdatableInstr = &CombinedLoadDsCntInstr;
     } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
       unsigned OldEnc =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
       AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
-      if (TrySimplify)
-        ScoreBrackets.simplifyWaitcnt(OldWait);
+      if (OpcodeIsSoft)
+        ScoreBrackets.simplifyWaitcnt(OldWait, OptNone);
       Wait = Wait.combined(OldWait);
       UpdatableInstr = &CombinedStoreDsCntInstr;
     } else {
       std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
       assert(CT.has_value());
       unsigned OldCnt =
           TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
-      if (TrySimplify)
-        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
+      if (OpcodeIsSoft)
+        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt, OptNone);
       addWait(Wait, CT.value(), OldCnt);
       UpdatableInstr = &WaitInstrs[CT.value()];
     }
@@ -2009,7 +2023,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   }
 
   // Verify that the wait is actually needed.
-  ScoreBrackets.simplifyWaitcnt(Wait);
+  ScoreBrackets.simplifyWaitcnt(Wait, /* OptNone = */ false);
 
   // When forcing emit, we need to skip terminators because that would break the
   // terminators of the MBB if we emit a waitcnt between terminators.
@@ -2238,7 +2252,7 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
     NeedsEndPGMCheck = true;
   }
 
-  ScoreBrackets.simplifyWaitcnt(Wait);
+  ScoreBrackets.simplifyWaitcnt(Wait, /* OptNone = */ false);
 
   auto SuccessorIt = std::next(Inst.getIterator());
   bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,

diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1074,8 +1074,6 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering, Position Pos,
                                     AtomicOrdering Order) const {
-  bool Changed = false;
-
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -1149,21 +1147,19 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     }
   }
 
-  if (VMCnt || LGKMCnt) {
-    unsigned WaitCntImmediate =
-      AMDGPU::encodeWaitcnt(IV,
-                            VMCnt ? 0 : getVmcntBitMask(IV),
-                            getExpcntBitMask(IV),
-                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
-    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
-        .addImm(WaitCntImmediate);
-    Changed = true;
-  }
+  // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
+  // will later use this marker to add additional waits such as those required
+  // from direct load to LDS (formerly known as LDS DMA).
+  unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
+      IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV),
+      LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+      .addImm(WaitCntImmediate);
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return Changed;
+  return true;
 }
 
 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1966,8 +1962,6 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos, AtomicOrdering Order) const {
-  bool Changed = false;
-
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -2057,28 +2051,25 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     }
   }
 
-  if (VMCnt || LGKMCnt) {
-    unsigned WaitCntImmediate =
-      AMDGPU::encodeWaitcnt(IV,
-                            VMCnt ? 0 : getVmcntBitMask(IV),
-                            getExpcntBitMask(IV),
-                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
-    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
-        .addImm(WaitCntImmediate);
-    Changed = true;
-  }
+  // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
+  // will later use this marker to add additional waits such as those required
+  // from direct load to LDS (formerly known as LDS DMA).
+  unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
+      IV, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV),
+      LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+      .addImm(WaitCntImmediate);
 
   if (VSCnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
         .addImm(0);
-    Changed = true;
   }
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return Changed;
+  return true;
 }
 
 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -2287,8 +2278,6 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos, AtomicOrdering Order) const {
-  bool Changed = false;
-
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -2372,23 +2361,26 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
       BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
     }
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
-    Changed = true;
+  } else {
+    // Always emit a soft wait count, even if it is trivially ~0.
+    // SIInsertWaitcnts will later use this marker to add additional waits such
+    // as those required from direct load to LDS (formerly known as LDS DMA).
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft))
+        .addImm(getLoadcntBitMask(IV));
   }
 
   if (STORECnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
-    Changed = true;
   }
 
   if (DSCnt) {
     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
-    Changed = true;
   }
 
   if (Pos == Position::AFTER)
     --MI;
 
-  return Changed;
+  return true;
 }
 
 bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -880,8 +880,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX10-NEXT:    s_add_u32 s0, 0x100, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -921,8 +921,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:384 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX11-NEXT:    s_add_u32 s0, 0x100, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -991,8 +991,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
 ; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, 0x100, s0
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -1032,8 +1032,8 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; UNALIGNED_GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:384 dlc
-; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX11-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX11-NEXT:    s_add_u32 s0, 0x100, s0
 ; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1520,8 +1520,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1
@@ -1633,8 +1633,8 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
 ; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v2, off offset:128
-; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; UNALIGNED_GFX10-NEXT:    s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, 0x4004, s0
 ; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, s0, v1